In [20]:
import pandas as pd
import numpy as np

In [1]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
!wget $data

--2025-10-05 22:59:17--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-05 22:59:18 (5.09 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [100]:
df = pd.read_csv('car_fuel_efficiency.csv')
df = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [15]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [16]:
df['horsepower'].median()

149.0

In [17]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
(n, n_val, n_test, n_train, n_val + n_test + n_train)

(9704, 1940, 1940, 5824, 9704)

In [72]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [50]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [59]:
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'
hp_mean = df_train['horsepower'].mean()
df_zero_train = df_train.copy().fillna(0)
df_mean_train = df_train.copy().fillna(hp_mean)
df_zero_val = df_val.copy().fillna(0)
df_mean_val = df_val.copy().fillna(hp_mean)

In [61]:
X_zero_train = df_zero_train[features]
X_mean_train = df_mean_train[features]
X_zero_val = df_zero_val[features]
X_mean_val = df_mean_val[features]
y_train = df_train[target]
y_val= df_val[target]

In [73]:
w0_zero, w_zero = train_linear_regression(X_zero_train, y)
w0_mean, w_mean = train_linear_regression(X_mean_train, y)

y_zero_pred = w0_zero + X_zero_val.dot(w_zero)
y_mean_pred = w0_mean + X_mean_val.dot(w_zero)

In [74]:
print(rmse(y_val, y_zero_pred), rmse(y_val, y_mean_pred))

0.5173782638837691 0.5201474583067438


In [77]:
reg = [0, 0.01, 0.1, 1, 5, 10, 100]
def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [76]:
rmse_dict = {}
for r in reg:
    w0, w = train_linear_regression_reg(X_zero_train, y_train, r)
    y_pred = w0 + X_zero_val.dot(w)
    err = round(rmse(y_val, y_pred), 0)
    print(f"r: {r}, RMSE: {err}")
    rmse_dict[r] = err

print(min(rmse_dict, key=rmse_dict.get))

r: 0, RMSE: 1.0
r: 0.01, RMSE: 1.0
r: 0.1, RMSE: 1.0
r: 1, RMSE: 1.0
r: 5, RMSE: 1.0
r: 10, RMSE: 1.0
r: 100, RMSE: 1.0
0


In [101]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmses = []
for seed in seeds:
    idxs = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idxs)
    df_trains = df.iloc[idxs[:n_train]].fillna(0)
    df_vals = df.iloc[idxs[n_train:n_train+n_val]].fillna(0)
    df_tests = df.iloc[idxs[n_train+n_val:]].fillna(0)
    df_trains = df_trains.reset_index(drop=True)
    df_vals = df_vals.reset_index(drop=True)
    df_tests = df_tests.reset_index(drop=True)

    X_trains = df_trains[features]
    X_vals = df_vals[features]
    y_trains = df_trains[target]
    y_vals= df_vals[target]

    w0s, ws = train_linear_regression_reg(X_trains, y_trains, 0)
    y_preds = w0s + X_vals.dot(ws)
    rmses.append(rmse(y_preds, y_vals))
rmses = np.array(rmses)
rmses

array([0.52065313, 0.52133889, 0.522807  , 0.51595167, 0.51091295,
       0.52834065, 0.53139107, 0.50906704, 0.51473991, 0.51318659])

In [102]:
round(rmses.std(),3)

np.float64(0.007)

In [104]:
idxs = np.arange(n)
np.random.seed(9)
np.random.shuffle(idxs)
df_full_trains = df.iloc[idxs[:n_train+n_val]].fillna(0)
df_tests = df.iloc[idxs[n_train+n_val:]].fillna(0)
df_full_trains = df_trains.reset_index(drop=True)
df_tests = df_tests.reset_index(drop=True)
X_full_trains = df_full_trains[features]
X_tests = df_tests[features]
y_full_trains = df_full_trains[target]
y_tests= df_tests[target]
w0s, ws = train_linear_regression_reg(X_full_trains, y_full_trains, 0)
y_preds = w0s + X_tests.dot(ws)
err = rmse(y_preds, y_tests)
err



np.float64(0.5158284328944308)