In [2]:
import requests
import numpy as np
import pandas as pd

# Download the car fuel-efficiency dataset and store it next to the notebook.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
# Bound the wait so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
response.raise_for_status()
with open("car_fuel_efficiency.csv", "wb") as f:
    f.write(response.content)

In [3]:
import pandas as pd
# Load the CSV written by the download cell into a DataFrame.
data = pd.read_csv("car_fuel_efficiency.csv")

In [4]:
# Preview the first rows to see which columns are present and where NaNs occur.
data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [5]:
# Keep only the three numeric features used below plus the target column.
selected_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'fuel_efficiency_mpg',
]
data = data[selected_columns]

In [6]:
# Confirm that only the selected feature columns and the target remain.
data.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,fuel_efficiency_mpg
0,170,159.0,3413.433759,13.231729
1,130,97.0,3149.664934,13.688217
2,170,78.0,3079.038997,14.246341
3,220,,2542.392402,16.912736
4,210,140.0,3460.87099,12.488369


In [7]:
# Summary statistics per column; the `count` row also exposes columns with missing values.
data.describe()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,fuel_efficiency_mpg
count,9704.0,8996.0,9704.0,9704.0
mean,199.708368,149.657292,3001.280993,14.985243
std,49.455319,29.879555,497.89486,2.556468
min,10.0,37.0,952.681761,6.200971
25%,170.0,130.0,2666.248985,13.267459
50%,200.0,149.0,2993.226296,15.006037
75%,230.0,170.0,3334.957039,16.707965
max,380.0,271.0,4739.077089,25.967222


In [8]:
# Report how many missing values each column contains.
null_counts = data.isnull().sum()
for col, n_missing in null_counts.items():
    print(f"{col}: {n_missing} null values")
    

engine_displacement: 0 null values
horsepower: 708 null values
vehicle_weight: 0 null values
fuel_efficiency_mpg: 0 null values


The 50th percentile (median) of horsepower is 149.0 — see the `describe()` output above.

In [9]:
# Working frame for modelling: the three features followed by the target.
df = data.loc[:, ['engine_displacement', 'horsepower', 'vehicle_weight', 'fuel_efficiency_mpg']]


In [10]:
# Reproducibly shuffle the row positions, then cut them into
# train / validation / test partitions (20% validation, 20% test, rest train).
np.random.seed(42)

n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

# Position in the shuffled index where validation ends and test begins.
val_end = n_train + n_val

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:val_end]].reset_index(drop=True)
df_test = df.iloc[idx[val_end:]].reset_index(drop=True)

In [11]:
# Option A: replace missing horsepower values with 0.
df_train_0 = df_train.assign(horsepower=df_train['horsepower'].fillna(0))
df_val_0 = df_val.assign(horsepower=df_val['horsepower'].fillna(0))

# Option B: replace missing horsepower values with the mean, computed on the
# training split only and then reused for the validation split.
mean_hp = df_train['horsepower'].mean()
df_train_mean = df_train.assign(horsepower=df_train['horsepower'].fillna(mean_hp))
df_val_mean = df_val.assign(horsepower=df_val['horsepower'].fillna(mean_hp))

In [12]:
def train_linear_regression(X, y):
    """Fit least-squares linear regression with an intercept.

    A column of ones is prepended to ``X`` and the weights are obtained by
    applying the pseudo-inverse of that design matrix to ``y``.

    Parameters
    ----------
    X : numpy array of features, one row per sample (no bias column).
    y : target values, one per row of ``X``.

    Returns
    -------
    Weight vector whose first entry is the intercept, followed by one
    weight per column of ``X``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])
    return np.linalg.pinv(design) @ y

def predict(X, w):
    """Apply a fitted linear model to a feature matrix.

    Parameters
    ----------
    X : numpy array of features, one row per sample (no bias column).
    w : weight vector with the intercept first, then one weight per column
        of ``X``.

    Returns
    -------
    One prediction per row of ``X``.
    """
    intercept_column = np.ones(X.shape[0])
    return np.column_stack([intercept_column, X]) @ w

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    residuals = y - y_pred
    return np.sqrt(np.mean(np.square(residuals)))

In [13]:
# Zero-imputed variant: fit on the training split against the log1p-transformed
# target, then score on the validation split in the same log1p space.
X_train_0 = df_train_0.drop(columns=['fuel_efficiency_mpg']).values
X_val_0 = df_val_0.drop(columns=['fuel_efficiency_mpg']).values

y_train_0 = np.log1p(df_train_0['fuel_efficiency_mpg'].values)
y_val_0 = np.log1p(df_val_0['fuel_efficiency_mpg'].values)

w0 = train_linear_regression(X_train_0, y_train_0)
y_pred_val_0 = predict(X_val_0, w0)
rmse_0 = rmse(y_val_0, y_pred_val_0)

In [14]:
# Mean-imputed variant: same procedure as the zero-imputed run, so the two
# validation RMSE values (both in log1p space) are directly comparable.
X_train_mean = df_train_mean.drop(columns=['fuel_efficiency_mpg']).values
X_val_mean = df_val_mean.drop(columns=['fuel_efficiency_mpg']).values

y_train_mean = np.log1p(df_train_mean['fuel_efficiency_mpg'].values)
y_val_mean = np.log1p(df_val_mean['fuel_efficiency_mpg'].values)

w_mean = train_linear_regression(X_train_mean, y_train_mean)
y_pred_val_mean = predict(X_val_mean, w_mean)
rmse_mean = rmse(y_val_mean, y_pred_val_mean)

# Show both scores side by side, rounded to two decimals.
print("RMSE (fill 0):", round(rmse_0, 2))
print("RMSE (fill mean):", round(rmse_mean, 2))

RMSE (fill 0): 0.04
RMSE (fill mean): 0.04


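Both imputation options give the same validation RMSE (0.04 after rounding to two decimals), so they perform equally well.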


In [15]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression with L2 regularisation via the normal equations.

    A column of ones is prepended to ``X`` and the system
    ``(Xb.T Xb + r * I) w = Xb.T y`` is solved for ``w``. The identity matrix
    spans every column of the design matrix, so the intercept is regularised
    together with the feature weights.

    Parameters
    ----------
    X : numpy array of features, one row per sample (no bias column).
    y : target values, one per row of ``X``.
    r : regularisation strength added to the diagonal of ``Xb.T Xb``.

    Returns
    -------
    (w0, w) : the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular (possible when ``r`` is 0).
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the linear system directly rather than forming an explicit
    # inverse: same solution, but cheaper and numerically more stable.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

In [23]:
import numpy as np
import pandas as pd

url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

df = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'fuel_efficiency_mpg']]

# The shuffle, split, imputation and array preparation do not depend on r,
# so they are done once here; only the regularised fit is repeated per r.
seed = 42
np.random.seed(seed)

n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

# combine train and validation
train_full = pd.concat([df_train, df_val], ignore_index=True)

# impute horsepower with 0
train_full['horsepower'] = train_full['horsepower'].fillna(0)
df_test_0 = df_test.copy()
df_test_0['horsepower'] = df_test_0['horsepower'].fillna(0)

# prepare arrays (original target, no log transform)
X_train = train_full.drop('fuel_efficiency_mpg', axis=1).values
y_train = train_full['fuel_efficiency_mpg'].values
X_test = df_test_0.drop('fuel_efficiency_mpg', axis=1).values
y_test = df_test_0['fuel_efficiency_mpg'].values

# Design matrices with a leading bias column.
ones = np.ones(X_train.shape[0])
Xb = np.column_stack([ones, X_train])
Xb_test = np.column_stack([np.ones(X_test.shape[0]), X_test])

# Unregularised Gram matrix and identity, shared by every value of r.
gram = Xb.T.dot(Xb)
identity = np.eye(gram.shape[0])

# NOTE(review): the r values are compared on the test split after fitting on
# train + validation. Choosing r would normally be done on df_val, with the
# test split held back for the final score. Confirm this is intended.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    # Regularised normal equations: (Xb.T Xb + r * I) w = Xb.T y
    XTX = gram + r * identity
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(Xb.T).dot(y_train)

    # predict on test
    y_pred_test = Xb_test.dot(w_full)

    rmse_test = np.sqrt(np.mean((y_test - y_pred_test) ** 2))
    print(round(rmse_test, 2))


0.52
0.52
0.52
0.52
0.54
0.59
1.73


In [18]:

# Final evaluation with shuffle seed 9: fit on train + validation with
# missing horsepower set to 0 and r = 0.001, then report RMSE on the test split.
seed = 9
np.random.seed(seed)

n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

# Position in the shuffled index where validation ends and test begins.
val_end = n_train + n_val

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:val_end]].reset_index(drop=True)
df_test = df.iloc[idx[val_end:]].reset_index(drop=True)

# Train on everything except the test split.
train_full = pd.concat([df_train, df_val], ignore_index=True)
train_full = train_full.assign(horsepower=train_full['horsepower'].fillna(0))
df_test_0 = df_test.assign(horsepower=df_test['horsepower'].fillna(0))

# Feature matrices and targets (original target, no log transform).
X_train = train_full.drop(columns=['fuel_efficiency_mpg']).values
X_test = df_test_0.drop(columns=['fuel_efficiency_mpg']).values
y_train = train_full['fuel_efficiency_mpg'].values
y_test = df_test_0['fuel_efficiency_mpg'].values

# Regularised normal equations with a leading bias column.
ones = np.ones(X_train.shape[0])
Xb = np.column_stack([ones, X_train])
XTX = Xb.T @ Xb
XTX = XTX + 0.001 * np.eye(Xb.shape[1])
XTX_inv = np.linalg.inv(XTX)
w_full = XTX_inv @ Xb.T @ y_train

# Score on the held-out test split.
Xb_test = np.column_stack([np.ones(X_test.shape[0]), X_test])
y_pred_test = Xb_test @ w_full

rmse_test = np.sqrt(np.mean(np.square(y_test - y_pred_test)))
print(round(rmse_test, 3))


0.516
