In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)

In [3]:
cols_needed = [
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
    "fuel_efficiency_mpg",
]
df = df[cols_needed]

# Question 1
## There's one column with missing values. What is it?

In [4]:
print(df.isna().sum())

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


# Question 2
## What's the median (50% percentile) for variable 'horsepower'?

In [15]:
print(df['horsepower'].median())

149.0


# Question 3
We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [7]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='fuel_efficiency_mpg')
y = df.fuel_efficiency_mpg


X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

print("train:", len(X_train),
      "val:", len(X_val),
      "test:", len(X_test))

train: 5822 val: 1941 test: 1941


In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

mean_hp = X_train.horsepower.mean()

def prepare_X(df, fill_value):
    df = df.copy()
    df.horsepower = df.horsepower.fillna(fill_value)
    return np.column_stack([np.ones(len(df)), df.values])

strategies = {0: "zero", mean_hp: "mean"}
results = {}

for val, name in strategies.items():
    X_tr = prepare_X(X_train, val)
    X_va = prepare_X(X_val, val)
    model = LinearRegression(fit_intercept=False)
    model.fit(X_tr, y_train)
    y_pred = model.predict(X_va)
    score  = rmse(y_val, y_pred)
    results[name] = round(score, 2)
    print(name, "RMSE:", score)

print(min(results, key=results.get))

zero RMSE: 0.517291936381243
mean RMSE: 0.46035791011349964
mean


In [26]:
rs = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = {}

def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X) + (r + 1e-8) * np.eye(X.shape[1])
    w_full = np.linalg.inv(XTX).dot(X.T).dot(y)
    return w_full[0], w_full[1:]

def prepare_X(df, fill_value=0):
    df = df.copy()
    df['horsepower'] = df['horsepower'].fillna(fill_value)
    return np.column_stack([np.ones(len(df)), df.values])

X_train_z = prepare_X(X_train)
X_val_z   = prepare_X(X_val)

for r in rs:
    w0, w = train_linear_regression_reg(X_train_z, y_train, r=r)
    y_pred = X_val_z.dot(w) + w0
    score  = np.sqrt(np.mean((y_val - y_pred)**2))
    scores[r] = round(score, 2)
    print(f"r={r:>5}  RMSE={scores[r]}")

best_r = min(scores, key=scores.get)

# -----------------------------------------

r=    0  RMSE=0.52
r= 0.01  RMSE=0.52
r=  0.1  RMSE=0.52
r=    1  RMSE=0.52
r=    5  RMSE=0.53
r=   10  RMSE=0.53
r=  100  RMSE=0.53


In [30]:
seeds = range(10)
rmse_scores = []

for seed in seeds:
    X_tr, X_temp, y_tr, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=seed)
    X_va, X_te, y_va, y_te = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=seed)
    
    X_tr_z = prepare_X(X_tr)
    X_va_z = prepare_X(X_va)
    

    w0, w = train_linear_regression_reg(X_tr_z, y_tr, r=0)
    y_pred = X_va_z.dot(w) + w0
    
    rmse_scores.append(np.sqrt(np.mean((y_va - y_pred)**2)))

std_rmse = round(np.std(rmse_scores), 3)
print("RMSE list:", [round(s, 3) for s in rmse_scores])
print("Std:", std_rmse)

RMSE list: [np.float64(0.518), np.float64(0.517), np.float64(0.52), np.float64(0.522), np.float64(0.511), np.float64(0.515), np.float64(0.515), np.float64(0.536), np.float64(0.519), np.float64(0.52)]
Std: 0.006


In [29]:
# ----------  Q6  final test RMSE ----------
np.random.seed(9)                       # lock seed 9 as required
X_full = pd.concat([X_train, X_val])    # merge train + val
y_full = pd.concat([y_train, y_val])

# impute 0, build matrix
X_full_z = prepare_X(X_full)
X_test_z = prepare_X(X_test)

# train with r = 0.001
w0, w = train_linear_regression_reg(X_full_z, y_full, r=0.001)

# predict on TEST set
y_pred = X_test_z.dot(w) + w0
test_rmse = round(np.sqrt(np.mean((y_test - y_pred)**2)), 3)

print("Test RMSE:", test_rmse)

Test RMSE: 0.523
