In [32]:

import pandas as pd
import numpy as np

# Remote CSV with the car fuel-efficiency records used by every cell below.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
# Columns kept for the analysis (the last one is the regression target).
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
# Download, narrow to `cols` (in that order) and detach from the parsed frame.
df = pd.read_csv(url).loc[:, cols].copy()

def add_intercept(X: np.ndarray) -> np.ndarray:
    """Return a new matrix made of a leading column of ones followed by X.

    X is expected to be 2-D, shape (n, d); the result has shape (n, d + 1).
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, X), axis=1)

def fit_linear_regression(X: np.ndarray, y: np.ndarray, r: float = 0.0) -> np.ndarray:
    """
    Solve the normal equations for linear regression, optionally with a ridge penalty.

    X : feature matrix WITHOUT an intercept column; the column of ones is added here.
    y : targets, one per row of X.
    r : ridge strength. When r > 0 it is added to the diagonal of X^T X for every
        coefficient except the intercept; otherwise plain least squares is solved.

    Returns the weight vector, with the intercept stored at index 0.
    """
    design = add_intercept(X)  # (n, d+1); column 0 is all ones
    gram = design.T @ design
    moment = design.T @ y

    if r > 0:
        # r on the diagonal, except the top-left entry: the intercept is not penalized.
        penalty = r * np.eye(gram.shape[0])
        penalty[0, 0] = 0.0
        gram = gram + penalty

    return np.linalg.solve(gram, moment)

def predict(X: np.ndarray, w: np.ndarray) -> np.ndarray:
    """Return model outputs for X given weights w, where w[0] multiplies the added ones column."""
    design = add_intercept(X)
    return design @ w

def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Root-mean-squared error between y_true and y_pred, returned as a plain Python float."""
    residual = y_true - y_pred
    mean_squared = np.mean(np.square(residual))
    return float(np.sqrt(mean_squared))

def split_train_val_test(df_in: pd.DataFrame, seed: int = 42) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Randomly partition the rows of df_in into (train, val, test) frames.

    Row positions are shuffled once by a generator seeded with `seed`. The first
    int(0.6 * n) positions become train, the next int(0.2 * n) become validation
    and whatever is left becomes test. Each returned frame is a copy.
    """
    n_rows = len(df_in)
    order = np.arange(n_rows)
    np.random.default_rng(seed).shuffle(order)

    # Cut points inside the shuffled order: [0, train_end) | [train_end, val_end) | rest
    train_end = int(0.6 * n_rows)
    val_end = train_end + int(0.2 * n_rows)
    parts = np.split(order, [train_end, val_end])

    return tuple(df_in.iloc[positions].copy() for positions in parts)

def split_train_val_test_2step(df_in: pd.DataFrame, seed: int = 42):
    """
    Two-stage split returning (train, val, test) copies of df_in's rows.

    Stage 1: shuffle all row positions with a generator seeded by `seed` and take
             the first int(0.2 * n) of them as the test set.
    Stage 2: shuffle the remaining positions again with a NEW generator built from
             the same seed, then take the first 25% as validation and the rest as
             train (25% of the remaining 80% is 20% of the total).
    """
    n_rows = len(df_in)
    order = np.arange(n_rows)
    np.random.default_rng(seed).shuffle(order)

    # Stage 1: carve the test set off the front of the shuffled order.
    test_size = int(0.2 * n_rows)
    test_pos, rest = order[:test_size], order[test_size:]

    # Stage 2: reshuffle what is left, in place, with an identically seeded generator.
    np.random.default_rng(seed).shuffle(rest)
    val_size = int(0.25 * rest.size)
    val_pos, train_pos = rest[:val_size], rest[val_size:]

    return tuple(df_in.iloc[positions].copy() for positions in (train_pos, val_pos, test_pos))


In [33]:
# Q1 CELL: Which column has missing values?
# Count NaNs per column, print the whole table, then keep only the columns with at least one.
na_counts = df.isnull().sum()
print(na_counts.to_string())
missing_cols = [col for col, n_missing in na_counts.items() if n_missing > 0]
print("\nColumn(s) with missing values:", missing_cols)


engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0

Column(s) with missing values: ['horsepower']


In [34]:
# Q2 CELL: Median (50th percentile) for 'horsepower'
# Computed on the full frame, before any split or imputation.
median_hp = df.loc[:, 'horsepower'].median()
print("Median horsepower:", median_hp)


Median horsepower: 149.0


In [35]:
# Q3: compare two imputations of missing horsepower (0 vs. training mean) by validation RMSE

# Single-pass 60/20/20 split with a fixed seed
train_df, val_df, test_df = split_train_val_test(df, seed=42)

# Model inputs and target
feat_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

# Variant A: missing horsepower -> 0
train_0 = train_df.assign(horsepower=train_df['horsepower'].fillna(0))
val_0 = val_df.assign(horsepower=val_df['horsepower'].fillna(0))

# Variant B: missing horsepower -> mean computed on the TRAIN split only,
# so nothing from the validation rows leaks into the statistic
hp_mean = train_df['horsepower'].mean()
train_mean = train_df.assign(horsepower=train_df['horsepower'].fillna(hp_mean))
val_mean = val_df.assign(horsepower=val_df['horsepower'].fillna(hp_mean))

# Targets are not touched by imputation, so both variants share them
y_train = train_df[target_col].to_numpy()
y_val = val_df[target_col].to_numpy()

# Feature matrices per variant
X_train_0 = train_0[feat_cols].to_numpy()
X_val_0 = val_0[feat_cols].to_numpy()
X_train_mean = train_mean[feat_cols].to_numpy()
X_val_mean = val_mean[feat_cols].to_numpy()

# Unregularized fit for each variant
w_0 = fit_linear_regression(X_train_0, y_train, r=0.0)
w_mean = fit_linear_regression(X_train_mean, y_train, r=0.0)

# Validation error for each variant
rmse_0 = rmse(y_val, predict(X_val_0, w_0))
rmse_mean = rmse(y_val, predict(X_val_mean, w_mean))

print("RMSE (fill with 0):   ", round(rmse_0, 2))
print("RMSE (fill with mean):", round(rmse_mean, 2))

# The verdict is taken on the scores rounded to 2 decimals
if round(rmse_mean, 2) < round(rmse_0, 2):
    better_option = "With mean"
elif round(rmse_0, 2) < round(rmse_mean, 2):
    better_option = "With 0"
else:
    better_option = "Both are equally good"
print("\nBetter option:", better_option)


RMSE (fill with 0):    0.52
RMSE (fill with mean): 0.47

Better option: With mean


In [36]:
# Q4: sweep the ridge strength r, with missing horsepower filled by 0

# Recompute the split with the same seed as Q3, which yields the same partition
train_df, val_df, test_df = split_train_val_test(df, seed=42)

feat_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

train_r = train_df.assign(horsepower=train_df['horsepower'].fillna(0))
val_r = val_df.assign(horsepower=val_df['horsepower'].fillna(0))

X_train, y_train = train_r[feat_cols].to_numpy(), train_r[target_col].to_numpy()
X_val, y_val = val_r[feat_cols].to_numpy(), val_r[target_col].to_numpy()

r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Validation RMSE, rounded to 2 decimals, for every candidate r
results = {}
for r in r_values:
    w = fit_linear_regression(X_train, y_train, r=r)
    score = rmse(y_val, predict(X_val, w))
    results[r] = round(score, 2)

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r
best_r = min(results, key=lambda candidate: (results[candidate], candidate))
best_rmse = results[best_r]

print("RMSE by r:", results)
print("Best r:", best_r, "with RMSE:", best_rmse)


RMSE by r: {0: 0.52, 0.01: 0.52, 0.1: 0.52, 1: 0.52, 5: 0.52, 10: 0.52, 100: 0.52}
Best r: 0 with RMSE: 0.52


In [None]:
# Q5: spread of the validation RMSE across split seeds 0..9 (horsepower NaNs -> 0, r = 0)
feat_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

rmses = []
for seed in range(10):
    # Fresh two-step split for this seed
    train_df, val_df, test_df = split_train_val_test_2step(df, seed=seed)

    train_s = train_df.assign(horsepower=train_df['horsepower'].fillna(0))
    val_s = val_df.assign(horsepower=val_df['horsepower'].fillna(0))

    X_train, y_train = train_s[feat_cols].to_numpy(), train_s[target_col].to_numpy()
    X_val, y_val = val_s[feat_cols].to_numpy(), val_s[target_col].to_numpy()

    # Unregularized fit, scored on the validation rows
    w = fit_linear_regression(X_train, y_train, r=0.0)
    score = rmse(y_val, predict(X_val, w))
    rmses.append(score)

# Standard deviation of the ten scores, using np.std with its default arguments
std_val = float(np.std(rmses))
print("Validation RMSE by seed:", [round(s, 3) for s in rmses])
print("Std over seeds (rounded to 3):", round(std_val, 3))


Validation RMSE by seed: [0.516, 0.523, 0.522, 0.529, 0.524, 0.527, 0.516, 0.521, 0.529, 0.517]
Std over seeds (rounded to 3): 0.005


In [None]:
# Q6: final check on the held-out test rows (seed 9, train+val merged, r = 0.001)
seed = 9
feat_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

train_df, val_df, test_df = split_train_val_test_2step(df, seed=seed)

# Stack train and validation rows into one fitting set
trainval_df = pd.concat([train_df, val_df], axis=0)

# Missing horsepower -> 0 in both the fitting set and the test set
for d in (trainval_df, test_df):
    d['horsepower'] = d['horsepower'].fillna(0)

X_trainval, y_trainval = trainval_df[feat_cols].to_numpy(), trainval_df[target_col].to_numpy()
X_test, y_test = test_df[feat_cols].to_numpy(), test_df[target_col].to_numpy()

# Lightly regularized fit on train+val, scored once on test
w = fit_linear_regression(X_trainval, y_trainval, r=0.001)
test_rmse = rmse(y_test, predict(X_test, w))

print("Test RMSE (seed=9, r=0.001):", round(test_rmse, 3))


Test RMSE (seed=9, r=0.001): 0.526
