In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Read the raw CSV and keep only the four predictors plus the target column,
# in the order used by the rest of the notebook.
df = pd.read_csv('car_fuel_efficiency.csv').loc[
    :,
    ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg'],
]
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [None]:
# Count the NaN entries in every column to see which features need imputation.
check_df = df.isna().sum()
print(check_df)

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


In [7]:
# Median horsepower over the whole dataset (pandas skips NaN by default).
df.loc[:, 'horsepower'].median()

149.0

In [20]:
def prepare_and_split_data(df, seed=42, val_size=0.2, test_size=0.2):
    """
    Shuffle and split dataset into train/val/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to split. It is not modified.
    seed : int
        Seed for the row permutation; the same seed yields the same split.
    val_size, test_size : float
        Fractions of the rows assigned to validation and test. Each row
        count is truncated with ``int()``; all remaining rows go to train.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each an independent copy taken
        from consecutive slices of the shuffled frame.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions together ask for
        more rows than the dataset contains.
    """
    if val_size < 0 or test_size < 0:
        raise ValueError("val_size and test_size must be non-negative")

    n = len(df)
    n_val = int(n * val_size)
    n_test = int(n * test_size)
    n_train = n - n_val - n_test
    if n_train < 0:
        # A negative count would make the iloc slices below wrap around
        # and silently return overlapping, wrongly sized parts.
        raise ValueError("val_size + test_size must not exceed 1")

    # A private RandomState yields the same permutation as
    # np.random.seed(seed) followed by np.random.shuffle, but leaves the
    # process-wide NumPy RNG state untouched.
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)
    df_shuffled = df.iloc[idx].reset_index(drop=True)

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()

    return df_train, df_val, df_test

# Build the working train/validation/test partition (seed 42, default
# validation and test fractions) and report how many rows landed in each.
df_train, df_val, df_test = prepare_and_split_data(df, seed=42)

print(
    f"Train size: {len(df_train)}",
    f"Validation size: {len(df_val)}",
    f"Test size: {len(df_test)}",
    sep="\n",
)

Train size: 5824
Validation size: 1940
Test size: 1940


In [21]:
def prepare_X_y(df, fill_value=0):
    """
    Build the feature matrix and target vector from a dataframe.

    Missing 'horsepower' values are replaced with `fill_value`; the input
    frame itself is left unchanged.

    Returns a tuple (X, y): X holds the columns engine_displacement,
    horsepower, vehicle_weight and model_year (in that order) as a NumPy
    array, and y holds fuel_efficiency_mpg as a NumPy array.
    """
    feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

    # assign() works on a copy, so the caller's frame keeps its NaNs.
    imputed = df.assign(horsepower=df['horsepower'].fillna(fill_value))

    return imputed[feature_columns].values, imputed['fuel_efficiency_mpg'].values

def train_linear_regression(X, y):
    """
    Train linear regression with the normal equation.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample.
    y : numpy.ndarray
        Target values, one per row of X.

    Returns
    -------
    numpy.ndarray
        Weight vector whose first entry is the bias term, followed by one
        weight per column of X.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X is singular.
    """
    # Prepend a column of ones so the first weight acts as the bias.
    ones = np.ones(X.shape[0])
    X_with_bias = np.column_stack([ones, X])

    # Normal equation (X^T X) w = X^T y, solved as a linear system rather
    # than by forming the explicit inverse, which is less accurate.
    XTX = X_with_bias.T.dot(X_with_bias)
    XTy = X_with_bias.T.dot(y)
    w = np.linalg.solve(XTX, XTy)

    return w

def predict(X, w):
    """
    Apply a trained weight vector to a feature matrix.

    `w` is expected to start with the bias weight, so a column of ones is
    prepended to `X` before taking the dot product.
    """
    design = np.column_stack((np.ones(X.shape[0]), X))
    return np.dot(design, w)

def rmse(y_true, y_pred):
    """
    Root mean squared error between targets and predictions.
    """
    residuals = y_true - y_pred
    return np.sqrt(np.mean(residuals ** 2))

# Question 3: compare two ways of imputing missing horsepower values. Each
# option trains on the training split and is scored by validation RMSE,
# rounded to 2 decimals.

# Option 1: Fill with 0
X_train_0, y_train = prepare_X_y(df_train, fill_value=0)
X_val_0, y_val = prepare_X_y(df_val, fill_value=0)

w_0 = train_linear_regression(X_train_0, y_train)
y_pred_0 = predict(X_val_0, w_0)
rmse_0 = round(rmse(y_val, y_pred_0), 2)

# Option 2: Fill with mean (computed on training set only!)
# The validation split reuses the training mean so no validation
# information leaks into the imputation.
mean_horsepower = df_train['horsepower'].mean()
X_train_mean, y_train = prepare_X_y(df_train, fill_value=mean_horsepower)
X_val_mean, y_val = prepare_X_y(df_val, fill_value=mean_horsepower)

w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_mean = predict(X_val_mean, w_mean)
rmse_mean = round(rmse(y_val, y_pred_mean), 2)

# Pick the winner on the rounded scores. Equal scores are reported as a tie
# rather than being attributed to the mean option by default.
if rmse_0 < rmse_mean:
    better_option = '0'
elif rmse_mean < rmse_0:
    better_option = 'mean'
else:
    better_option = 'both are equally good'

print("\n=== QUESTION 3 ===")
print(f"RMSE (fill with 0): {rmse_0}")
print(f"RMSE (fill with mean): {rmse_mean}")
print(f"Better option: {better_option}")


=== QUESTION 3 ===
RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.47
Better option: mean


In [22]:
def train_linear_regression_regularized(X, y, r=0):
    """
    Train regularized linear regression (Ridge regression).

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample.
    y : numpy.ndarray
        Target values, one per row of X.
    r : float
        Regularization strength added to the diagonal of X^T X. The
        identity spans the full matrix, so the bias weight is penalized
        along with the feature weights. With r=0 this is the plain normal
        equation.

    Returns
    -------
    numpy.ndarray
        Weight vector whose first entry is the bias term, followed by one
        weight per column of X.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X + r*I is singular.
    """
    # Prepend a column of ones so the first weight acts as the bias.
    ones = np.ones(X.shape[0])
    X_with_bias = np.column_stack([ones, X])

    # Regularized normal equation (X^T X + r*I) w = X^T y, solved as a
    # linear system rather than by forming the explicit inverse.
    XTX = X_with_bias.T.dot(X_with_bias)
    I = np.eye(XTX.shape[0])
    XTy = X_with_bias.T.dot(y)
    w = np.linalg.solve(XTX + r * I, XTy)

    return w

# Question 4: sweep the regularization strength with missing horsepower
# filled by 0, recording the validation RMSE (rounded to 2 decimals) per r.
X_train, y_train = prepare_X_y(df_train, fill_value=0)
X_val, y_val = prepare_X_y(df_val, fill_value=0)

r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

print("\n=== QUESTION 4 ===")
for r in r_values:
    w = train_linear_regression_regularized(X_train, y_train, r=r)
    y_pred = predict(X_val, w)
    rmse_score = round(rmse(y_val, y_pred), 2)
    rmse_scores[r] = rmse_score
    print(f"r={r}: RMSE={rmse_score}")

# Rank by (score, r) so that ties on the rounded RMSE go to the smallest r.
ranked_r = sorted(rmse_scores, key=lambda candidate: (rmse_scores[candidate], candidate))
best_r = ranked_r[0]
print(f"Best r: {best_r} with RMSE: {rmse_scores[best_r]}")



=== QUESTION 4 ===
r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.53
r=1: RMSE=0.53
r=5: RMSE=0.53
r=10: RMSE=0.53
r=100: RMSE=0.53
Best r: 0 with RMSE: 0.52


In [23]:
# Question 5: measure how sensitive the validation RMSE is to the random
# split by repeating the split/train/evaluate cycle for seeds 0 through 9.
seeds = list(range(10))
rmse_scores_seeds = []

print("\n=== QUESTION 5 ===")
for seed in seeds:
    df_train_s, df_val_s, df_test_s = prepare_and_split_data(df, seed=seed)

    # Missing horsepower is filled with 0 in both splits.
    X_train_s, y_train_s = prepare_X_y(df_train_s, fill_value=0)
    X_val_s, y_val_s = prepare_X_y(df_val_s, fill_value=0)

    w_s = train_linear_regression(X_train_s, y_train_s)
    y_pred_s = predict(X_val_s, w_s)

    # Keep the unrounded score for the spread; round only for display.
    rmse_s = rmse(y_val_s, y_pred_s)
    rmse_scores_seeds.append(rmse_s)
    print(f"Seed {seed}: RMSE={round(rmse_s, 2)}")

std_rmse = round(np.std(rmse_scores_seeds), 3)
print(f"Standard deviation of RMSE scores: {std_rmse}")


=== QUESTION 5 ===
Seed 0: RMSE=0.52
Seed 1: RMSE=0.51
Seed 2: RMSE=0.52
Seed 3: RMSE=0.53
Seed 4: RMSE=0.53
Seed 5: RMSE=0.52
Seed 6: RMSE=0.51
Seed 7: RMSE=0.53
Seed 8: RMSE=0.51
Seed 9: RMSE=0.52
Standard deviation of RMSE scores: 0.008


In [24]:
print("\n=== QUESTION 6 ===")
# Final evaluation: split with seed 9, train on train + validation combined,
# and score once on the held-out test split.
df_train_9, df_val_9, df_test_9 = prepare_and_split_data(df, seed=9)

# Stack train and validation rows and renumber them from 0.
df_train_full = pd.concat([df_train_9, df_val_9])
df_train_full = df_train_full.reset_index(drop=True)

# Missing horsepower is filled with 0 in both the training and test data.
X_test_9, y_test_9 = prepare_X_y(df_test_9, fill_value=0)
X_train_full, y_train_full = prepare_X_y(df_train_full, fill_value=0)

# Fit with regularization r=0.001 and report the rounded test RMSE.
w_final = train_linear_regression_regularized(X_train_full, y_train_full, r=0.001)
y_pred_test = predict(X_test_9, w_final)
rmse_test = round(rmse(y_test_9, y_pred_test), 2)

print(f"RMSE on test dataset (seed=9, r=0.001): {rmse_test}")


=== QUESTION 6 ===
RMSE on test dataset (seed=9, r=0.001): 0.53
