In [14]:
import numpy as np
import pandas as pd

In [15]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [16]:
columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[columns]
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [17]:
# 1. Missing values - horsepower
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [18]:
# 2. Median horsepower - 149
df.horsepower.median()

np.float64(149.0)

In [19]:
# Split dataset
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]


In [20]:
y_train = df_train['fuel_efficiency_mpg']
y_val = df_val['fuel_efficiency_mpg']
y_test = df_test['fuel_efficiency_mpg']

# Remove labels
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [21]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [22]:
def train_lr(X, y):
  ones = np.ones(X.shape[0])
  X = np.column_stack([ones, X])

  XTX = X.T.dot(X)
  XTX_inv = np.linalg.inv(XTX)
  w_full = XTX_inv.dot(X.T).dot(y)

  return w_full[0], w_full[1:]

In [23]:
def rmse(y, y_pred):
  error = y - y_pred
  se = error ** 2
  return round(np.sqrt(se.mean()), 2)

In [24]:
# Use training dataset with 0 for hp 
X_train = df_train.fillna(0).values
w0, w = train_lr(X_train, y_train)
y_pred = w0 + X_train.dot(w)

w0, w, y_pred
rmse(y_train, y_pred)

np.float64(0.52)

In [25]:
# Use training dataset with mean for hp
mean_hp = df_train.horsepower.mean()
X_train = df_train.fillna(mean_hp).values
w0, w = train_lr(X_train, y_train)
y_pred = w0 + X_train.dot(w)

w0, w, y_pred
rmse(y_train, y_pred)

np.float64(0.46)

In [26]:
# Use validation dataset
mean_hp = df_val.horsepower.mean()
X_val = df_val.fillna(mean_hp).values
w0, w = train_lr(X_val, y_val)
y_pred = w0 + X_val.dot(w)

w0, w, y_pred
rmse(y_val, y_pred)

np.float64(0.46)

In [28]:
# 4 linear regression with regularization
def train_lr_r(X, y, r = 0.001):
  ones = np.ones(X.shape[0])
  X = np.column_stack([ones, X])

  XTX = X.T.dot(X)
  XTX = XTX + r * np.eye(XTX.shape[0])
  XTX_inv = np.linalg.inv(XTX)
  w_full = XTX_inv.dot(X.T).dot(y)

  return w_full[0], w_full[1:]

In [None]:
# Use training dataset with 0 for hp, and regularization
X_train = df_train.fillna(0).values

rs = [0, 0.01, 0.1, 1, 5, 10, 100]

#4: 0, 0.01, 0.1 - best score
for r in rs:
  w0, w = train_lr_r(X_train, y_train, r)
  y_pred = w0 + X_train.dot(w)

  print('R', r, rmse(y_train, y_pred))
  
  

R 0 0.52
R 0.01 0.52
R 0.1 0.52
R 1 0.53
R 5 0.53
R 10 0.53
R 100 0.53


In [None]:
# Use validation dataset with reg
X_val = df_val.fillna(0).values
w0, w = train_lr_r(X_val, y_val, 0)
y_pred = w0 + X_val.dot(w)

w0, w, y_pred
rmse(y_val, y_pred)

np.float64(0.52)

In [None]:
# 5 different seeds: std 0.005
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

res = []
for s in seeds:
  np.random.seed(s)
  s_idx = np.random.permutation(idx)

  df_train = df.iloc[s_idx[:n_train]]
  df_val = df.iloc[s_idx[n_train:n_train + n_val]]
  df_test = df.iloc[s_idx[n_train + n_val:]]

  y_train = df_train['fuel_efficiency_mpg']
  y_val = df_val['fuel_efficiency_mpg']
  y_test = df_test['fuel_efficiency_mpg']

  # Remove labels
  del df_train['fuel_efficiency_mpg']
  del df_val['fuel_efficiency_mpg']
  del df_test['fuel_efficiency_mpg']

  X_train = df_train.fillna(0).values
  w0, w = train_lr(X_train, y_train)
  y_pred = w0 + X_train.dot(w)

  w0, w, y_pred
  rmse_s = rmse(y_train, y_pred)
  res.append(rmse_s)

round(np.std(res), 3)

np.float64(0.005)

In [None]:
# 6: full rmse - 0.52
np.random.seed(9)
s_idx = np.random.permutation(idx)

df_train = df.iloc[s_idx[:n_train]]
df_val = df.iloc[s_idx[n_train:n_train + n_val]]
df_test = df.iloc[s_idx[n_train + n_val:]]

y_train = df_train['fuel_efficiency_mpg']
y_val = df_val['fuel_efficiency_mpg']
y_test = df_test['fuel_efficiency_mpg']

# Remove labels
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

X_full_train = pd.concat([df_train, df_val]).fillna(0).values
y_full_train = np.concatenate([y_train, y_val])

w0, w = train_lr_r(X_full_train, y_full_train, 0.001)
y_full_pred = w0 + X_full_train.dot(w)

# w0, w, y_pred
rmse_full = rmse(y_full_train, y_full_pred)
rmse_full

np.float64(0.52)