In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the raw car fuel-efficiency dataset; the trailing expression shows (rows, columns).
df = pd.read_csv('Data/car_fuel_efficiency.csv')
df.shape

(9704, 11)

In [5]:
# Restrict the frame to the columns used below: four features plus the
# 'fuel_efficiency_mpg' target.
df = df[
    [
        'engine_displacement',
        'horsepower',
        'vehicle_weight',
        'model_year',
        'fuel_efficiency_mpg',
    ]
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   horsepower           8996 non-null   float64
 2   vehicle_weight       9704 non-null   float64
 3   model_year           9704 non-null   int64  
 4   fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 379.2 KB


### Question 1. There's one column with missing values. What is it?

In [6]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

### Question 2. What's the median (50th percentile) for variable 'horsepower'?

In [7]:
# Summary statistics for horsepower; the "50%" row is the median.
df.loc[:, 'horsepower'].describe()

count    8996.000000
mean      149.657292
std        29.879555
min        37.000000
25%       130.000000
50%       149.000000
75%       170.000000
max       271.000000
Name: horsepower, dtype: float64

### Prepare and split the dataset

In [36]:
# Shuffle the dataset (the filtered one you created above), use seed 42.
# Split your data in train/val/test sets, with 60%/20%/20% distribution.

from sklearn.model_selection import train_test_split

def prepare_data(
    df: pd.DataFrame,
    seed: int = 42,
    train_size: float = 0.6,
    val_size: float = 0.2,
    test_size: float = 0.2,
    target: str = 'fuel_efficiency_mpg',
):
    """Shuffle ``df`` and split it into train/validation/test feature and target sets.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the feature columns and the ``target`` column.
    seed : int, default 42
        Random state used for the initial shuffle and for both splits.
    train_size, val_size, test_size : float
        Fractions of the data assigned to each split. Each must be positive
        and together they must sum to 1. Defaults give a 60%/20%/20% split.
    target : str, default 'fuel_efficiency_mpg'
        Name of the column to predict; it is removed from the feature frames.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If any fraction is not positive or the fractions do not sum to 1.
    """
    fractions = (train_size, val_size, test_size)
    if any(fraction <= 0 for fraction in fractions):
        raise ValueError('train_size, val_size and test_size must all be positive')
    if abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError('train_size, val_size and test_size must sum to 1')

    # Shuffle all rows reproducibly and discard the original index.
    df_s = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    X = df_s.drop(columns=[target])
    y = df_s[target]

    # First cut: training set versus everything else.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=(1 - train_size),
        random_state=seed,
    )

    # Second cut: divide the remainder between validation and test,
    # using the test share of that remainder.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=(test_size / (val_size + test_size)),
        random_state=seed,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

### Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)

Which option gives better RMSE?

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Fill with 0
# fillna already returns a new frame, so no separate copy is needed.
df_1 = df.fillna(0)

# Fill with mean
# The mean must come from the training rows only, otherwise validation/test
# information leaks into the imputation. The shuffle and split depend only on
# the row count and the seed, so splitting the un-imputed frame with the
# default seed (42) selects the same training rows that prepare_data(df_2)
# produces with that seed.
X_train_raw = prepare_data(df)[0]
mean_hp = X_train_raw['horsepower'].mean()  # NaNs are skipped by default

df_2 = df.copy()
df_2['horsepower'] = df_2['horsepower'].fillna(mean_hp)


def train_model(X_train, y_train, X_val, y_val):
    """Fit an unregularized linear regression and score it on the validation set.

    The model is fitted on ``(X_train, y_train)`` and used to predict
    ``X_val``; the return value is the root mean squared error between
    ``y_val`` and those predictions.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, regressor.predict(X_val))
    return np.sqrt(val_mse)

In [11]:
# Option 1: missing horsepower replaced by 0 (default seed 42 split).
X_train, X_val, _, y_train, y_val, _ = prepare_data(df=df_1)
rmse_1 = train_model(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

# Option 2: missing horsepower replaced by the mean (same default split).
X_train, X_val, _, y_train, y_val, _ = prepare_data(df=df_2)
rmse_2 = train_model(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

# Report both validation scores rounded to two decimals.
print(f'RMSE Fill with 0: {round(rmse_1, 2)}')
print(f'RMSE Fill with mean: {round(rmse_2, 2)}')

RMSE Fill with 0: 0.51
RMSE Fill with mean: 0.46


### Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

If multiple options give the same best RMSE, select the smallest r.

In [18]:
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge

# Regularization strengths to compare; results maps each r to its
# validation RMSE rounded to two decimals.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
results = {}

# Zero-filled data, default seed (42) split.
X_train, X_val, _, y_train, y_val, _ = prepare_data(df_1)

for r in r_values:
    # r == 0 is ordinary least squares. scikit-learn advises against
    # Ridge(alpha=0) for numerical reasons and recommends LinearRegression.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results[r] = round(rmse, 2)

results

{0: np.float64(0.51),
 0.01: np.float64(0.51),
 0.1: np.float64(0.51),
 1: np.float64(0.51),
 5: np.float64(0.51),
 10: np.float64(0.51),
 100: np.float64(0.51)}

### Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

What's the value of std?

In [37]:
# Re-split the zero-filled data with each seed and record the validation
# RMSE of an unregularized model.
seeds = list(range(10))
results = {}

for seed in seeds:
    X_train, X_val, _, y_train, y_val, _ = prepare_data(df_1, seed=seed)
    rmse = train_model(X_train, y_train, X_val, y_val)
    results[f'rmse-with-seed-{seed}'] = rmse

# Scores in seed order (dicts preserve insertion order).
metrics = list(results.values())

results


{'rmse-with-seed-0': np.float64(0.518029012129703),
 'rmse-with-seed-1': np.float64(0.5090406519449436),
 'rmse-with-seed-2': np.float64(0.5141627479366626),
 'rmse-with-seed-3': np.float64(0.5152929065813446),
 'rmse-with-seed-4': np.float64(0.5184898612166595),
 'rmse-with-seed-5': np.float64(0.5236053324442335),
 'rmse-with-seed-6': np.float64(0.5150251935468099),
 'rmse-with-seed-7': np.float64(0.5249385057075809),
 'rmse-with-seed-8': np.float64(0.5076028899022499),
 'rmse-with-seed-9': np.float64(0.5292310655399466)}

In [38]:
# Spread of the validation RMSE across seeds, rounded to three decimals.
round(np.asarray(metrics).std(), 3)

np.float64(0.007)

### Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.

What's the RMSE on the test dataset?

In [44]:
# Split the zero-filled data with seed 9, keeping the test portion this time.
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(df_1, seed=9)

# Train on train + validation combined.
X_full_train = pd.concat([X_train, X_val], axis=0)
y_full_train = pd.concat([y_train, y_val], axis=0)

model = Ridge(alpha=0.001).fit(X_full_train, y_full_train)

# Score the combined-data model on the held-out test set.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
round(rmse, 2)

np.float64(0.53)