In [359]:
import pandas as pd
import numpy as np
# Reading dataset
df = pd.read_csv("/home/marcus-vinicius/Desktop/Python/Machine_Leaning_Zoomcamp/car_fuel_efficiency2.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB
None


In [360]:
columns = [
    
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg'

]

df_col= df[columns]
df_col.isnull().any()

engine_displacement    False
horsepower              True
vehicle_weight         False
model_year             False
fuel_efficiency_mpg    False
dtype: bool

In [361]:
df_col['horsepower'].median(), df_col['horsepower'].mean() 

(149.0, 149.65729212983547)

In [362]:
n = len(df_col)

n_test = int(n*0.2)
n_val = int(n*0.2)
n_train = n - n_test - n_val

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_train = df_col.iloc[idx[:n_train]].reset_index(drop = True)
df_val = df_col.iloc[idx[:n_val]].reset_index(drop = True)
df_test = df_col.iloc[idx[:n_test]].reset_index(drop = True)

In [363]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
# Deletting 'fuel_efficiency_mpg' col so we dont have 100% model
del df_train['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']

In [364]:
X_train = df_train.fillna(0).values
X_val = df_val.fillna(0).values
X_test = df_test.fillna(0).values

In [365]:
def dot(xi, w):
    n = len(xi)
    res = 0.0

    for j in range(n):
        res = res + xi[j]*w[j]
    
    return res 

In [366]:
def train_linear_regression(X, y):
    # New col of X
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    # Matrix operations
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    XTX.dot(XTX_inv).round(1)
    # Normal equation
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

X_train = df_train.fillna(0).values
w0, w = train_linear_regression(X_train, y_train)

w0, w

(3.6646662473557985,
 array([ 2.14827083e-06,  2.21209296e-04, -3.23460699e-04,  1.64673231e-05]))

In [367]:
y_train_pred = w0 + X_train.dot(w)
y_train_pred

array([2.90981589, 2.84278709, 2.93310627, ..., 2.92329662, 3.00305539,
       2.94404905])

In [368]:
def rmse(y, y_pred):
    sq = (y - y_pred)**2
    mse = sq.mean()
    return np.sqrt(mse)

rmse(y_train, y_train_pred)

0.0390863253926525

In [369]:
w0, w = train_linear_regression(X_train, y_train)
y_val_pred = w0 + X_val.dot(w)
rmse(y_val, y_val_pred)

0.03909202087549413

In [370]:
X_train = df_train.fillna(149).values
w0, w = train_linear_regression(X_train, y_train)
y_train_pred = w0 + X_train.dot(w)
rmse(y_train, y_train_pred)

0.03581253276160714

In [371]:
X_val = df_val.fillna(149).values
y_val_pred = w0 + X_val.dot(w)
rmse(y_val, y_val_pred)

0.03541741870392893

In [372]:
def train_linear_regression_reg(X, y, r):
    # New col of X
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    # Matrix operations
    XTX = X.T.dot(X)
    #Add r to all values in the diagonal
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    XTX.dot(XTX_inv).round(1)
    # Normal equation
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [373]:
X_train = df_train.fillna(0).values

In [374]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = df_train.fillna(0).values
    w0, w = train_linear_regression_reg(X_train, y_train, r)
    X_val = df_val.fillna(0).values
    y_val_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_val_pred)
    print(r, score)

0 0.03909202087549413
0.01 0.039162091658291535
0.1 0.03994102844855213
1 0.04095010036686897
5 0.04112507210838927
10 0.041148736616890634
100 0.04117040392671834


In [375]:
scores = []
for seed in range(10):
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df_col.iloc[idx[:n_train]].reset_index(drop = True)
    df_val = df_col.iloc[idx[:n_val]].reset_index(drop = True)
    df_test = df_col.iloc[idx[:n_test]].reset_index(drop = True)

    y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
    y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
    y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
    # Deletting 'fuel_efficiency_mpg' col so we dont have 100% model
    del df_train['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    
    X_train = df_train.fillna(0).values
    X_val = df_val.fillna(0).values
    X_test = df_test.fillna(0).values

    w0, w = train_linear_regression(X_train, y_train)
    y_train_pred = w0 + X_train.dot(w)
    
    y_val_pred = w0 + X_val.dot(w)
    val_rmse = rmse(y_val, y_val_pred)
    scores.append(val_rmse)
    
print(scores)

[0.040195094727686884, 0.0393361067258708, 0.03940481794462767, 0.038241698193598565, 0.038640606128982735, 0.03743359660082332, 0.03933990503866994, 0.039091822043245454, 0.039607723048244725, 0.038353163832865494]


In [376]:
scores_array = np.array(scores)

# Use np.std() to calculate the standard deviation
std_dev = np.std(scores_array)

print(f"The RMSE scores are: {scores}")
print(f"The standard deviation of the scores is: {std_dev:.3f}")

The RMSE scores are: [0.040195094727686884, 0.0393361067258708, 0.03940481794462767, 0.038241698193598565, 0.038640606128982735, 0.03743359660082332, 0.03933990503866994, 0.039091822043245454, 0.039607723048244725, 0.038353163832865494]
The standard deviation of the scores is: 0.001


In [377]:
idx = np.arange(n)
np.random.seed(seed)
np.random.shuffle(idx)


def prepare_split(df):
    y = np.log1p(df["fuel_efficiency_mpg"].values)
    X = df.drop(columns=["fuel_efficiency_mpg"]).fillna(0).values
    return X, y

# splits
df_train = df_col.iloc[idx[:n_train]].reset_index(drop=True)
df_val   = df_col.iloc[idx[:n_val]].reset_index(drop=True)
df_test  = df_col.iloc[idx[:n_test]].reset_index(drop=True)

X_train, y_train = prepare_split(df_train)
X_val,   y_val   = prepare_split(df_val)
X_test,  y_test  = prepare_split(df_test)


df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)
X_full_train, y_full_train = prepare_split(df_full_train)


w0, w = train_linear_regression_reg(X_full_train, y_full_train, 0.001)


y_pred = w0 + X_test.dot(w)
print("RMSE:", rmse(y_test, y_pred))


RMSE: 0.03834219659521162
