# Homework #2



In [55]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [56]:
# Load the data
df = pd.read_csv('laptops.csv')

In [57]:
# Normalize the columns
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [58]:
# Use only the required columns
df = df[["ram", "storage", "screen", "final_price"]]

In [59]:
df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [60]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [61]:
df.ram.median()

np.float64(16.0)

# Question 3
## Prepare and split the dataset

In [62]:
def split_data(train_pct, val_pct, test_pct, seed=None):
    if train_pct + val_pct + test_pct != 100:
        raise ValueError("The sum of the percentages should be equal to 100.")
    
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - (n_val + n_test)
    
    # shuffle and index
    idx = np.arange(n)
    
    if seed is not None:
        np.random.seed(seed)
    
    np.random.shuffle(idx)
    
    # split the data
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    # reset the index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    return df_train, df_val, df_test
    


In [63]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [64]:
def prepare_X_fill_with_0(df:pd.DataFrame):
    df_num = df.fillna(0)
    X = df_num.values
    return X

def prepare_X_fill_with_mean(df:pd.DataFrame):
    df_num = df.fillna(df.mean())
    X = df_num.values
    return X

In [65]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)


In [66]:
# Split the data
df_train, df_val, df_test = split_data(60, 20, 20, seed=42)

In [71]:
# adjust y values
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

# drop the target column
del df_train["final_price"]
del df_val["final_price"]
del df_test["final_price"]


AttributeError: 'DataFrame' object has no attribute 'final_price'

In [72]:
# Filled with 0
X_train_0 = prepare_X_fill_with_0(df_train)
w_0, w = train_linear_regression(X_train_0, y_train)

y_pred = w_0 + X_train_0.dot(w)
print("Train RMSE (filled with 0):", round(rmse(y_train, y_pred),2))

Train RMSE (filled with 0): 602.39


In [73]:
# Filled with mean
X_train_mean = prepare_X_fill_with_mean(df_train)
w_0, w = train_linear_regression(X_train_mean, y_train)

y_pred = w_0 + X_train_mean.dot(w)
print("Train RMSE (filled with 0):", round(rmse(y_train, y_pred),2))

Train RMSE (filled with 0): 602.18


# QUESTION 4


In [78]:
rs = [0, 0.01, 0.1, 1, 10, 100]
rmses = list()
for r in rs:
    X_train_0 = prepare_X_fill_with_0(df_train)
    w_0, w = train_linear_regression_reg(X_train_0, y_train, r=r)
    
    X_val_0 = prepare_X_fill_with_0(df_val)
    y_pred = w_0 + X_val_0.dot(w)
    rmsee = rmse(y_val, y_pred)
    print(f"{r=}\t rmse={round(rmsee,2)}")
    rmses.append(rmsee)
    
print(min(rmses))

r=0	 rmse=597.36
r=0.01	 rmse=597.36
r=0.1	 rmse=597.35
r=1	 rmse=597.21
r=10	 rmse=597.06
r=100	 rmse=597.9
597.0587680661115


In [79]:
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_values = list()
for seed in seed_values:
    df_train, df_val, df_test = split_data(60, 20, 20, seed=seed)
    y_train = df_train.final_price.values
    y_val = df_val.final_price.values
    y_test = df_test.final_price.values
    del df_train["final_price"]
    del df_val["final_price"]
    del df_test["final_price"]

    X_train_0 = prepare_X_fill_with_0(df_train)
    w_0, w = train_linear_regression(X_train_0, y_train)
    
    X_val = prepare_X_fill_with_0(df_val)
    y_pred = X_val.dot(w) + w_0
    
    rmse_value = rmse(y_val, y_pred)
    rmse_values.append(rmse_value)
    print(f"{seed=}\t rmse={round(rmse_value,2)}")
    
rmse_std = np.std(rmse_values)
print(f"rmse_std={round(rmse_std, 3)}, ({rmse_std})")

seed=0	 rmse=565.45
seed=1	 rmse=636.8
seed=2	 rmse=588.96
seed=3	 rmse=597.81
seed=4	 rmse=571.96
seed=5	 rmse=573.24
seed=6	 rmse=647.34
seed=7	 rmse=550.44
seed=8	 rmse=587.33
seed=9	 rmse=576.1
rmse_std=29.176, (29.17649125829274)


In [80]:
# Question 6
df_train, df_val, df_test = split_data(60, 20, 20, seed=9)
df_full_train = pd.concat([df_train, df_val])

y_full_train = df_full_train.final_price.values
y_test = df_test.final_price.values

del df_full_train["final_price"]
del df_test["final_price"]

X_full_train = prepare_X_fill_with_0(df_full_train)
X_test = prepare_X_fill_with_0(df_test)

w_0, w = train_linear_regression(X_full_train, y_full_train)
y_pred = w_0 + X_test.dot(w)
rm = rmse(y_test, y_pred)

print(rm)


608.6102791248045
