In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the laptops dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('datasets/lab2/laptops.csv')

In [3]:
# Show the raw frame as loaded (the notebook elides the middle rows in the rendered output).
data

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [4]:
# Normalise column names: lower-case them and replace spaces with underscores
# (e.g. 'Final Price' -> 'final_price', 'Storage type' -> 'storage_type').
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [5]:
# Confirm the renamed columns on the first few rows.
data.head()

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [6]:
# Keep only the numeric columns used in this analysis; 'final_price' is the
# prediction target and is split off from the features further down.
features = ['ram', 'storage', 'screen', 'final_price']
selected_columns = data[features]
selected_columns.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [7]:
# Count missing values per selected column.
selected_columns.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [8]:
# Median of the 'ram' column.
selected_columns.ram.median()

np.float64(16.0)

In [9]:
# Shuffle the rows reproducibly and split them into train / validation / test.
# Seeding the global NumPy RNG makes the shuffle below repeatable.
np.random.seed(42)

# Validation and test each take int(20%) of the rows; train takes the remainder,
# so the three sizes always add up to n.
n = len(selected_columns)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle positional indices in place, then reorder the frame with them.
idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = selected_columns.iloc[idx]
# .copy() gives each split its own frame so the column deletions done later
# do not touch `selected_columns`.
train_x = data_shuffled.iloc[:n_train].copy()
val_x = data_shuffled.iloc[n_train:n_train+n_val].copy()
test_x = data_shuffled.iloc[n_train+n_val:].copy()

In [10]:
# Keep the untransformed prices of each split as NumPy arrays.
train_y_orig = train_x.final_price.values
val_y_orig = val_x.final_price.values
test_y_orig = test_x.final_price.values

In [11]:
# Regression targets: log1p(price) for each split. Models below are fitted and
# evaluated on this log scale.
train_y = np.log1p(train_x.final_price.values)
val_y = np.log1p(val_x.final_price.values)
test_y = np.log1p(test_x.final_price.values)

In [12]:
# Remove the target column from the feature frames so it cannot be used as an input.
# NOTE: `del` mutates the frames in place, so this cell is not idempotent --
# running it a second time raises KeyError, and the cells above that read
# `final_price` from these frames must run before it.
target = 'final_price'
del train_x[target]
del val_x[target]
del test_x[target]

In [13]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A column of ones is prepended to ``X`` so that the first solved weight
    acts as the intercept.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets, one entry per row of ``X``.

    Returns
    -------
    (intercept, weights) : the intercept as a scalar and the remaining
        per-feature weights as a 1-D array.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the Gram matrix is singular and cannot be inverted.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    intercept, coefs = weights[0], weights[1:]
    return intercept, coefs

In [14]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets ``y`` and predictions ``y_pred``."""
    residual = y_pred - y
    return np.sqrt(np.square(residual).mean())

In [15]:
def prepare_X_with_0(data):
    """Return the feature matrix of ``data`` with every missing value replaced by 0.

    ``data`` itself is left untouched: ``fillna`` returns a new frame, whose
    underlying NumPy array is returned.
    """
    return data.fillna(0).values

In [16]:
def prepare_X_with_mean(data, mean=None):
    """Return the feature matrix of ``data`` with missing values mean-imputed.

    Parameters
    ----------
    data : DataFrame containing a ``'screen'`` column.
    mean : optional fill value. Pass the ``'screen'`` mean of the TRAINING
        frame when preparing validation/test data, so that the imputation
        statistic is not computed from held-out rows. When omitted, the mean
        of ``data['screen']`` is used.

    Returns
    -------
    2-D NumPy array of the filled frame. ``data`` itself is not modified.
    """
    if mean is None:
        mean = data['screen'].mean()
    # The fill value is applied to every column that has NaNs, not only 'screen'.
    return data.fillna(mean).values

In [17]:
# Baseline 1: impute missing values with 0, fit on train, predict on validation.
X_train_1 = prepare_X_with_0(train_x)
w0_1, w1_1 = train_linear_regression(X_train_1, train_y)

X_val_1 = prepare_X_with_0(val_x)
# Prediction = intercept + features . weights; it is on the log1p scale because
# the model was fitted against train_y = log1p(price).
y_pred_1 = w0_1 + X_val_1.dot(w1_1)

In [18]:
# Validation RMSE of the zero-imputation baseline (log1p scale), rounded to 2 decimals.
rmse_1 = round(rmse(val_y, y_pred_1), 2)
rmse_1

np.float64(0.43)

In [19]:
# Baseline 2: impute missing values with the mean of 'screen', fit on train,
# predict on validation.
X_train_2 = prepare_X_with_mean(train_x)
w0_2, w1_2 = train_linear_regression(X_train_2, train_y)

# The imputation statistic must come from the TRAINING split only; calling
# prepare_X_with_mean(val_x) would fill validation rows with the validation
# mean instead, so the fill is done explicitly here.
screen_mean_train = train_x['screen'].mean()
X_val_2 = val_x.fillna(screen_mean_train).values
y_pred_2 = w0_2 + X_val_2.dot(w1_2)

In [20]:
# Validation RMSE of the mean-imputation baseline (log1p scale), rounded to 2 decimals.
rmse_2 = round(rmse(val_y, y_pred_2), 2)
rmse_2

np.float64(0.43)

In [21]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression via the normal equation with a ridge term.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` times
    the identity is added to the Gram matrix before inversion. The identity
    spans the full matrix, so the intercept entry is regularised as well.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets, one entry per row of ``X``.
    r : amount added to the diagonal of ``X^T X``; 0.0 means no regularisation.

    Returns
    -------
    (intercept, weights) : the intercept as a scalar and the remaining
        per-feature weights as a 1-D array.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    # w = (X^T X + r I)^-1 X^T y
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    intercept, coefs = weights[0], weights[1:]
    return intercept, coefs

In [22]:
# Sweep the regularisation strength and report validation RMSE for each value.
# The models are fitted on X_train_1, which was zero-imputed, so the validation
# matrix must be zero-imputed too; mixing it with mean-imputed validation data
# would evaluate the model on differently prepared inputs. The matrix does not
# depend on r, so it is built once outside the loop.
X_val_3 = prepare_X_with_0(val_x)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0_3, w1_3 = train_linear_regression_reg(X_train_1, train_y, r=r)
    y_pred_3 = w0_3 + X_val_3.dot(w1_3)

    rmse_3 = round(rmse(val_y, y_pred_3), 2)
    print(f"{r}: {rmse_3}")

0: 0.43
0.01: 0.43
0.1: 0.43
1: 0.43
5: 0.45
10: 0.48
100: 0.6


In [23]:
# How sensitive is the validation RMSE to the random split? Repeat the
# shuffle / split / fit / evaluate pipeline for ten seeds and report the
# standard deviation of the resulting scores.

# Split sizes depend only on the dataset length, so compute them once -- and
# before `n` is first used to build the index array.
n = len(selected_columns)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Despite the name, this list collects one validation RMSE per seed; the
# standard deviation is taken over it at the end.
stds = []

for seed in range(10):
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    data_shuffled_1 = selected_columns.iloc[idx]
    train_x_1 = data_shuffled_1.iloc[:n_train].copy()
    val_x_1 = data_shuffled_1.iloc[n_train:n_train+n_val].copy()
    test_x_1 = data_shuffled_1.iloc[n_train+n_val:].copy()

    # Targets on the log1p scale, taken before the target column is removed.
    train_y_1 = np.log1p(train_x_1.final_price.values)
    val_y_1 = np.log1p(val_x_1.final_price.values)
    test_y_1 = np.log1p(test_x_1.final_price.values)

    del train_x_1[target]
    del val_x_1[target]
    del test_x_1[target]

    # Zero-imputation, unregularised fit on train, scored on validation.
    filled_train_x = prepare_X_with_0(train_x_1)
    w0_4, w1_4 = train_linear_regression(filled_train_x, train_y_1)

    filled_val_x = prepare_X_with_0(val_x_1)
    y_pred_4 = w0_4 + filled_val_x.dot(w1_4)

    rmse_4 = rmse(val_y_1, y_pred_4)
    stds.append(rmse_4)
print(round(np.std(stds), 3))

0.014


In [24]:
# Fix the global NumPy RNG so the shuffle in the next cell is reproducible.
np.random.seed(9)

In [25]:
# Final split. The shuffle relies on the RNG seed set in the preceding cell.
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = selected_columns.iloc[idx]

# Slice the frame shuffled in THIS cell. Slicing `data_shuffled_1` instead would
# depend on a variable left over from the seed loop above and break (or silently
# use a stale shuffle) if that loop is skipped or changed.
x_train = data_shuffled.iloc[:n_train].copy()
x_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
X_test = data_shuffled.iloc[n_train+n_val:].copy()

In [26]:
# Merge train and validation rows into one training set for the final model.
# Features and targets are concatenated in the same order (train first, then
# validation) so rows stay aligned; targets are log1p-transformed.
combined_train_val_x = pd.concat([x_train, x_val], axis=0)
combined_train_val_y = np.log1p(np.concatenate([x_train.final_price.values, x_val.final_price.values]))

In [28]:
# Untransformed prices for the new split.
# NOTE: this rebinds train_y_orig / val_y_orig / test_y_orig, which were first
# assigned from the seed-42 split earlier in the notebook.
train_y_orig = x_train.final_price.values
val_y_orig = x_val.final_price.values
test_y_orig = X_test.final_price.values

In [29]:
# Test targets on the log1p scale (rebinds `test_y` from the seed-42 split).
test_y = np.log1p(X_test.final_price.values)

In [30]:
# Remove the target from the feature frames. In-place deletion: re-running this
# cell raises KeyError, and cells that read `final_price` from X_test must run first.
del combined_train_val_x['final_price']
del X_test['final_price']

In [33]:
# Final model: zero-imputation, fitted on train+validation with r=0.001.
X_prep = prepare_X_with_0(combined_train_val_x)
w0, w1 = train_linear_regression_reg(X_prep, combined_train_val_y, r=0.001)

In [34]:
# Prepare the test features the same way as the training features (zero-imputation)
# and predict on the log1p scale.
X_test_prep = prepare_X_with_0(X_test)
y_pred = w0 + X_test_prep.dot(w1)

In [35]:
# RMSE of the final model on the held-out test split (log1p scale).
rmse(test_y, y_pred)

np.float64(0.4553047228090903)