In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('datasets/lab2/laptops.csv')

In [3]:
data

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [4]:
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [5]:
data.head()

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [6]:
features = ['ram', 'storage', 'screen', 'final_price']
selected_columns = data[features]
selected_columns.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [7]:
selected_columns.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [8]:
selected_columns.ram.median()

np.float64(16.0)

In [9]:
np.random.seed(42)

n = len(selected_columns)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

In [10]:
data_train = selected_columns.iloc[:n_train]
data_val = selected_columns.iloc[n_train:n_train+n_val]
data_test = selected_columns.iloc[n_train+n_val:]

In [11]:
idx = np.arange(n)

In [12]:
np.random.shuffle(idx)

In [13]:
data_train = selected_columns.iloc[idx[:n_train]]
data_val = selected_columns.iloc[idx[n_train:n_train+n_val]]
data_test = selected_columns.iloc[idx[n_train+n_val:]]

In [14]:
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

In [15]:
y_train = np.log1p(data_train.final_price.values)
y_val = np.log1p(data_val.final_price.values)
y_test = np.log1p(data_test.final_price.values)

In [16]:
del data_train['final_price']
del data_val['final_price']
del data_test['final_price']

## Linear regression

In [17]:
w0 = 7.17
w = [0.01, 0.04]

In [18]:
w_new = [w0] + w
w_new

[7.17, 0.01, 0.04]

In [19]:
def linear_regression(X):
    return X.dot(w_new)

In [20]:
linear_regression(data_train)

0       240.064
1        29.884
2       240.000
3       120.372
4       240.080
         ...   
1291     60.480
1292    240.080
1293    120.464
1294    120.400
1295    120.400
Length: 1296, dtype: float64

In [27]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [23]:
def prepare_X_with_0(data):
    data = data.fillna(0)
    X = data.values
    return X

In [24]:
def prepare_X_with_mean(data):
    data = data.fillna(data['screen'].mean())
    X = data.values
    return X

In [25]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [28]:
X_prepared = prepare_X_with_0(data_train)
w0, w = train_linear_regression(X_prepared, y_train)

X_val_prepared = prepare_X_with_0(data_val)
y_pred = w0 + X_val_prepared.dot(w)
round(rmse(y_val, y_pred), 2)

np.float64(0.43)

In [29]:
X_prepared = prepare_X_with_mean(data_train)
w0, w = train_linear_regression(X_prepared, y_train)

X_val_prepared = prepare_X_with_mean(data_val)
y_pred = w0 + X_val_prepared.dot(w)
round(rmse(y_val, y_pred), 2)

np.float64(0.43)

## Reg linear regression

In [34]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [31]:
rs = [0, 0.01, 0.1, 1, 5, 10, 100]
X_prepared = prepare_X_with_0(data_train)
X_val_prepared = prepare_X_with_0(data_val)

In [36]:
for r in rs:
    w0, w = train_linear_regression_reg(X_prepared, y_train, r=r)
    y_pred = w0 + X_val_prepared.dot(w)
    print(f"{r} {round(rmse(y_val, y_pred), 2)}")

0 0.43
0.01 0.43
0.1 0.43
1 0.43
5 0.46
10 0.51
100 0.67


In [43]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [44]:
rmses = []

In [46]:
for seed in seeds:
    data_train = selected_columns.iloc[:n_train]
    data_val = selected_columns.iloc[n_train:n_train+n_val]
    data_test = selected_columns.iloc[n_train+n_val:]

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    data_train = selected_columns.iloc[idx[:n_train]].reset_index(drop=True)
    data_val = selected_columns.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    data_test = selected_columns.iloc[idx[n_train+n_val:]].reset_index(drop=True)
    
    y_train = np.log1p(data_train.final_price.values)
    y_val = np.log1p(data_val.final_price.values)
    y_test = np.log1p(data_test.final_price.values)
    
    del data_train['final_price']
    del data_val['final_price']
    del data_test['final_price']

    X_prepared = prepare_X_with_0(data_train)
    X_val_prepared = prepare_X_with_0(data_val)
    w0, w  = train_linear_regression(X_prepared, y_train)
    y_pred = w0 + X_val_prepared.dot(w)
    rmses.append(rmse(y_val, y_pred))

In [47]:
round(np.std(rmses), 3)

np.float64(0.014)

In [48]:
np.random.seed(9)

In [49]:
data_train = selected_columns.iloc[:n_train]
data_val = selected_columns.iloc[n_train:n_train+n_val]
data_test = selected_columns.iloc[n_train+n_val:]

idx = np.arange(n)
np.random.seed(seed)
np.random.shuffle(idx)

data_train = selected_columns.iloc[idx[:n_train]].reset_index(drop=True)
data_val = selected_columns.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
data_test = selected_columns.iloc[idx[n_train+n_val:]].reset_index(drop=True)

y_train = np.log1p(data_train.final_price.values)
y_val = np.log1p(data_val.final_price.values)
y_test = np.log1p(data_test.final_price.values)

del data_train['final_price']
del data_val['final_price']
del data_test['final_price']

In [53]:
X_train_val = pd.DataFrame(np.concat([data_train, data_val]))

In [54]:
y_train_val = pd.DataFrame(np.concat([y_train, y_pred]))

In [55]:
X_prepared = prepare_X_with_0(X_train_val)
X_test_prep = prepare_X_with_0(data_test)

In [56]:
w0, w = train_linear_regression_reg(X_prepared, y_train_val, r=0.001)
y_pred = w0 + X_test_prep.dot(w)

In [57]:
rmse(y_test, y_pred)

np.float64(0.8391907836810745)