In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv')

In [3]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [34]:
base = df[['ram', 'storage', 'screen', 'final_price']]
base

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.00
1,8,256,15.6,299.00
2,8,256,15.6,789.00
3,16,1000,15.6,1199.00
4,16,512,15.6,669.01
...,...,...,...,...
2155,16,1000,17.3,2699.99
2156,16,1000,17.3,2899.99
2157,32,1000,17.3,3399.99
2158,16,1000,13.4,1899.99


In [35]:
base.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [8]:
ram_median_value = base['ram'].median()
ram_median_value

16.0

In [9]:
n = len(df)
n_test = int(n *0.2)
n_val = int(n * 0.2)
n_train = n - n_test - n_val

In [10]:
n_train, n_val, n_test

(1296, 432, 432)

In [11]:
df_train = base.iloc[n_train:] 
df_val = base.iloc[n_train:n_train + n_val]
df_test = base.iloc[n_train+n_val:]

In [12]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
idx

array([2079,  668, 2073, ..., 1130, 1294,  860])

In [14]:
df_train = base.iloc[idx[:n_train]].copy()
df_val = base.iloc[idx[n_train:n_train + n_val]].copy()
df_test = base.iloc[idx[n_train + n_val:]].copy()

In [15]:
df_train=df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index( drop = True)

In [16]:
len(df_train), len(df_val), len(df_test)

(1296, 432, 432)

In [36]:
mean_values = df_train['screen'].mean()

In [37]:
y_train = np.log1p(df_train.final_price.values)
y_val = np.log1p(df_val.final_price.values)
y_test = np.log1p(df_test.final_price.values)

In [47]:
X_train = df_train[['ram', 'storage', 'screen']].fillna(0).values
X_val = df_val[['ram', 'storage', 'screen']].fillna(0).values
X_test = df_test[['ram', 'storage', 'screen']].fillna(0).values


In [48]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [49]:
w0, w = train_linear_regression(X_train, y_train)


In [50]:
w0, w

(6.727163706120431, array([ 0.02630589,  0.00066041, -0.03651909]))

In [51]:
X_train1= df_train[['ram', 'storage', 'screen']].fillna(mean_values).values
X_val1 = df_val[['ram', 'storage', 'screen']].fillna(mean_values).values
X_test1 = df_test[['ram', 'storage', 'screen']].fillna(mean_values).values



In [52]:
w0 ,w = train_linear_regression(X_train1, y_train)


In [53]:
w0 , w

(6.727163706120431, array([ 0.02630589,  0.00066041, -0.03651909]))

In [54]:
y_pred = w0 + X_train.dot(w)

In [55]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)


## rmse for 0

In [56]:
score = rmse(y_train, y_pred)
round(score, 2)

0.45

## for mean

In [57]:
y_pred = w0 + X_train1.dot(w)

score = rmse(y_train, y_pred)
round(score, 2)

0.45

## question 4

In [58]:
def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])    
    
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [59]:
for r in [0.0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X_train,y_train, r=r)

    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val,y_pred)
    score = round(score,2)
    
    print(f"r = {r},  w0 = {w0},  score = {score}")

r = 0.0,  w0 = 6.727163706120431,  score = 0.43
r = 0.01,  w0 = 6.721004333268212,  score = 0.43
r = 0.1,  w0 = 6.666075503835682,  score = 0.43
r = 1,  w0 = 6.162616499467005,  score = 0.43
r = 5,  w0 = 4.616086176499101,  score = 0.46
r = 10,  w0 = 3.5167547207932204,  score = 0.51
r = 100,  w0 = 0.6849263362652398,  score = 0.67


## question 5

In [60]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmse_scores = []

In [61]:
for seed in seeds:
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    df_train = df.iloc[idx[:n_train]].copy()
    df_val = df.iloc[n_train:n_train + n_val].copy()
    df_test = df.iloc[n_train + n_val:].copy()
    
    X_train = df_train[['ram', 'storage', 'screen']].fillna(0).values
    y_train = np.log1p(df_train.final_price.values)
    X_val =  df_val[['ram', 'storage', 'screen']].fillna(0).values
    y_val =  np.log1p(df_val.final_price.values)
    
    w0, w = train_linear_regression(X_train, y_train)
    
    y_pred = w0 + X_val.dot(w)
    
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)
    
    print(f"Seed: {seed}, RMSE: {round(score, 3)}")

Seed: 0, RMSE: 0.518
Seed: 1, RMSE: 0.517
Seed: 2, RMSE: 0.513
Seed: 3, RMSE: 0.515
Seed: 4, RMSE: 0.517
Seed: 5, RMSE: 0.517
Seed: 6, RMSE: 0.516
Seed: 7, RMSE: 0.516
Seed: 8, RMSE: 0.517
Seed: 9, RMSE: 0.51


In [63]:
std = np.std(rmse_scores)
std_rounded = round(std, 3)
print(f"\nStandard deviation of RMSE scores: {std_rounded}")


Standard deviation of RMSE scores: 0.002


## questin 6

In [64]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [65]:
seed = 9

idx = np.arange(n)
np.random.seed(seed)
np.random.shuffle(idx)

In [66]:
df_train = base.iloc[idx[:n_train]].copy()
df_val = base.iloc[idx[n_train:n_train + n_val]].copy()
df_test = base.iloc[idx[n_train + n_val:]].copy()

In [67]:
df_new = pd.concat([df_train, df_val], axis=0)
df_new.fillna(0, inplace=True)

In [68]:
X_new = df_new[['ram', 'storage', 'screen']].values
y_new = np.log1p(df_new.final_price.values)

In [69]:
r = 0.001
w0, w = train_linear_regression_reg(X_new, y_new, r=r)

In [70]:
df_test.fillna(0, inplace=True)  # Fill missing values in test set with 0
X_test = df_test[['ram', 'storage', 'screen']].values
y_test = np.log1p(df_test.final_price.values)