In [1]:
import pandas as pd
import numpy as np

In [23]:
# Read the laptops CSV and normalise every column name in one pass:
# lower-case it and replace spaces with underscores.
df = pd.read_csv("data/laptops.csv").rename(
    columns=lambda name: name.lower().replace(' ', '_')
)
df

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


### Question 1

There's one column with missing values. What is it?

* `'ram'`
* `'storage'`
* `'screen'`
* `'final_price'`


In [24]:
# Candidate columns from the question; print each one that contains
# at least one missing value.
cols = ['ram', 'storage', 'screen', 'final_price']
for col in cols:
    if df[col].isna().any():
        print(col)

screen


### Question 2

What's the median (50th percentile) of the variable `'ram'`?

- 8
- 16
- 24
- 32

In [25]:
# Print the median of 'ram' and, as a cross-check, its 0.5 quantile.
print(df['ram'].median())
print(df['ram'].quantile(0.5))

16.0
16.0


### Prepare and split the dataset

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data into train/val/test sets, with a 60%/20%/20% distribution.

Use the same code as in the lectures

In [26]:
# Keep only the columns listed in `cols` (this rebinds `df`).
df = df[cols]
n = len(df)
n_val = int(n * 0.2) # 20% of the rows for validation
n_test = int(n * 0.2) # 20% of the rows for testing
n_train = n - n_val - n_test # the remaining rows (about 60%) for training
print(n_val, n_test, n_train)
# Preview a few rows selected by position, in an arbitrary order.
df.iloc[[10, 0, 3, 5]]

432 432 1296


Unnamed: 0,ram,storage,screen,final_price
10,8,256,15.6,349.0
0,8,512,15.6,1009.0
3,16,1000,15.6,1199.0
5,32,1000,17.3,1699.0


In [27]:
# Row positions 0..n-1; these are shuffled in place below.
idx = np.arange(n)

# Seed NumPy's global RNG right before shuffling so the permutation is reproducible.
np.random.seed(42)
np.random.shuffle(idx)

# Carve the shuffled positions into train / validation / test:
# the first n_train, the next n_val, and everything that is left.
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train.head()

Unnamed: 0,ram,storage,screen,final_price
2079,32,1000,15.6,1123.29
668,4,64,14.1,201.05
2073,32,1000,14.0,997.74
1113,16,512,13.3,1016.0
788,32,1000,16.0,2739.0


In [28]:
# Sanity check: row counts of the train / validation / test splits.
len(df_train), len(df_val), len(df_test)

(1296, 432, 432)

In [29]:
# Discard the shuffled original row labels; each split gets a fresh 0..len-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
# Pull the target column out of each split as a NumPy array.
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

In [31]:
# Remove the target from the feature frames so it cannot be used as an input.
# Not idempotent: re-running this cell raises KeyError once the column is gone.
del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

In [32]:
# Sanity check: number of training targets.
len(y_train)

1296

### Question 3

* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training set only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good

In [33]:
def train_linear_regression(X, y):
    """Fit an unregularised linear regression via the normal equation.

    A column of ones is prepended to ``X`` so that the first fitted
    weight plays the role of the bias term.

    Parameters
    ----------
    X : 2-D array of features, one row per observation.
    y : 1-D array of targets, one value per row of ``X``.

    Returns
    -------
    (w0, w) : the bias weight and the array of remaining weights,
        in the column order of ``X``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # Normal equation: weights = (A^T A)^-1 A^T y
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    w0, w = weights[0], weights[1:]
    return w0, w

In [34]:
def prepare_X(df, na_val):
    """Return the feature matrix of ``df`` with missing values replaced.

    ``na_val`` is handed straight to ``DataFrame.fillna``. The caller's
    frame is not modified; the filled copy is returned as a NumPy array.
    """
    return df.fillna(na_val).values

In [35]:
def training(df_train, y_train, na_val):
    """Fill missing values in ``df_train`` with ``na_val`` and fit the
    unregularised linear regression.

    Returns the ``(w0, w)`` pair produced by ``train_linear_regression``.
    """
    features = prepare_X(df_train, na_val)
    return train_linear_regression(features, y_train)

def predict(df_test, w0, w, na_val):
    """Predict targets for ``df_test`` with a fitted linear model.

    Missing values are filled with ``na_val`` before computing the
    prediction ``w0 + X.dot(w)``.
    """
    features = prepare_X(df_test, na_val)
    return w0 + features.dot(w)

def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``."""
    return np.sqrt(np.square(y - y_pred).mean())

def scoring(df_train, y_train, df_val, y_val, na_val):
    """Train on the training split and report the validation RMSE.

    Missing values in both splits are filled with ``na_val``. The RMSE,
    rounded to 2 decimals, is printed and returned.

    Note: the printed label is always "score_zero", whatever fill value
    is passed in.
    """
    # Fit on the training data.
    w0, w = training(df_train, y_train, na_val)
    # Evaluate on the validation data.
    val_rmse = rmse(y_val, predict(df_val, w0, w, na_val))
    rounded = round(val_rmse, 2)
    print("score_zero: ", rounded)
    return rounded


In [36]:
# Option 1: fill missing values with 0.
na_val = 0
scoring(df_train, y_train, df_val, y_val, na_val)


score_zero:  597.36


597.36

In [37]:
# Option 2: fill missing values with the column means of the training
# features only (df_train.mean() yields one mean per column).
# The printed label still reads "score_zero" because scoring() hardcodes it.
na_val = df_train.mean()
scoring(df_train, y_train, df_val, y_val, na_val)

score_zero:  600.27


600.27

### Question 4

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0. 
* Try different values of `r` from this list: `[0, 0.01, 0.1, 1, 5, 10, 100]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

If there are multiple options, select the smallest `r`.

Options:

- 0
- 0.01
- 1
- 10
- 100

In [38]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression via the normal equation with regularisation.

    Same as the unregularised fit, except that ``r`` times the identity
    matrix is added to the Gram matrix before inverting it. The addition
    covers the whole diagonal, including the entry of the bias column.

    Parameters
    ----------
    X : 2-D array of features, one row per observation.
    y : 1-D array of targets.
    r : regularisation strength added to the diagonal (default 0.001).

    Returns
    -------
    (w0, w) : the bias weight and the array of remaining weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T.dot(design)
    gram = gram + np.eye(gram.shape[0]) * r

    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    w0, w = weights[0], weights[1:]
    return w0, w

In [39]:
def scoring_reg(df_train, y_train, df_val, y_val, r, na_val):
    """Fit the regularised model on the training data and return the
    RMSE on the evaluation data, rounded to 2 decimals.

    ``r`` is the regularisation strength; ``na_val`` fills missing values
    in both frames.
    """
    w0, w = train_linear_regression_reg(prepare_X(df_train, na_val), y_train, r)

    predictions = w0 + prepare_X(df_val, na_val).dot(w)
    return round(rmse(y_val, predictions), 2)

In [40]:
# Sweep the regularisation strength; missing values are filled with 0
# and each score is the rounded RMSE on the validation split.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    score = scoring_reg(df_train, y_train, df_val, y_val, r, 0)
    print("r: ", r, "score: ", score)

r:  0 score:  597.36
r:  0.01 score:  597.36
r:  0.1 score:  597.35
r:  1 score:  597.21
r:  5 score:  597.01
r:  10 score:  597.06
r:  100 score:  597.9


### Question 5 

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

What's the value of std?

- 19.176
- 29.176
- 39.176
- 49.176

> Note: Standard deviation shows how different the values are.
> If it's low, then all values are approximately the same.
> If it's high, the values are different. 
> If standard deviation of scores is low, then our model is *stable*.

In [41]:
def split_dataset(df, s):
    """Shuffle ``df`` with seed ``s`` and split it 60%/20%/20%.

    The split sizes are derived from ``len(df)`` itself, so the function
    does not depend on notebook-level variables and works for a frame of
    any length.

    Parameters
    ----------
    df : DataFrame that contains a ``final_price`` column. Not modified.
    s : seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG).

    Returns
    -------
    (df_train, df_val, df_test, y_train, y_val, y_test)
        The three feature frames have a fresh 0..len-1 index and no
        ``final_price`` column; the ``y_*`` values are the matching
        ``final_price`` arrays.
    """
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    # Train takes the remainder so the three sizes always add up to n.
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    # reset_index returns new frames, so removing the target column below
    # never touches the caller's `df`.
    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

    # Take the target out of each split and return it separately.
    y_train = df_train.pop('final_price').values
    y_val = df_val.pop('final_price').values
    y_test = df_test.pop('final_price').values

    return df_train, df_val, df_test, y_train, y_val, y_test

In [42]:
def q5_scoring(s):
    """Split the notebook-level ``df`` with seed ``s`` and return the
    validation RMSE of the unregularised model with missing values
    filled with 0.
    """
    df_train, df_val, _, y_train, y_val, _ = split_dataset(df, s)
    return scoring(df_train, y_train, df_val, y_val, 0)

# One validation RMSE per seed (scoring() also prints each one as it goes).
scores = [q5_scoring(s) for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
    
# Spread of the scores across seeds.
print(round(np.std(scores), 3))

score_zero:  565.45
score_zero:  636.8
score_zero:  588.96
score_zero:  597.81
score_zero:  571.96
score_zero:  573.24
score_zero:  647.34
score_zero:  550.44
score_zero:  587.33
score_zero:  576.1
29.176


### Question 6

* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

Options:

- 598.60
- 608.60
- 618.60
- 628.60

In [43]:
# Re-split with seed 9; this rebinds the notebook-level train/val/test variables.
df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df, 9)

# Stack train and validation rows into one training frame.
df_comb = pd.concat([df_train, df_val])
# Type checks: frames go through pd.concat, arrays through np.concatenate.
print(type(df_train), type(df_val))
print(type(y_train), type(y_val))
y_comb = np.concatenate([y_train, y_val])

# Fit on train+validation with r=0.001 and NAs filled with 0; score on the test split.
scoring_reg(df_comb, y_comb, df_test, y_test, 0.001, 0)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


608.61