In [1]:
import pandas as pd
import numpy as np

### Data preparation

In [2]:
# URL of the AB_NYC_2019.csv dataset that the next cells download and load.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

In [3]:
# Shell escape: fetch the URL stored in `data` and save it as AB_NYC_2019.csv in the working directory.
!wget $data -O AB_NYC_2019.csv

--2021-09-19 17:36:39--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv’


2021-09-19 17:36:40 (12.4 MB/s) - ‘AB_NYC_2019.csv’ saved [7077973/7077973]



In [4]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv('AB_NYC_2019.csv')

In [5]:
# Normalize column names: lower-case them and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [6]:
# Names of all columns stored with the generic `object` dtype (the text columns).
is_object_column = df.dtypes == 'object'
strings = df.dtypes[is_object_column].index.tolist()
strings

['name',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'room_type',
 'last_review']

In [7]:
# Apply the same normalization used for the column names to the values of
# every text column: lower-case, with spaces replaced by underscores.
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [8]:
# Keep only the numeric columns used in this notebook; `price` is the target.
selected_columns = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
df = df[selected_columns]


### Question 1  
  
__Find a feature with missing values. How many missing values does it have?__

In [9]:
# Count missing values per column.
df.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

### Question 2  
  
__What's the median (50th percentile) for variable 'minimum_nights'?__  
 
__Split the data__  
- Shuffle the initial dataset, use seed 42.  
- Split your data in train/val/test sets, with 60%/20%/20% distribution.  
- Make sure that the target value ('price') is not in your dataframe.  
- Apply the log transformation to the price variable using the np.log1p() function.

In [10]:
def split_dataset(df, seed):
    """Shuffle ``df`` and split it 60%/20%/20% into train/validation/test parts.

    The ``price`` column is removed from every returned frame and handed back
    separately as ``np.log1p``-transformed target arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``price`` column. It is not modified.
    seed : int
        Seed controlling the shuffle.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test, y_train, y_val, y_test)`` where the
        frames have a fresh 0..k-1 index and the ``y_*`` values are NumPy arrays.
    """
    n = len(df)

    n_val = int(n * 0.2)
    n_test = n_val
    n_train = n - n_val - n_test

    idx = np.arange(n)

    # A private RandomState produces the same permutation as
    # np.random.seed(seed) followed by np.random.shuffle(idx), but does not
    # reseed the global NumPy generator as a side effect of calling this function.
    rng = np.random.RandomState(seed)
    rng.shuffle(idx)

    def _features_and_target(positions):
        # Select rows by shuffled position, renumber them, and separate the target.
        part = df.iloc[positions].reset_index(drop=True)
        target = np.log1p(part['price'].values)
        return part.drop(columns='price'), target

    df_train, y_train = _features_and_target(idx[:n_train])
    df_val, y_val = _features_and_target(idx[n_train:n_train + n_val])
    df_test, y_test = _features_and_target(idx[n_train + n_val:])

    return df_train, df_val, df_test, y_train, y_val, y_test

In [11]:
# Split with seed 42, the seed requested in the Question 2 instructions.
df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df, 42)

In [12]:
# Number of rows that ended up in the training part.
len(y_train)

29337

*Stats __original__ dataframe*

In [13]:
# Summary statistics on the full dataset; the 50% row is the median.
df.minimum_nights.describe()

count    48895.000000
mean         7.029962
std         20.510550
min          1.000000
25%          1.000000
50%          3.000000
75%          5.000000
max       1250.000000
Name: minimum_nights, dtype: float64

*Stats __train__ dataframe*

In [14]:
# Summary statistics on the training part; the 50% row is the median.
df_train.minimum_nights.describe()

count    29337.000000
mean         6.989740
std         21.014583
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max       1000.000000
Name: minimum_nights, dtype: float64

*Stats __validation__ dataframe*

In [15]:
# Summary statistics on the validation part; the 50% row is the median.
df_val.minimum_nights.describe()

count    9779.000000
mean        7.102260
std        21.384159
min         1.000000
25%         1.000000
50%         3.000000
75%         5.000000
max      1250.000000
Name: minimum_nights, dtype: float64

### Question 3  
  
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

In [16]:
def train_linear_regression(Xp, yp):
    """Fit an unregularized linear regression with the normal equation.

    Parameters
    ----------
    Xp : array-like, 2-D
        Feature matrix, one row per observation.
    yp : array-like, 1-D
        Target values, one per row of ``Xp``.

    Returns
    -------
    tuple
        ``(bias, weights)``: the intercept and the per-feature coefficients.
    """
    # Prepend a column of ones so that the first solved weight is the bias term.
    design = np.column_stack([np.ones(Xp.shape[0]), Xp])

    # Normal equation: w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    gram_inverse = np.linalg.inv(gram)
    solution = gram_inverse.dot(design.T).dot(yp)

    bias, weights = solution[0], solution[1:]
    return bias, weights

In [17]:
def rmse(yp, y_predp):
    """Return the root mean squared error between targets and predictions."""
    squared_errors = (yp - y_predp) ** 2
    return np.sqrt(squared_errors.mean())

In [18]:
def prepare_X_fill_with_zero(df, base):
    """Return the feature matrix for the columns in ``base`` with NaNs set to 0.

    Works on copies, so neither ``df`` nor ``base`` is modified.
    """
    selected = base.copy()
    frame = df.copy()
    return frame[selected].fillna(0).values

In [19]:
def prepare_X_fill_with_mean(df, base, mean_dict):
    """Return the feature matrix for ``base`` with NaNs replaced per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    base : list of str
        Names of the feature columns to extract, in output order.
    mean_dict : dict
        Maps a column name to the value used to fill that column's missing
        entries. Columns that are not listed keep their NaNs.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per entry of ``base``.

    Raises
    ------
    KeyError
        If ``mean_dict`` names a column that is not part of ``base``.
    """
    features = list(base)

    # Fail loudly on a misconfigured fill value instead of silently ignoring it.
    unknown = [name for name in mean_dict if name not in features]
    if unknown:
        raise KeyError(f"fill values given for columns not in base: {unknown}")

    # A dict-valued fillna fills each listed column with its own value and
    # returns a new frame. This avoids assigning into a column selection,
    # which triggers pandas' SettingWithCopyWarning.
    df_num = df[features].fillna(value=dict(mean_dict))
    return df_num.values

*Preparing datasets*

In [20]:
# Feature columns fed to the models (everything except the `price` target).
base = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Columns whose missing values are imputed with the training mean.
base_mean = ['reviews_per_month']

In [21]:
def prepare_mean(df, base_mean):
    """Return a dict mapping each column in ``base_mean`` to its mean in ``df``."""
    return {column: df[column].mean() for column in base_mean}

In [22]:
# Fill values are computed from the training part only.
mean_o = prepare_mean(df_train, base_mean)

# One training matrix per imputation strategy.
X_train_zero = prepare_X_fill_with_zero(df_train, base)
X_train_mean = prepare_X_fill_with_mean(df_train, base, mean_o)

*Training the models*

In [23]:
# Fit one unregularized model per imputation strategy so their validation RMSE can be compared.
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train) # model trained on zero-filled features
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train) # model trained on mean-filled features

In [24]:
# Show the fitted bias terms of the two models side by side.
w0_zero, w0_mean

(-419.91265872315813, -423.53930828791135)

*Validation*

In [25]:
# Build the validation matrices with the same imputation each model was trained with.
X_val_zero = prepare_X_fill_with_zero(df_val, base)
X_val_mean = prepare_X_fill_with_mean(df_val, base, mean_o)

# Predict on the validation part.
y_pred_zero = w0_zero + X_val_zero.dot(w_zero)
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)

# Score both strategies against the validation targets.
score_zero = rmse(y_val, y_pred_zero)
score_mean = rmse(y_val, y_pred_mean)

print(w0_zero, w0_mean, score_zero.round(2), score_mean.round(2))

-419.91265872315813 -423.53930828791135 0.64 0.64


### Question 4  
  
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

In [26]:
def train_linear_regression_reg(Xp, yp, r=0.001):
    """Fit a linear regression with the normal equation and a ridge term.

    Parameters
    ----------
    Xp : array-like, 2-D
        Feature matrix, one row per observation.
    yp : array-like, 1-D
        Target values, one per row of ``Xp``.
    r : float, default 0.001
        Amount added to every diagonal entry of ``X^T X`` (bias row included)
        before inversion.

    Returns
    -------
    tuple
        ``(bias, weights)``: the intercept and the per-feature coefficients.
    """
    # Prepend a column of ones so that the first solved weight is the bias term.
    design = np.column_stack([np.ones(Xp.shape[0]), Xp])

    # Regularized normal equation: w = (X^T X + r I)^-1 X^T y
    gram = design.T.dot(design)
    regularized_gram = gram + r * np.eye(gram.shape[0])
    gram_inverse = np.linalg.inv(regularized_gram)
    solution = gram_inverse.dot(design.T).dot(yp)

    bias, weights = solution[0], solution[1:]
    return bias, weights

In [27]:
# The feature matrices do not depend on r, so build them once instead of
# re-deriving them on every iteration of the sweep.
X_train = prepare_X_fill_with_zero(df_train, base) # training feature matrix
X_val = prepare_X_fill_with_zero(df_val, base) # validation feature matrix

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r) # training the model
    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)

    print(r, w0, score)

0 -419.91265872315813 0.643033778856463
1e-06 -419.8627156612891 0.643034115739894
0.0001 -414.97649241546526 0.6430723153709029
0.001 -375.2736526945097 0.6437669735316532
0.01 -191.7838405121791 0.6557528427465624
0.1 -32.56256055279646 0.6773299642454608
1 -3.499216837025976 0.6823116950160037
5 -0.7033623164576596 0.6827915727842253
10 -0.35127676049422046 0.6828430212097085


### Question 5
  
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [28]:
def try_different_seeds(seeds):
    """Return the validation RMSE obtained for each seed in ``seeds``.

    For every seed the notebook-level ``df`` is re-split with
    ``split_dataset``, an unregularized linear regression is fit on the
    zero-filled training features listed in ``base``, and the model is scored
    on the validation part. Scores are returned in the order of ``seeds``.
    """
    def validation_rmse(seed):
        # The test part and its target are not needed for this experiment.
        train_part, val_part, _, target_train, target_val, _ = split_dataset(df, seed)

        features_train = prepare_X_fill_with_zero(train_part, base)
        bias, weights = train_linear_regression(features_train, target_train)

        features_val = prepare_X_fill_with_zero(val_part, base)
        predictions = bias + features_val.dot(weights)

        return rmse(target_val, predictions)

    return [validation_rmse(seed) for seed in seeds]
    

In [29]:
# Validation RMSE for seeds 0 through 9.
rmse_results = try_different_seeds(list(range(10)))

In [30]:
# Spread of the validation RMSE across seeds, rounded to 3 decimals as the question asks.
round(np.std(rmse_results), 3)

0.008

### Question 6
  
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

In [31]:
# NOTE: this re-split with seed 9 overwrites the seed-42 df_train/df_val/df_test
# and y_* variables used by the earlier questions.
df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df, 9)

# Merge train and validation into a single training set with a fresh index.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate([y_train, y_val])

# Regularization strength for the final model. It has its own name so the
# value reported below is the one actually used, not the `r` left over from
# the Question 4 sweep.
r_final = 0.001

X_full_train = prepare_X_fill_with_zero(df_full_train, base) # training feature matrix
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=r_final) # training the model

X_test = prepare_X_fill_with_zero(df_test, base) # test feature matrix
y_pred = w0 + X_test.dot(w)

score = rmse(y_test, y_pred)

print(r_final, w0, score)


10 -389.1162946303787 0.6452771348507003
