<a href="https://colab.research.google.com/github/KenWuqianghao/ML-Zoomcamp/blob/main/Week_2_Homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

df = pd.read_csv(url)

In [37]:
df = df[['latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']]

In [38]:
df.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [39]:
df['minimum_nights'].median()

3.0

In [40]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

In [41]:
features = ['latitude',
'longitude',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

df_train_0 = df_train[features].fillna(0)
df_train_mean = df_train[features].fillna(df_train['reviews_per_month'].mean())

In [42]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [43]:
w0_0, w_0 = train_linear_regression(df_train_0, y_train)
w0_mean, w_mean = train_linear_regression(df_train_mean, y_train)
y_pred_0 = w0_0 + df_val.dot(w_0)
y_pred_mean = w0_mean + df_val.dot(w_mean)
print(rmse(y_val,y_pred_0))
print(rmse(y_val,y_pred_mean))

0.6400458838304843
0.6416214641032572


In [44]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]
  
def prepare_X(df):
  df_num = df[features]
  df_num = df_num.fillna(0)
  X = df_num.values
  return X

In [45]:
r_list = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

for r in r_list:
  X_train = prepare_X(df_train)
  w0, w = train_linear_regression_reg(X_train, y_train, r=r)

  X_val = prepare_X(df_val)
  y_pred = w0 + X_val.dot(w)
  score = rmse(y_val, y_pred)
  print(score)

0.6643750661225449
0.6643720183403375
0.6640847616777706
0.6625228986983352
0.6699349384607725
0.6897512096294331
0.6942148292190962
0.6945858039785532
0.6945702378860177


In [55]:
scores = []

for i in range (0,10):
  df = df.copy()

  n = len(df)
  n_val = int(n * 0.2)
  n_test = int(n * 0.2)
  n_train = n - n_val - n_test

  df_train = df.iloc[:n_train]
  df_val = df.iloc[n_train:n_train+n_val]
  df_test = df.iloc[n_train+n_val:]

  idx = np.arange(n)
  np.random.seed(i)
  np.random.shuffle(idx)

  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  df_test = df_test.reset_index(drop=True)

  y_train = np.log1p(df_train.price.values)
  y_val = np.log1p(df_val.price.values)
  y_test = np.log1p(df_test.price.values)

  del df_train['price']
  del df_val['price']
  del df_test['price']

  df_train_0 = df_train[features].fillna(0)
  w0_0, w_0 = train_linear_regression(df_train_0, y_train)
  y_pred_0 = w0_0 + df_val.dot(w_0)
  scores.append(rmse(y_val,y_pred_0))

In [57]:
np.std(scores)

0.0

In [58]:
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)
X_full_train = prepare_X(df_full_train)
y_full_train = np.concatenate([y_train, y_val])
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

0.7014439701366145