In [208]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [209]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn  as sns

%matplotlib inline

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# TASK
* The goal of this homework is to create a regression model for predicting apartment prices (column 'price').

## EDA
### Import Data

In [210]:
# Load the data: only the target (`price`) and the feature columns used below are read.
columns = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
df = pd.read_csv('../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv', usecols=columns)

# Show the dimensions of the loaded frame. A bare `df.head()` used to sit before
# this line, but a cell only renders its last expression, so it never displayed.
df.shape

In [211]:
# Preview the first ten rows of the loaded data.
df.head(n=10)

In [212]:
# Histogram of the raw target, 50 bins.
sns.histplot(data=df, x='price', bins=50)

As the plot above shows, the `price` variable has a long tail.

# Question 1
* Find a feature with missing values. How many missing values does it have?

In [213]:
# Number and percentage of missing values per column.
# Both are combined into one table because a cell renders only its last
# expression; a stand-alone `df.isnull().sum()` above it would never be shown.
missing_count = df.isnull().sum()
missing = df.isnull().mean() * 100

# Keep only the columns that actually have missing values.
pd.DataFrame({'missing_count': missing_count, 'missing_pct': missing}).loc[missing > 0]

The `reviews_per_month` feature has 10052 missing values.

### Duplicates

In [214]:
# Tally duplicated vs. non-duplicated rows (the author's run found no duplicates).
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

# Question 2
* What's the median (50% percentile) for variable 'minimum_nights'?
## Split the data
* Shuffle the initial dataset, use seed 42.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Make sure that the target value ('price') is not in your dataframe.
* Apply the log transformation to the price variable using the np.log1p() function.

In [215]:
# Median (50th percentile) of the minimum_nights column.
df['minimum_nights'].median()

### Descriptive Statistics

We can also use the `describe` function to determine the median

In [216]:
# Summary statistics with extra percentiles, transposed so each column becomes a row.
# describe() reports percentiles in ascending order regardless of the order given.
percentile_levels = [0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
df.describe(percentiles=percentile_levels).transpose()



As seen above, the median of `minimum_nights` is 3.0.

## Data Visualization

In [217]:
# Draw one histogram per listed column on a single large figure.
fig = plt.figure(figsize=(15, 10), dpi=100, facecolor="white", edgecolor="red")

ax = plt.gca()

df.hist(
    bins=100,
    ax=ax,
    layout=(3, 3),
    column=[
        'price',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
    ],
    color='blue',
)
plt.tight_layout()

# plt.show must be *called*; the bare attribute `plt.show` does nothing except
# echo the function object as the cell output.
plt.show()

This data is not normally distributed; it is quite skewed. Therefore, we apply a log transformation.

In [218]:
# Density histogram of the raw price with a KDE overlay.
# sns.distplot is deprecated; histplot with these arguments is the replacement
# documented by seaborn, and this notebook already uses histplot above.
sns.histplot(df.price, kde=True, stat="density", kde_kws=dict(cut=3))

In [219]:
# Density histogram of log1p(price) with a KDE overlay.
# sns.distplot is deprecated; histplot with these arguments is the replacement
# documented by seaborn, and this notebook already uses histplot above.
sns.histplot(np.log1p(df.price), kde=True, stat="density", kde_kws=dict(cut=3))

It turns into a more normal distribution.

Let us first create a linear regression function that returns the bias term and the weights.

In [220]:
def train_linear_regression(X,y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to X so that the first solved coefficient
    acts as the bias (intercept).

    Returns:
        (bias, weights): the first coefficient and the remaining coefficients.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = np.dot(design.T, design)
    gram_inv = np.linalg.inv(gram)
    coef = np.dot(np.dot(gram_inv, design.T), y)

    bias, weights = coef[0], coef[1:]
    return bias, weights
    

## DataSet Preparation

In [221]:
# Sizes of the validation and test parts (20% each); training gets the remainder.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle the row positions reproducibly with seed 42.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

# Reorder the rows according to the shuffled positions.
df_shuffled = df.iloc[idx]

In [222]:
# Slice the shuffled frame into train / validation / test parts.
# reset_index returns a new frame, so a separate .copy() is not needed.
df_train = df_shuffled.iloc[:n_train].reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train + n_val].reset_index(drop=True)
df_test = df_shuffled.iloc[n_train + n_val:].reset_index(drop=True)

# Raw prices of each part.
y_train_orig = df_train.price.values
y_val_orig = df_val.price.values
y_test_orig = df_test.price.values

# Targets on the log1p scale.
y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

# Display the transformed training target. A `df.head(10)` used to sit before
# this line, but only the last expression of a cell is rendered, so it was dead.
y_train

In [223]:
# Remove the target from every feature frame so it cannot be used as an input.
for split_frame in (df_train, df_val, df_test):
    del split_frame['price']

# Question 3
* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using round(score, 2)
* Which option gives better RMSE?

In [224]:
# Feature columns fed to the model (the loaded columns without the target `price`).
base = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]


def prepare_X(df, fillna_value):
    """Build the feature matrix for `df`.

    Selects the `base` columns, replaces missing entries with `fillna_value`
    and returns the result as a NumPy array. The input frame is not modified.
    """
    return df[base].fillna(fillna_value).values

In [225]:
def rmse(y_pred, y):
    """Root mean squared error between predictions `y_pred` and targets `y`."""
    squared_error = np.square(y_pred - y)
    return np.sqrt(squared_error.mean())

In [226]:
# Mean of reviews_per_month computed on the training part only.
mean = df_train.reviews_per_month.mean()

# Fit the model on training features whose missing values are filled with that mean.
X_mean_train = prepare_X(df_train, fillna_value=mean)
w0_mean, w_mean = train_linear_regression(X_mean_train, y_train)

In [227]:
# Validation features use the same training-set mean for imputation.
X_mean_val = prepare_X(df_val, fillna_value=mean)

# Predictions (log1p scale) = bias + features . weights
y_mean_pred_val = w0_mean + np.dot(X_mean_val, w_mean)
y_mean_pred_val

In [228]:
# Validation RMSE of the mean-imputed model, rounded to two decimals.
rmse_mean_val = rmse(y_mean_pred_val, y_val)
np.round(rmse_mean_val, 2)

Now using 0 so that we can compare

In [229]:
# Second option: fill the missing values with 0 instead of the mean.
fillna_value = 0

X_0_train = prepare_X(df_train, fillna_value=fillna_value)
w0_0, w_0 = train_linear_regression(X_0_train, y_train)

In [230]:
# Validation predictions (log1p scale) of the zero-imputed model.
X_0_val = prepare_X(df_val, fillna_value=0)
y_0_pred_val = w0_0 + np.dot(X_0_val, w_0)



In [231]:
# Validation RMSE of the zero-imputed model, rounded to two decimals.
rmse_zero_val = rmse(y_0_pred_val, y_val)
np.round(rmse_zero_val, 2)

## Answer 3: They are similar.

# Question 4
* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0.
* Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which r gives the best RMSE?

In [232]:
def train_linear_regression(X, y, r=0.0):
    """Fit a regularized linear regression with the normal equation.

    A column of ones is prepended to X so that the first solved coefficient
    acts as the bias. `r` is added to every diagonal entry of X^T X (bias row
    included) before inversion; r=0.0 gives the unregularized fit.

    Returns:
        (bias, weights): the first coefficient and the remaining coefficients.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])
    gram_inv = np.linalg.inv(gram)
    coef = np.dot(np.dot(gram_inv, design.T), y)

    bias, weights = coef[0], coef[1:]
    return bias, weights
    

In [233]:
# Loop over a list of r's to determine the best value.
# The fitted parameters get loop-specific names so that this cell does not
# overwrite `w_0`, which holds the weight vector of the zero-imputed model
# trained earlier; rebinding it to a scalar bias would silently break a re-run
# of the cell that computes y_0_pred_val.
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:

    bias_r, weights_r = train_linear_regression(X_0_train, y_train, r=r)

    y_0_reg_val = bias_r + X_0_val.dot(weights_r)

    score = np.round(rmse(y_0_reg_val, y_val), 2)

    print(f"For {r} value the bias term is {bias_r:.4f} and  rsme is {score}")
    


We can choose to fill the null values with 0 or with the mean of the values.

## Answer 4: r = 0 gives the best RMSE

# Question 5
* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (round(std, 3))
>Note: Standard deviation shows how different the values are. If it's low, then all values are approximately the same. If it's high, the values are different. If standard deviation of scores is low, then our model is stable.

## Seed Checking

In [234]:
rmse_list = []  # unrounded validation RMSE, one entry per seed

for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    # Reshuffle the row positions with the current seed.
    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    # Same 60/20/20 slicing as before; reset_index already returns a new frame.
    df_shuffled = df.iloc[idx]
    df_train = df_shuffled.iloc[:n_train].reset_index(drop=True)
    df_val = df_shuffled.iloc[n_train:n_train + n_val].reset_index(drop=True)
    df_test = df_shuffled.iloc[n_train + n_val:].reset_index(drop=True)

    y_train_orig = df_train.price.values
    y_val_orig = df_val.price.values
    y_test_orig = df_test.price.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)

    del df_train['price']
    del df_val['price']
    del df_test['price']

    # Fill missing values with 0 and fit without regularization.
    # Loop-specific names avoid rebinding `w_0`, which elsewhere in the notebook
    # is a weight vector rather than a bias.
    X_0_train = prepare_X(df_train, fillna_value=0)
    w0_seed, w_seed = train_linear_regression(X_0_train, y_train)

    X_0_val = prepare_X(df_val, fillna_value=0)
    y_0_val_reg = w0_seed + X_0_val.dot(w_seed)

    # Store the full-precision score: rounding each RMSE to 2 decimals before
    # taking the standard deviation would distort a value that is itself
    # reported to 3 decimals. Round only for display.
    rmse_scores = rmse(y_0_val_reg, y_val)
    rmse_list.append(rmse_scores)

    print(s, w0_seed, np.round(rmse_scores, 2))

In [235]:
# Standard deviation of the per-seed RMSE scores, rounded to 3 decimal places.
rmse_std = np.std(rmse_list)
np.round(rmse_std, 3)

##### Answer 5: The standard deviation is 0.008

# Question 6
* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with r=0.001.
* What's the RMSE on the test dataset?

### Seed and Regularization

In [236]:
# Rebuild the split with seed 9, then merge train and validation into one training set.
s = 9

np.random.seed(s)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

train_end = n_train
val_end = n_train + n_val
df_train = df_shuffled.iloc[:train_end].copy()
df_val = df_shuffled.iloc[train_end:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy().reset_index(drop=True)

# Train and validation rows stacked in their shuffled order.
df_full = pd.concat([df_train, df_val]).reset_index(drop=True)

# pop() returns the price column and removes it from the frame in one step.
y_train_val_orig = df_full.pop('price').values
y_test_orig = df_test.pop('price').values

# Targets on the log1p scale.
y_train_val = np.log1p(y_train_val_orig)
y_test = np.log1p(y_test_orig)


In [237]:
# Fit on train + validation (missing values filled with 0) with r = 0.001.
X_0_full = prepare_X(df_full, fillna_value=0)
w_0_full, w_full = train_linear_regression(X_0_full, y_train_val, r=.001)

# Predict on the held-out test part (log1p scale).
X_0_test = prepare_X(df_test, fillna_value=0)
y_0_pred_test = w_0_full + X_0_test.dot(w_full)

# Test RMSE, rounded to two decimals. Arguments follow the rmse(y_pred, y)
# signature, matching every other call in the notebook (the value is the same
# either way because the error is squared).
np.round(rmse(y_0_pred_test, y_test), 2)

In [238]:
# Take one test row (position 5609) as a feature dictionary for a single-listing check.
listing_row = df_test.iloc[5609]
home = listing_row.to_dict()
home

In [239]:
# Wrap the single listing in a one-row DataFrame so prepare_X can process it.
listing_records = [home]
df_small = pd.DataFrame(listing_records)

In [240]:
# Predict the log1p price for the single listing with the final model.
X_small = prepare_X(df_small, fillna_value=0)

# The prediction is a one-element array; keep only its scalar.
y_pred = (w_0_full + np.dot(X_small, w_full))[0]
y_pred

In [241]:
# Undo the log1p transform to get the predicted price on the original scale.
predicted_price = np.expm1(y_pred)
predicted_price

In [242]:
# Actual price of the same listing, recovered from its log1p target.
actual_price = np.expm1(y_test[5609])
actual_price

#### Answer 6: The RMSE on the test dataset is 0.65