# Set-ups & Import Data

In [1]:
! pip install kaggle

! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!kaggle datasets download -d dgomonov/new-york-city-airbnb-open-data

Downloading new-york-city-airbnb-open-data.zip to /content
 82% 2.00M/2.44M [00:01<00:00, 1.86MB/s]
100% 2.44M/2.44M [00:01<00:00, 1.91MB/s]


In [3]:
!apt-get install unzip
!unzip new-york-city-airbnb-open-data.zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-25ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.
Archive:  new-york-city-airbnb-open-data.zip
  inflating: AB_NYC_2019.csv         
  inflating: New_York_City_.png      


# Data Preprocessing (standardized)


In [4]:
import numpy as np 
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
matplotlib.rcParams['font.family'] = "Arial"

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
from plotly.subplots import make_subplots

init_notebook_mode(connected=True)

import collections
import itertools

import scipy.stats as stats
from scipy.stats import norm
from scipy.special import boxcox1p

import statsmodels
import statsmodels.api as sm
#print(statsmodels.__version__)

from sklearn.preprocessing import scale, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression, ElasticNet,  HuberRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.utils import resample

from xgboost import XGBRegressor

#Model interpretation modules
#mport eli5
#import lime
#import lime.lime_tabular
#import shap
#shap.initjs()

#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

In [62]:
data = pd.read_csv('/content/AB_NYC_2019.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [63]:
print('Number of features: {}'.format(data.shape[1]))
print('Number of examples: {}'.format(data.shape[0]))

Number of features: 16
Number of examples: 48895


In [64]:
#datetime transformation
data['last_review'] = pd.to_datetime(data['last_review'],infer_datetime_format=True) 

In [65]:
#checking null values
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum())/data.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

Unnamed: 0,Total,Percent
last_review,10052,0.205583
reviews_per_month,10052,0.205583
host_name,21,0.000429
name,16,0.000327
id,0,0.0
host_id,0,0.0
neighbourhood_group,0,0.0
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0


In [66]:
#drop useless columns
data.drop(['host_name','name'], axis=1, inplace=True)
data[data['number_of_reviews']== 0.0].shape

(10052, 14)

In [67]:
#Creating that binary column (new listing : Yes/No)
data["new_listing"] = 0
data["new_listing"][data["last_review"].isnull()] = 1

In [68]:
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

earliest = min(data['last_review'])
data['last_review'] = data['last_review'].fillna(earliest)
data['review_recency'] = (data.last_review.max() - data.last_review).dt.days
#data['last_review'] = data['last_review'].apply(lambda x: x.toordinal() - earliest.toordinal())

In [69]:
#transform numerical values to categorical ones
#because of unbalanced distribution

#data['minimum_nights'] = np.log1p(data['minimum_nights'])
data['all_year_avail'] = data['availability_365']>350
data['low_avail'] = data['availability_365']< 12

#data['reviews_per_month'] = data[data['reviews_per_month'] < 17.5]['reviews_per_month']
#data['no_reviews'] = data['reviews_per_month']==0
#data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

data['months'] = data['number_of_reviews']/data['reviews_per_month']
data['months'] = data['months'].fillna(0)

In [70]:
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum())/data.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

Unnamed: 0,Total,Percent
id,0,0.0
host_id,0,0.0
neighbourhood_group,0,0.0
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0
room_type,0,0.0
price,0,0.0
minimum_nights,0,0.0
number_of_reviews,0,0.0


In [71]:
data = data.drop(['host_id', 'id','last_review'], axis=1)

In [72]:
#one hot encoding room type & neighbourhood groups
data['type'] = 1
data_pivoted = data.pivot(columns='room_type',values='type').fillna(0)
data = data.join(data_pivoted)
data_pivoted = data.pivot(columns='neighbourhood_group',values='type').fillna(0)
data = data.join(data_pivoted)
data = data.drop(columns=['type','room_type','neighbourhood_group'])

#transform categorical neighbourhood into numeric ones
data['neighbourhood_numeric'] = pd.factorize(data['neighbourhood'])[0]
data = data.drop(columns=['neighbourhood'])

In [73]:
data.columns

Index(['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'new_listing', 'review_recency', 'all_year_avail',
       'low_avail', 'months', 'Entire home/apt', 'Private room', 'Shared room',
       'Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island',
       'neighbourhood_numeric'],
      dtype='object')

In [74]:
#one_hot_encoding for categorical features

#print('Categorical features: {}'.format(categorical_features.shape))
#categorical_features_one_hot = pd.get_dummies(categorical_features)
#categorical_features_one_hot.head()


In [75]:
#numerical_features =  data.select_dtypes(exclude=['object'])
#y = numerical_features.price 
#y = np.log(numerical_features.price + 1)
#numerical_features = numerical_features.drop(['price'], axis=1)
#print('Numerical features: {}'.format(numerical_features.shape))

In [76]:
#X = np.concatenate((numerical_features, categorical_features_one_hot), axis=1)
#X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1)
Processed_data = data
Processed_data
Processed_data.to_csv('NYC_Airbnb_Processed.dat')

In [77]:
y = np.log(data.price + 1)
X = data.drop('price', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Dimensions of the training feature matrix: {}'.format(X_train.shape))
print('Dimensions of the training target vector: {}'.format(y_train.shape))
print('Dimensions of the test feature matrix: {}'.format(X_test.shape))
print('Dimensions of the test target vector: {}'.format(y_test.shape))

Dimensions of the training feature matrix: (39116, 21)
Dimensions of the training target vector: (39116,)
Dimensions of the test feature matrix: (9779, 21)
Dimensions of the test target vector: (9779,)


In [None]:
#scaler = RobustScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.fit_transform(X_test)

# XGBoost

In [79]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


In [80]:
n_folds = 5

# squared_loss
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state = 0)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)

def rmse(actual,predicted):
    return(np.sqrt(mean_squared_error(actual, predicted)))

def model_scores(model, cv_model, y_train, y_test, pred_train, pred_test):
    mse = mean_squared_error(y_test, pred_test)
    results = pd.DataFrame({'Model':['{}'.format(type(model).__name__)],
                'CV error': '{:.3f}'.format(cv_model.mean()), 
                'CV std': '{:.3f}'.format(cv_model.std()),
                'RMSE train': [rmse(y_train, pred_train)],
                'RMSE test': [rmse(y_test, pred_test)],
                'R2 train': [r2_score(y_train, pred_train)],
                'R2 test': [r2_score(y_test, pred_test)]})
    results = results.round(decimals=4)

    return results

def model_scores_log(model, cv_model, y_train, y_test, pred_train, pred_test):
    mse = mean_squared_error(y_test, pred_test)
    results = pd.DataFrame({'Model':['{}'.format(type(model).__name__)],
                'CV error': '{:.3f}'.format(np.exp(cv_model.mean())), 
                'CV std': '{:.3f}'.format(np.exp(cv_model.std())),
                'RMSE train': [rmse((np.exp(y_train)-1), (np.exp(pred_train)-1))],
                'RMSE test': [rmse((np.exp(y_test)-1), (np.exp(pred_test)-1))],
                'R2 train': [r2_score((np.exp(y_train)-1), (np.exp(pred_train)-1))],
                'R2 test': [r2_score((np.exp(y_test)-1), (np.exp(pred_test)-1))]})
    results = results.round(decimals=4)

    return results

## baseline model


In [81]:
xbgreg_best = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, min_child_weight=1)
xbgreg_CV_best = -rmse_cv(xbgreg_best)
xbgreg_best.fit(X_train, y_train, early_stopping_rounds=10,eval_set=[(X_test, y_test)], verbose=False) 
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)

xgb_best_results_log = model_scores(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)
xgb_best_results_back = model_scores_log(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)

In [82]:
xgb_best_results_log.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,0.193,0.008,0.4183,0.4403,0.6364,0.606


In [83]:
xgb_best_results_back.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,1.213,1.008,217.1833,215.4464,0.1914,0.1566


## hyperparameter-tuning : GridSearchCV


In [None]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 0.7, 1, 1.5],
        #'subsample': [0.6, 0.8, 1.0],
        #'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 10, 15],
        'n_estimators':range(70,100,10),
        'learning_rate': [0.1, 0.05, 0.01]
        }



In [None]:
estimator = XGBRegressor(n_estimators = 100)

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring = 'r2',
    #n_jobs = 5,
    cv = 5,
    verbose=10
)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
[CV 1/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 1/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.585 total time=   4.7s
[CV 2/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 2/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.599 total time=   3.4s
[CV 3/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 3/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.596 total time=   3.4s
[CV 4/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 4/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.593 total time=   4.8s
[CV 5/5; 1/405] STAR

In [None]:
grid_search.best_params_

{'gamma': 0.7,
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 5,
 'n_estimators': 90}

## best model


In [84]:
xbgreg_best = XGBRegressor(n_estimators=90, learning_rate=0.1, max_depth=10, min_child_weight=5, gamma=0.7)
xbgreg_CV_best = -rmse_cv(xbgreg_best)
xbgreg_best.fit(X_train, y_train, early_stopping_rounds=10,eval_set=[(X_test, y_test)], verbose=False) 
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)


xgb_best_results_log = model_scores(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)
xgb_best_results_back = model_scores_log(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)

xgb_best_results_log.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,0.188,0.008,0.3546,0.4346,0.7387,0.6162


In [85]:
xgb_best_results_back.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,1.207,1.008,193.2525,213.4161,0.3598,0.1725
