In [1]:
! pip install kaggle

! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!kaggle datasets download -d dgomonov/new-york-city-airbnb-open-data

Downloading new-york-city-airbnb-open-data.zip to /content
 82% 2.00M/2.44M [00:01<00:00, 2.22MB/s]
100% 2.44M/2.44M [00:01<00:00, 2.27MB/s]


In [3]:
!apt-get install unzip
!unzip new-york-city-airbnb-open-data.zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-25ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.
Archive:  new-york-city-airbnb-open-data.zip
  inflating: AB_NYC_2019.csv         
  inflating: New_York_City_.png      


# Data Preprocessing


In [4]:
import numpy as np 
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
matplotlib.rcParams['font.family'] = "Arial"

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
from plotly.subplots import make_subplots

init_notebook_mode(connected=True)

import collections
import itertools

import scipy.stats as stats
from scipy.stats import norm
from scipy.special import boxcox1p

import statsmodels
import statsmodels.api as sm
#print(statsmodels.__version__)

from sklearn.preprocessing import scale, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression, ElasticNet,  HuberRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.utils import resample

from xgboost import XGBRegressor

#Model interpretation modules
#mport eli5
#import lime
#import lime.lime_tabular
#import shap
#shap.initjs()

#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

In [5]:
data = pd.read_csv('/content/AB_NYC_2019.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [6]:
print('Number of features: {}'.format(data.shape[1]))
print('Number of examples: {}'.format(data.shape[0]))

Number of features: 16
Number of examples: 48895


In [7]:
#datetime transformation
data['last_review'] = pd.to_datetime(data['last_review'],infer_datetime_format=True) 

In [8]:
#clearing null values
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum())/data.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

Unnamed: 0,Total,Percent
last_review,10052,0.205583
reviews_per_month,10052,0.205583
host_name,21,0.000429
name,16,0.000327
id,0,0.0
host_id,0,0.0
neighbourhood_group,0,0.0
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0


In [9]:
#drop useless columns
data.drop(['host_name','name'], axis=1, inplace=True)
data[data['number_of_reviews']== 0.0].shape

(10052, 14)

In [10]:
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

In [11]:
earliest = min(data['last_review'])
data['last_review'] = data['last_review'].fillna(earliest)
data['last_review'] = data['last_review'].apply(lambda x: x.toordinal() - earliest.toordinal())

In [12]:
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum())/data.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

Unnamed: 0,Total,Percent
id,0,0.0
host_id,0,0.0
neighbourhood_group,0,0.0
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0
room_type,0,0.0
price,0,0.0
minimum_nights,0,0.0
number_of_reviews,0,0.0


In [13]:
data = data.drop(['host_id', 'id'], axis=1)

In [14]:
#transform numerical values to categorical ones
#because of unbalanced distribution

#data['minimum_nights'] = np.log1p(data['minimum_nights'])
data['all_year_avail'] = data['availability_365']>353
data['low_avail'] = data['availability_365']< 12
data['reviews_per_month'] = data[data['reviews_per_month'] < 17.5]['reviews_per_month']
data['no_reviews'] = data['reviews_per_month']==0
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

In [15]:
#one_hot_encoding for categorical features
categorical_features = data.select_dtypes(include=['object'])
print('Categorical features: {}'.format(categorical_features.shape))
categorical_features_one_hot = pd.get_dummies(categorical_features)
categorical_features_one_hot.head()


Categorical features: (48895, 3)


Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,neighbourhood_Allerton,neighbourhood_Arden Heights,neighbourhood_Arrochar,neighbourhood_Arverne,neighbourhood_Astoria,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
numerical_features =  data.select_dtypes(exclude=['object'])
#y = numerical_features.price 
y = np.log(numerical_features.price + 1)
numerical_features = numerical_features.drop(['price'], axis=1)
print('Numerical features: {}'.format(numerical_features.shape))

Numerical features: (48895, 11)


In [17]:
X = np.concatenate((numerical_features, categorical_features_one_hot), axis=1)
X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1)
Processed_data = pd.concat([X_df, y], axis = 1)
Processed_data.to_csv('NYC_Airbnb_Processed.dat')

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Dimensions of the training feature matrix: {}'.format(X_train.shape))
print('Dimensions of the training target vector: {}'.format(y_train.shape))
print('Dimensions of the test feature matrix: {}'.format(X_test.shape))
print('Dimensions of the test target vector: {}'.format(y_test.shape))

Dimensions of the training feature matrix: (39116, 240)
Dimensions of the training target vector: (39116,)
Dimensions of the test feature matrix: (9779, 240)
Dimensions of the test target vector: (9779,)


In [19]:
#scaler = RobustScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.fit_transform(X_test)

# XGBoost

In [20]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


In [21]:
n_folds = 5

# squared_loss
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state = 91).get_n_splits(numerical_features)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)

def rmse_lv_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state = 91).get_n_splits(numerical_features)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)

def rmse(actual,predicted):
    return(np.sqrt(mean_squared_error(actual, predicted)))

def model_scores(model, cv_model, y_train, y_test, pred_train, pred_test):
    mse = mean_squared_error(y_test, pred_test)
    results = pd.DataFrame({'Model':['{}'.format(type(model).__name__)],
                'CV error': '{:.3f}'.format(cv_model.mean()), 
                'CV std': '{:.3f}'.format(cv_model.std()),
                'RMSE train': [rmse(y_train, pred_train)],
                'RMSE test': [rmse(y_test, pred_test)],
                'R2 train': [r2_score(y_train, pred_train)],
                'R2 test': [r2_score(y_test, pred_test)]})
    results = results.round(decimals=4)

    return results

def model_scores_log(model, cv_model, y_train, y_test, pred_train, pred_test):
    mse = mean_squared_error(y_test, pred_test)
    results = pd.DataFrame({'Model':['{}'.format(type(model).__name__)],
                'CV error': '{:.3f}'.format(np.exp(cv_model.mean())), 
                'CV std': '{:.3f}'.format(np.exp(cv_model.std())),
                'RMSE train': [rmse((np.exp(y_train)-1), (np.exp(pred_train)-1))],
                'RMSE test': [rmse((np.exp(y_test)-1), (np.exp(pred_test)-1))],
                'R2 train': [r2_score((np.exp(y_train)-1), (np.exp(pred_train)-1))],
                'R2 test': [r2_score((np.exp(y_test)-1), (np.exp(pred_test)-1))]})
    results = results.round(decimals=4)

    return results

## baseline model


In [22]:
xbgreg_best = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, min_child_weight=1)
xbgreg_CV_best = -rmse_cv(xbgreg_best)
xbgreg_best.fit(X_train, y_train, early_stopping_rounds=10,eval_set=[(X_test, y_test)], verbose=False) 
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)

xgb_best_results_log = model_scores(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)
xgb_best_results_back = model_scores_log(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)

In [23]:
xgb_best_results_log.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,0.194,0.006,0.4198,0.4431,0.6339,0.6009


In [24]:
xgb_best_results_back.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,1.214,1.006,215.9621,216.1649,0.2004,0.151


## hyperparameter-tuning : GridSearchCV


In [26]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 0.7, 1, 1.5],
        #'subsample': [0.6, 0.8, 1.0],
        #'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 10, 15],
        'n_estimators':range(70,100,10),
        'learning_rate': [0.1, 0.05, 0.01]
        }



In [31]:
estimator = XGBRegressor(n_estimators = 100)

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring = 'r2',
    #n_jobs = 5,
    cv = 5,
    verbose=10
)

In [32]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
[CV 1/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 1/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.585 total time=   4.7s
[CV 2/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 2/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.599 total time=   3.4s
[CV 3/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 3/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.596 total time=   3.4s
[CV 4/5; 1/405] START gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70
[CV 4/5; 1/405] END gamma=0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=70;, score=0.593 total time=   4.8s
[CV 5/5; 1/405] STAR

In [34]:
grid_search.best_params_

{'gamma': 0.7,
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 5,
 'n_estimators': 90}

## best model


In [35]:
xbgreg_best = XGBRegressor(n_estimators=90, learning_rate=0.1, max_depth=10, min_child_weight=5, gamma=0.7)
xbgreg_CV_best = -rmse_cv(xbgreg_best)
xbgreg_best.fit(X_train, y_train, early_stopping_rounds=10,eval_set=[(X_test, y_test)], verbose=False) 
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)


xgb_best_results_log = model_scores(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)
xgb_best_results_back = model_scores_log(xbgreg_best, xbgreg_CV_best, y_train, y_test, y_train_xgbreg, y_test_xgbreg)

xgb_best_results_log.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,0.189,0.007,0.3726,0.4354,0.7116,0.6147


In [36]:
xgb_best_results_back.style.hide_index()

Model,CV error,CV std,RMSE train,RMSE test,R2 train,R2 test
XGBRegressor,1.208,1.007,201.4745,214.6623,0.3041,0.1628
