In [30]:
!pip install category_encoders --quiet
!pip install shap --quiet 
!pip install scikit-garden --quiet
!pip install catboost --quiet

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from skgarden import RandomForestQuantileRegressor

In [22]:
# Load the modelling table with its 'date' column parsed and used as the index,
# then keep only the rows whose price is strictly positive.
df = (
    pd.read_csv(
        '/content/drive/MyDrive/nyc_data/model_df.csv',
        parse_dates=['date'],
        index_col='date',
    )
    .loc[lambda frame: frame['price'] > 0]
    .copy()
)

In [23]:
# reference date the row dates are measured against
date_of_scrape = pd.to_datetime('2021-04-07')

# Whole days between the scrape date and each row's date (the DataFrame index).
# The day count is read directly from the TimedeltaIndex instead of converting the
# timedeltas to text and trimming characters: str.rstrip(' days') removes a *set* of
# characters rather than the suffix, and only works while every value happens to be
# rendered exactly as '<n> days'.
df['days_until_booking'] = (df.index - date_of_scrape).days

In [24]:
# model inputs, chosen for user-friendliness + predictive utility
features = [
    'borough',
    'room_type',
    'accommodates',
    'day_of_week',
    'days_until_booking',
]
target = 'price'

# time-based split: rows dated before the threshold are used for training,
# rows on or after it are held out for testing
date_threshold = '2022-02-02'
mask = df.index < date_threshold

X, y = df[features], df[target]

X_train, X_test = X[mask], X[~mask]
y_train, y_test = y[mask], y[~mask]

In [25]:
# using a 10% random subsample of the rows to train the models;
# random_state is fixed so the same rows are drawn on every run and the
# fitted models / reported errors are reproducible after a kernel restart

df_sample = df.sample(frac = 0.1, random_state = 42)

# same date-based train/test split as the full dataset
mask_sample = df_sample.index < date_threshold

X_sample = df_sample[features]
y_sample = df_sample[target]

Xs_train, ys_train = X_sample[mask_sample], y_sample[mask_sample]
Xs_test, ys_test = X_sample[~mask_sample], y_sample[~mask_sample]

In [None]:
# naive baseline: predict the training mean price for every training row and
# score it with MAE, once on the full training split and once on the subsample
baseline_full = y_train.mean()
baseline_sample = ys_train.mean()

y_train_mean = len(y_train) * [baseline_full]
ys_train_mean = len(ys_train) * [baseline_sample]

mae_full = mean_absolute_error(y_train, y_train_mean)
mae_sample = mean_absolute_error(ys_train, ys_train_mean)

print("The mean absolute error of a naive model using only the average is:", mae_full)
print("The mean absolute error of a naive model on the subsample dataset is:", mae_sample)

In [27]:
# XGBoostRegressor pipeline (OrdinalEncoder -> SimpleImputer -> XGBRegressor)
# trained on the subsampled training split to predict the mean expected price
# given features; fit() returns the fitted pipeline, so it is assigned directly
model = make_pipeline(OrdinalEncoder(), SimpleImputer(), XGBRegressor()).fit(Xs_train, ys_train)



In [None]:
# training a CatBoostRegressor model to predict 90th percentile price given features

model_cat = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    CatBoostRegressor(
        loss_function = 'Quantile:alpha=0.90',
        # silence the per-iteration training log so the cell output stays readable
        verbose = False,
    )
)

model_cat.fit(Xs_train, ys_train);

In [None]:
# training a CatBoostRegressor model to predict 10th percentile price given features

model_cat10 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    CatBoostRegressor(
        loss_function = 'Quantile:alpha=0.10',
        # silence the per-iteration training log so the cell output stays readable
        verbose = False,
    )
)

model_cat10.fit(Xs_train, ys_train);

In [37]:
# feature importances of the fitted 90th-percentile CatBoost estimator
# (presumably one value per entry of `features`, in that order -- verify)
quantile90_estimator = model_cat.named_steps['catboostregressor']
quantile90_estimator.feature_importances_

array([29.52623607, 15.26015654, 52.60043851,  0.97515577,  1.63801311])

In [28]:
# save the fitted XGBoost pipeline; the with-block closes (and flushes) the
# file handle as soon as the dump finishes instead of leaving it open
boost_regressor = '/content/drive/MyDrive/nyc_data/boost_model.sav'
with open(boost_regressor, "wb") as model_file:
    pickle.dump(model, model_file)

In [42]:
# save the fitted 90th-percentile CatBoost pipeline; the with-block closes (and
# flushes) the file handle as soon as the dump finishes instead of leaving it open
catboost_90 = '/content/drive/MyDrive/nyc_data/catboost_90.sav'
with open(catboost_90, "wb") as model_file:
    pickle.dump(model_cat, model_file)

In [45]:
# save the fitted 10th-percentile CatBoost pipeline; the with-block closes (and
# flushes) the file handle as soon as the dump finishes instead of leaving it open
catboost_10 = '/content/drive/MyDrive/nyc_data/catboost_10.sav'
with open(catboost_10, "wb") as model_file:
    pickle.dump(model_cat10, model_file)

In [47]:
# predictions of the 90th-percentile CatBoost pipeline on the subsampled test split
test_pred90 = model_cat.predict(X=Xs_test)

In [51]:
# predictions of the 10th-percentile CatBoost pipeline on the subsampled test split
test_pred10 = model_cat10.predict(X=Xs_test)

In [55]:
# predictions of the XGBoost pipeline on the subsampled test split
test_predmean = model.predict(X=Xs_test)