In [None]:
!pip install category_encoders --quiet
!pip install shap --quiet 
!pip install catboost --quiet

In [2]:
import pickle

import catboost
import category_encoders
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from catboost import CatBoostRegressor
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline

  import pandas.util.testing as tm


In [None]:
# Record the versions of the modelling libraries this notebook relies on.
# xgboost is neither imported nor used in this notebook, so asking for its
# version would stop a fresh Run All with a NameError; it is not reported.
print(sklearn.__version__)
print(catboost.__version__)
print(category_encoders.__version__)

In [3]:
# Read the modelling table with the `date` column parsed and used as the
# index, then keep only rows with a strictly positive price (zero-price rows
# were missed during cleaning).

df = (
    pd.read_csv(
        '/content/drive/MyDrive/nyc_data/model_df.csv',
        parse_dates=['date'],
        index_col='date',
    )
    .loc[lambda listings: listings['price'] > 0]
    .copy()
)

In [4]:
# adding the days until booking feature to the dataset

date_of_scrape = pd.to_datetime('2021-04-07')

# Whole days between the scrape date and each row's date (the index).
# Subtracting a timestamp from the datetime index yields a TimedeltaIndex,
# whose `.days` attribute gives the integer day count directly. This avoids
# formatting the timedeltas as text and stripping characters, which depends on
# how pandas happens to render a Timedelta as a string.
df['days_until_booking'] = (df.index - date_of_scrape).days

In [5]:
# Model inputs: a small set of features selected for user-friendliness and
# predictive utility.
features = [
    'borough',
    'room_type',
    'accommodates',
    'day_of_week',
    'days_until_booking',
]
target = 'price'

# Chronological split on the date index: rows dated before the threshold are
# used for training, rows on or after it for testing.
date_threshold = '2022-02-02'
mask = df.index < date_threshold

X, y = df[features], df[target]

X_train, X_test = X[mask], X[~mask]
y_train, y_test = y[mask], y[~mask]

In [10]:
# creating a smaller dataset through sampling to train the models
# A fixed random_state makes the 10% sample identical on every run, so the
# fitted models and reported errors are reproducible after a kernel restart.

df_sample = df.sample(frac = 0.1, random_state = 42)

# Same chronological split as the full dataset, applied to the sample.
mask_sample = df_sample.index < date_threshold

X_sample = df_sample[features]
y_sample = df_sample[target]

Xs_train, ys_train = X_sample[mask_sample], y_sample[mask_sample]
Xs_test, ys_test = X_sample[~mask_sample], y_sample[~mask_sample]

In [None]:
# Baseline: mean absolute error of a naive model that predicts the training
# mean price for every row, on the full training set and on the subsample.

y_train_mean = len(y_train) * [y_train.mean()]
ys_train_mean = len(ys_train) * [ys_train.mean()]

print(
    "The mean absolute error of a naive model using only the average is:",
    mean_absolute_error(y_true=y_train, y_pred=y_train_mean),
)
print(
    "The mean absolute error of a naive model on the subsample dataset is:",
    mean_absolute_error(y_true=ys_train, y_pred=ys_train_mean),
)

The mean absolute error of a naive model using only the average is: 100.75608731408619
The mean absolute error of a naive model on the subsample dataset is: 100.2419722786472


In [None]:
# Median (50th-percentile) price model: a CatBoost regressor with a quantile
# loss, fed by an ordinal encoder and an imputer. The 75th- and 25th-percentile
# models are trained below so the user can decide where to price their unit.
# Pipeline.fit returns the fitted pipeline, so it is assigned directly.

model = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    CatBoostRegressor(loss_function='Quantile:alpha=0.5'),
).fit(Xs_train, ys_train)

In [None]:
# 75th-percentile price model: same encoder -> imputer -> CatBoost pipeline,
# with the quantile loss set to alpha=0.75 and fitted on the subsample.

model_cat75 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    CatBoostRegressor(loss_function='Quantile:alpha=0.75'),
).fit(Xs_train, ys_train)

In [None]:
# 25th-percentile price model: same encoder -> imputer -> CatBoost pipeline,
# with the quantile loss set to alpha=0.25 and fitted on the subsample.

model_cat25 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    CatBoostRegressor(loss_function='Quantile:alpha=0.25'),
).fit(Xs_train, ys_train)

In [None]:
# Feature importances of the fitted CatBoost step inside the median pipeline.
# The original cell referenced `model_cat`, which no cell in this notebook
# defines; `model` (the 50th-percentile pipeline) is used here instead.
# NOTE(review): switch to model_cat75 / model_cat25 if one of those was meant.
model.named_steps['catboostregressor'].feature_importances_

In [22]:
# Persist the fitted median-price pipeline with pickle.
# The `with` block closes the file handle once the dump finishes, instead of
# leaving an open handle behind.
cat_boost50 = '/content/drive/MyDrive/nyc_data/catboost_50b.sav'
with open(cat_boost50, "wb") as model_file:
    pickle.dump(model, model_file)

In [23]:
# Persist the fitted 75th-percentile pipeline with pickle.
# The `with` block closes the file handle once the dump finishes, instead of
# leaving an open handle behind.
catboost_75 = '/content/drive/MyDrive/nyc_data/catboost_75b.sav'
with open(catboost_75, "wb") as model_file:
    pickle.dump(model_cat75, model_file)

In [None]:
# Persist the fitted 25th-percentile pipeline with pickle.
# The `with` block closes the file handle once the dump finishes, instead of
# leaving an open handle behind.
catboost_25 = '/content/drive/MyDrive/nyc_data/catboost_25.sav'
with open(catboost_25, "wb") as model_file:
    pickle.dump(model_cat25, model_file)

In [17]:
# A single hand-written listing used to smoke-test the fitted pipelines;
# the keys are the model's feature columns.
test_pred = dict(
    borough="Manhattan",
    room_type="room",
    accommodates=4,
    day_of_week=6,
    days_until_booking=7,
)

In [18]:
# Wrap the example listing in a one-row DataFrame; a dict of scalars needs an
# explicit index for pandas to build the row.
test_pred_df = pd.DataFrame(data=test_pred, index=[0])

In [21]:
# Show the median and 75th-percentile price predictions for the example
# listing, one per line.
print(
    model.predict(test_pred_df),
    model_cat75.predict(test_pred_df),
    sep="\n",
)

[124.72247899]
[189.12321627]
