In [167]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Sklearn regression model evaluation function
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

def _fixed_nine_decimals(value):
    """Return ``value`` formatted with exactly nine digits after the decimal point."""
    return '%.9f' % value


# Render every float in DataFrame output at fixed nine-decimal precision
# instead of pandas' default formatting.
pd.set_option('display.float_format', _fixed_nine_decimals)

In [168]:
# Load the preprocessed dataset; the path is relative to the notebook's
# working directory. The `clicks` column is used as the target further down.
preprocessed_csv_path = "../data/preprocessed_data.csv"
click_data = pd.read_csv(preprocessed_csv_path)

In [169]:
# Peek at the first five rows to sanity-check the columns that were loaded.
click_data.head(n=5)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,clicks,district_popularity,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
0,3,3,4000.0,103.0,2550.0,46.0,150.0,1.568627451,562,2,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2000.0,58.0,1290.0,23.0,85.0,1.550387597,306,2,...,0,0,0,0,0,0,0,0,0,0
2,2,3,2385.0,73.0,1969.0,26.0,249.0,1.211274759,1468,2,...,0,0,0,0,0,0,0,0,0,0
3,6,2,1200.0,65.0,1350.0,11.0,0.0,0.888888889,0,2,...,0,1,0,0,0,0,0,0,0,0
4,2,2,2000.0,63.0,1950.0,0.0,100.0,1.025641026,0,2,...,0,0,0,0,0,0,0,1,0,0


In [170]:
# Separate the regression target (`clicks`) from the predictor columns.
y = click_data["clicks"]
X = click_data.drop(columns="clicks")

# Min-max rescale every predictor into the [0, 1] range.
# NOTE(review): the scaler is fit on all rows here, before the train/test
# split performed in a later cell, so the held-out rows influence the
# scaling parameters. Consider fitting on the training rows only.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# fit_transform returns a bare array; wrap it back into a DataFrame that
# keeps the original row index and column names.
X = pd.DataFrame(rescaledX, columns=X.columns, index=X.index)

In [171]:
# Preview five random rows of the rescaled features. The draw is seeded so
# the same rows are shown on every Restart & Run All.
X.sample(n=5, random_state=1)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
5840,0.0,0.136363636,0.033164746,0.006072874,0.006322758,0.022222222,0.005,0.098996656,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2696,0.020408163,0.090909091,0.103639831,0.015415758,0.032977523,0.140740741,0.0,0.07181677,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13018,0.0,0.045454545,0.028190034,0.006851448,0.006694685,0.066666667,0.008,0.080434783,0.333333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8602,0.040816327,0.045454545,0.052234475,0.001090003,0.008678296,0.148148148,0.02,0.120652174,0.666666667,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2969,0.020408163,0.090909091,0.082911865,0.004827157,0.014629127,0.133333333,0.006,0.121870883,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
# Hold out 33% of the rows for testing. The split is seeded so that the
# train/test partition, and every metric computed from it in later cells,
# is identical on each run instead of changing with every execution.
test_size = 0.33
split_seed = 1  # same seed value the cross-validation cell uses
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=test_size, random_state=split_seed
)

In [182]:
# Gradient-boosted trees trained with a Poisson objective.
# `learning_rate` is the scikit-learn wrapper's own name for XGBoost's `eta`.
# Passing `eta` instead goes through the wrapper's **kwargs, which XGBoost
# documents as not guaranteed to interact properly with scikit-learn, and
# this estimator is later handed to scikit-learn's cross_val_score.
model_xgb = xgb.XGBRegressor(
    n_estimators=1000,
    objective='count:poisson',
    max_depth=7,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)

In [183]:
# Candidate regressors compared in the cells below. The decision tree is
# seeded because scikit-learn permutes features randomly while searching for
# splits, so an unseeded tree can give different metrics from run to run.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=1),
    model_xgb,
]

In [184]:
# Fit every candidate on the training split and print its MAE on that same
# split. This is the in-sample error, so it is an optimistic figure.
for model in models:
    model_name = type(model).__name__
    model.fit(X_train, Y_train)
    predictions = model.predict(X_train)
    train_mae = mean_absolute_error(Y_train, predictions)
    print(model_name, train_mae)

LinearRegression 407.6013613988309
KNeighborsRegressor 352.8385396369603
DecisionTreeRegressor 0.1675041876046901
XGBRegressor 98.38077366403084


In [185]:
# Score the already-fitted candidates on the held-out split and print each
# model's MAE there.
for model in models:
    model_name = type(model).__name__
    predictions = model.predict(X_test)
    test_mae = mean_absolute_error(Y_test, predictions)
    print(model_name, test_mae)

LinearRegression 244676701183.3598
KNeighborsRegressor 435.6972939217319
DecisionTreeRegressor 489.4034832084374
XGBRegressor 336.73609969999194


### Cross Validation

In [187]:
# 10-fold cross-validation repeated 3 times (30 scores per model), seeded so
# the folds are the same on every run.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# One (model name, mean MAE, std of MAE) tuple per candidate.
model_metrics = []
for model in models:
    score = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
    # The scorer returns the negated MAE, so flip the sign of the mean to
    # report a positive error.
    mean_mae = -round(score.mean(), 2)
    mae_std = round(score.std(), 2)
    model_metrics.append((type(model).__name__, mean_mae, mae_std))
model_metrics

[('LinearRegression', 16592232358.28, 89351903595.2),
 ('KNeighborsRegressor', 427.87, 23.85),
 ('DecisionTreeRegressor', 511.44, 37.61),
 ('XGBRegressor', 338.93, 24.49)]

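The XGBoost regressor has the lowest cross-validated MAE (about 339 clicks, versus about 428 for k-nearest neighbours and about 511 for the decision tree), so it is the best fit among the models compared for this use case. The linear regression's MAE is orders of magnitude larger and highly unstable across folds, so it is not a usable baseline here.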

In [178]:
# Turn the per-model metric tuples into a labelled table, rounded to two
# decimals, and display it.
metric_columns = ["model", "avg_MAE", "std_MAE"]
model_metrics = pd.DataFrame(model_metrics, columns=metric_columns)
model_metrics = model_metrics.round(2)
model_metrics

Unnamed: 0,model,avg_MAE,std_MAE
0,LinearRegression,-16592232358.28,89351903595.19998
1,KNeighborsRegressor,-427.87,23.85
2,DecisionTreeRegressor,-506.23,38.5
3,XGBRegressor,-423.92,23.36


In [188]:
# Put the XGBoost predictions for the held-out rows next to the true click
# counts. Error is actual minus predicted, so a positive value means the
# model under-predicted.
predictions = model_xgb.predict(X_test)
df = X_test.assign(
    Prediction=predictions,
    Actual=Y_test,
    Error=Y_test - predictions,
)
df

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW,Prediction,Actual,Error
987,0.081632653,0.045454545,0.024873559,0.004204298,0.012273589,0.044444444,0.007000000,0.042708734,1.000000000,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,138.300598145,3,-135.300598145
9648,0.000000000,0.090909091,0.041455932,0.006072874,0.029518603,0.348148148,0.017000000,0.031905903,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,117.104507446,212,94.895492554
9257,0.102040816,0.136363636,0.124367797,0.007630022,0.016860689,0.281481481,0.000000000,0.160869565,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,101.610191345,0,-101.610191345
6021,0.102040816,0.090909091,0.243760882,0.008720025,0.022563569,0.103703704,0.000000000,0.241304348,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,15.891445160,0,-15.891445160
5662,0.020408163,0.090909091,0.145095763,0.009965743,0.023431398,0.281481481,0.000000000,0.138680660,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,33.077621460,0,-33.077621460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,0.040816327,0.045454545,0.024873559,0.006072874,0.006322758,0.074074074,0.008000000,0.074247492,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,43.956447601,1766,1722.043552399
7511,0.040816327,0.045454545,0.020727966,0.002491436,0.020567561,0.059259259,0.000000000,0.022355415,0.666666667,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,106.798789978,385,278.201210022
2180,0.040816327,0.090909091,0.124367797,0.006851448,0.018472372,0.051851852,0.015000000,0.148039477,1.000000000,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,1.000000000,0.000000000,0.000000000,156.915054321,715,558.084945679
1337,0.102040816,0.136363636,0.041455932,0.009187169,0.014988656,0.266666667,0.020000000,0.059625487,0.333333333,0.000000000,...,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,509.817901611,533,23.182098389


In [190]:
# Persist the fitted XGBoost model to disk.
# NOTE(review): save_model writes XGBoost's own model format, not a Python
# pickle, so the ".pkl" suffix is misleading. The file should be read back
# with load_model rather than pickle.load. Consider a ".json" or ".ubj"
# suffix once it is confirmed that nothing else depends on this exact path.
xgb_model_path = "../models/click_predictor_xgb.pkl"
model_xgb.save_model(xgb_model_path)