In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Sklearn regression model evaluation function
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import pickle

def _format_float(x):
    """Render a float in fixed-point notation with nine digits after the decimal point."""
    return '%.9f' % x


# Apply the fixed-point formatter to every float pandas displays in this notebook.
pd.set_option('display.float_format', _format_float)

In [2]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
click_data = pd.read_csv("../data/preprocessed_data.csv")

In [3]:
# Preview the first rows to check that the file loaded with the expected columns.
click_data.head()

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,clicks,district_popularity,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
0,3,3,4000.0,103.0,2550.0,46.0,150.0,1.568627451,562,2,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2000.0,58.0,1290.0,23.0,85.0,1.550387597,306,2,...,0,0,0,0,0,0,0,0,0,0
2,2,3,2385.0,73.0,1969.0,26.0,249.0,1.211274759,1468,2,...,0,0,0,0,0,0,0,0,0,0
3,6,2,1200.0,65.0,1350.0,11.0,0.0,0.888888889,0,2,...,0,1,0,0,0,0,0,0,0,0
4,2,2,2000.0,63.0,1950.0,0.0,100.0,1.025641026,0,2,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Separate the target (`clicks`) from the feature columns.
y = click_data["clicks"]
X = click_data.drop(columns=["clicks"])

# Min-max scale every feature into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the scaling bounds.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Wrap the scaled values in a DataFrame again so the original row index and
# column names stay attached to the data.
X = pd.DataFrame(data=rescaledX, index=X.index, columns=X.columns)

In [22]:
# List the feature names that remain after dropping the `clicks` target.
X.columns

Index(['min_booking_duration', 'rooms', 'deposit', 'area', 'price',
       'number_of_pics', 'cleaning_fee', 'deposit_ratio',
       'district_popularity', 'first_pic_category_BALCONY',
       'first_pic_category_BATHROOM', 'first_pic_category_DETAILS',
       'first_pic_category_DINING_ROOM', 'first_pic_category_EMPTY_ROOM',
       'first_pic_category_ENERGY_CERTIFICATE',
       'first_pic_category_FLOOR_PLAN', 'first_pic_category_GARDEN',
       'first_pic_category_GYM', 'first_pic_category_HALL_CORRIDOR',
       'first_pic_category_KITCHEN', 'first_pic_category_LAUNDRY_ROOM',
       'first_pic_category_LIVING_DINING_ROOM',
       'first_pic_category_LIVING_ROOM', 'first_pic_category_MAP_LOCATION',
       'first_pic_category_MOUNTAIN_VIEW', 'first_pic_category_NON_RELATED',
       'first_pic_category_OFFICE', 'first_pic_category_OUTDOOR_BUILDING',
       'first_pic_category_OUTDOOR_HOUSE', 'first_pic_category_PARKING',
       'first_pic_category_POOL', 'first_pic_category_ROOM_BEDROO

In [7]:
# Serialize the fitted scaler to disk (pickle protocol 3).
with open('../artefacts/scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file, protocol=3)

In [171]:
# Spot-check five random rows of the scaled features; no random_state is passed,
# so the rows shown can differ between runs.
X.sample(5)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
5840,0.0,0.136363636,0.033164746,0.006072874,0.006322758,0.022222222,0.005,0.098996656,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2696,0.020408163,0.090909091,0.103639831,0.015415758,0.032977523,0.140740741,0.0,0.07181677,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13018,0.0,0.045454545,0.028190034,0.006851448,0.006694685,0.066666667,0.008,0.080434783,0.333333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8602,0.040816327,0.045454545,0.052234475,0.001090003,0.008678296,0.148148148,0.02,0.120652174,0.666666667,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2969,0.020408163,0.090909091,0.082911865,0.004827157,0.014629127,0.133333333,0.006,0.121870883,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Hold out 33% of the rows for evaluation.
# A fixed random_state makes the split deterministic, so every metric computed
# from it is reproducible after a kernel restart.
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [9]:
# Hyper-parameters of the gradient-boosted candidate, one per line for readability.
xgb_params = {
    'n_estimators': 1000,           # number of boosting rounds
    'objective': 'count:poisson',   # Poisson regression; the target `clicks` is a count
    'max_depth': 7,                 # maximum depth of each tree
    'eta': 0.1,                     # XGBoost's name for the learning rate
    'subsample': 0.7,               # fraction of rows sampled per tree
    'colsample_bytree': 0.8,        # fraction of columns sampled per tree
}
model_xgb = xgb.XGBRegressor(**xgb_params)

In [10]:
# Candidate regressors compared below.
# The decision tree gets a fixed random_state: scikit-learn permutes the features
# at every split, so equally good splits could otherwise be chosen differently
# from run to run and change the reported error.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    model_xgb,
]

In [11]:
# Fit every candidate on the training split and print its in-sample MAE.
# In-sample error is optimistic, so this is a sanity check rather than a
# basis for choosing a model.
for model in models:
    predictions = model.fit(X_train, Y_train).predict(X_train)
    train_mae = mean_absolute_error(Y_train, predictions)
    print(type(model).__name__, train_mae)

LinearRegression 400.62668251974156
KNeighborsRegressor 346.3657676135781
DecisionTreeRegressor 0.21854168803199672
XGBRegressor 99.57616906789039


In [12]:
# Score the already-fitted candidates on the held-out split and print each MAE.
for model in models:
    predictions = model.predict(X_test)
    test_mae = mean_absolute_error(y_true=Y_test, y_pred=predictions)
    print(model.__class__.__name__, test_mae)

LinearRegression 404.95388608451293
KNeighborsRegressor 425.5183180682764
DecisionTreeRegressor 526.7921523730225
XGBRegressor 340.59309294561575


### Cross Validation

In [13]:
# 10-fold cross-validation repeated 3 times (30 fits per model), seeded so the
# fold assignment is the same on every run.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# One (model name, average MAE, MAE standard deviation) tuple per candidate.
model_metrics = []
for model in models:
    # The scorer returns negated MAE (higher is better), hence the sign flip below.
    score = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
    avg_mae = -1 * round(score.mean(), 2)
    std_mae = round(score.std(), 2)
    model_metrics.append((type(model).__name__, avg_mae, std_mae))
model_metrics

[('LinearRegression', 16592232358.28, 89351903595.2),
 ('KNeighborsRegressor', 427.87, 23.85),
 ('DecisionTreeRegressor', 506.91, 34.05),
 ('XGBRegressor', 338.93, 24.49)]

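The XGBoost regressor achieves the lowest cross-validated MAE (338.93 ± 24.49, versus 427.87 for k-nearest neighbours and 506.91 for the decision tree), so it is the best fit for this use case among the models compared.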

In [14]:
# Turn the list of (name, mean, std) tuples into a table with labelled columns,
# rounded to two decimals for display.
metric_columns = ["model", "avg_MAE", "std_MAE"]
model_metrics = pd.DataFrame(model_metrics, columns=metric_columns)
model_metrics = model_metrics.round(2)
model_metrics

Unnamed: 0,model,avg_MAE,std_MAE
0,LinearRegression,16592232358.28,89351903595.19998
1,KNeighborsRegressor,427.87,23.85
2,DecisionTreeRegressor,506.91,34.05
3,XGBRegressor,338.93,24.49


In [15]:
# Score the held-out rows with the XGBoost model and place prediction, actual
# value and residual (actual minus predicted) next to the scaled features.
predictions = model_xgb.predict(X_test)
df = X_test.assign(
    Prediction=predictions,
    Actual=Y_test,
    Error=Y_test - predictions,
)
# Inspect five random rows of the result.
df.sample(5)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW,Prediction,Actual,Error
5296,0.06122449,0.090909091,0.120222204,0.007318592,0.01624081,0.207407407,0.015,0.160869565,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,458.840270996,294,-164.840270996
6646,0.020408163,0.090909091,0.045601526,0.004827157,0.011901663,0.066666667,0.0,0.080434783,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,326.475189209,0,-326.475189209
8059,0.020408163,0.090909091,0.035237542,0.006072874,0.008802271,0.051851852,0.0,0.080434783,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,351.121765137,323,-28.121765137
13466,0.0,0.045454545,0.029019153,0.00264715,0.016860689,0.140740741,0.0,0.037536232,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,566.778015137,456,-110.778015137
9081,0.0,0.045454545,0.016582373,0.001245718,0.015001054,0.044444444,0.015,0.023832528,0.666666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,350.454193115,64,-286.454193115


In [18]:
# Serialize the fitted XGBoost model to disk (pickle protocol 4).
with open("../models/click_predictor_xgb.pkl", mode='wb') as model_file:
    pickle.dump(model_xgb, model_file, protocol=4)

In [26]:
# Look up the XGBoost prediction for row 9081.
# The row is scored directly from the scaled feature matrix `X` rather than read
# from `df`: `df` only contains the randomly drawn test split, so `df.loc[9081]`
# raises KeyError on any run where that row lands in the training split.
# The double brackets keep the selection a one-row DataFrame, which is the
# input shape `predict` expects.
# NOTE: if row 9081 is part of the training split, this is an in-sample prediction.
float(model_xgb.predict(X.loc[[9081]])[0])

350.4541931152344

In [27]:
# Show the raw (unscaled) record with index label 9081, including its actual `clicks`
# value, for comparison with the model's prediction for the same row.
click_data.loc[9081]

min_booking_duration                       1.000000000
rooms                                      1.000000000
deposit                                  400.000000000
area                                      19.000000000
price                                   1350.000000000
number_of_pics                             6.000000000
cleaning_fee                             150.000000000
deposit_ratio                              0.296296296
clicks                                    64.000000000
district_popularity                        3.000000000
first_pic_category_BALCONY                 0.000000000
first_pic_category_BATHROOM                0.000000000
first_pic_category_DETAILS                 0.000000000
first_pic_category_DINING_ROOM             0.000000000
first_pic_category_EMPTY_ROOM              0.000000000
first_pic_category_ENERGY_CERTIFICATE      0.000000000
first_pic_category_FLOOR_PLAN              0.000000000
first_pic_category_GARDEN                  0.000000000
first_pic_