In [95]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Sklearn regression model evaluation function
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [96]:
# Load the preprocessed listing features; the path is relative to the notebook's directory.
click_data = pd.read_csv(filepath_or_buffer="../data/preprocessed_data.csv")

In [97]:
# Peek at the first five rows to sanity-check the loaded columns.
click_data.head(n=5)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,clicks,district_popularity,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
0,3,3,4000.0,103.0,2550.0,46.0,150.0,1.568627,562,2,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2000.0,58.0,1290.0,23.0,85.0,1.550388,306,2,...,0,0,0,0,0,0,0,0,0,0
2,2,3,2385.0,73.0,1969.0,26.0,249.0,1.211275,1468,2,...,0,0,0,0,0,0,0,0,0,0
3,6,2,1200.0,65.0,1350.0,11.0,0.0,0.888889,0,2,...,0,1,0,0,0,0,0,0,0,0
4,2,2,2000.0,63.0,1950.0,0.0,100.0,1.025641,0,2,...,0,0,0,0,0,0,0,1,0,0


In [98]:
# Regression target: the number of clicks per row.
y = click_data["clicks"]
# Feature matrix: every remaining column.
X = click_data.drop(columns="clicks")

# Min-max scaler mapping each feature to [0, 1]. It is only instantiated here;
# the two commented lines below are what would actually apply it to X.
# NOTE(review): with scaling disabled the models are trained on raw feature
# magnitudes. If scaling is re-enabled, consider fitting the scaler on the
# training split only so the held-out rows do not influence it — confirm intent.
scaler = MinMaxScaler(feature_range=(0, 1))
# rescaledX = scaler.fit_transform(X)

# Rebuild a DataFrame from the scaled array so column names and index survive:
# X = pd.DataFrame(rescaledX, index=X.index, columns=X.columns)

In [99]:
# Show five randomly chosen feature rows (no seed, so the rows differ per run).
X.sample(n=5)

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_POOL,first_pic_category_ROOM_BEDROOM,first_pic_category_STAIRS,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW
4649,3,1,1300.0,30.0,1679.0,16.0,100.0,0.77427,4,0,...,0,1,0,0,0,0,0,0,0,0
11892,3,1,890.0,35.0,890.0,25.0,80.0,1.0,2,0,...,0,0,0,0,0,0,0,0,0,0
7965,1,2,1500.0,65.0,980.0,23.0,90.0,1.530612,3,0,...,0,1,0,0,0,0,0,0,0,0
411,1,1,500.0,35.0,2400.0,8.0,100.0,0.208333,4,0,...,0,0,0,0,0,0,0,0,0,0
1260,3,2,2190.0,82.0,2190.0,6.0,200.0,1.0,4,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Hold out a third of the rows for evaluation.
test_size = 0.33
# Fix the shuffling seed so the split — and every metric computed from it —
# is identical on each Restart & Run All instead of changing between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [101]:
# Baseline regressors compared on the same split.
# The decision tree gets a fixed seed: scikit-learn randomly permutes features
# at each split, so without it repeated fits can produce different trees.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
]

In [102]:
# Fit every baseline and report its mean absolute error on the TRAINING rows
# (in-sample error; the held-out error is reported separately).
for model in models:
    # fit() returns the estimator itself, so the prediction can be chained.
    predictions = model.fit(X_train, Y_train).predict(X_train)
    print(model.__class__.__name__, mean_absolute_error(y_true=Y_train, y_pred=predictions))

LinearRegression 405.1429402187521
KNeighborsRegressor 339.50684032406934
DecisionTreeRegressor 0.717806720678221


In [103]:
# Report each already-fitted baseline's mean absolute error on the held-out rows.
for model in models:
    predictions = model.predict(X_test)
    print(model.__class__.__name__, mean_absolute_error(y_true=Y_test, y_pred=predictions))

LinearRegression 400.41639511236286
KNeighborsRegressor 424.4772273105745
DecisionTreeRegressor 484.6513322231474


In [88]:
# Per-row comparison for the first model in `models`: held-out features plus
# the predicted clicks, the actual clicks, and the signed error (actual - predicted).
# NOTE(review): the saved output of this cell shows features scaled to 0-1,
# which does not match the unscaled X_test displayed elsewhere — it looks like
# the output predates the current state; re-run before relying on it.
predictions = models[0].predict(X_test)
df = X_test.assign(
    Prediction=predictions,
    Actual=Y_test,
    Error=Y_test - predictions,
)
df

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW,Prediction,Actual,Error
5648,0.000000,0.045455,0.015753,0.006851,0.002975,0.000000,0.008,0.080435,0.666667,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,128.250,0,-128.250
12409,0.000000,0.045455,0.041456,0.004204,0.010538,0.111111,0.010,0.081247,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,473.500,149,-324.500
672,0.102041,0.045455,0.062184,0.002024,0.012893,0.074074,0.007,0.102248,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,416.875,377,-39.875
10494,0.102041,0.045455,0.082083,0.002959,0.010538,0.103704,0.015,0.160870,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,525.500,63,-462.500
5017,0.122449,0.045455,0.054224,0.004204,0.014480,0.118519,0.005,0.080435,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,440.500,369,-71.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,0.020408,0.045455,0.037310,0.000934,0.008554,0.081481,0.008,0.087218,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,412.000,2497,2085.000
8366,0.061224,0.136364,0.103640,0.011056,0.019340,0.081481,0.020,0.118286,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,376.125,111,-265.125
1728,0.000000,0.090909,0.082912,0.007941,0.012893,0.088889,0.015,0.136330,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,413.750,413,-0.750
4535,0.040816,0.136364,0.041456,0.009810,0.013141,0.037037,0.012,0.067029,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,270.375,0,-270.375


### XGBoost Model

In [104]:
import xgboost as xgb

In [105]:
# Gradient-boosted tree regressor trained on the same training split as the baselines.
# The step size is passed as `learning_rate`, the parameter the scikit-learn
# wrapper declares, rather than the native alias `eta`: aliases travel through
# **kwargs, which XGBoost does not guarantee to interact properly with
# scikit-learn tooling (cloning, parameter search).
model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.7,        # each tree sees 70% of the training rows
    colsample_bytree=0.8, # each tree sees 80% of the feature columns
)
model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [106]:
# Print the boosted model's mean absolute error twice: first on the training
# rows, then on the held-out rows (same order as the two output lines).
for split_X, split_y in ((X_train, Y_train), (X_test, Y_test)):
    predictions = model.predict(split_X)
    print(type(model).__name__, mean_absolute_error(split_y, predictions))

XGBRegressor 54.10413955669931
XGBRegressor 431.25846182881895


In [107]:
# Summary statistics of the target, for putting the MAE values in context.
click_data["clicks"].describe()

count    14555.000000
mean       378.260941
std        945.425267
min          0.000000
25%          0.000000
50%        103.000000
75%        483.000000
max      38458.000000
Name: clicks, dtype: float64

In [108]:
# Per-row comparison for the boosted model: held-out features plus the
# predicted clicks, the actual clicks, and the signed error (actual - predicted).
predictions = model.predict(X_test)
df = X_test.assign(
    Prediction=predictions,
    Actual=Y_test,
    Error=Y_test - predictions,
)
df

Unnamed: 0,min_booking_duration,rooms,deposit,area,price,number_of_pics,cleaning_fee,deposit_ratio,district_popularity,first_pic_category_BALCONY,...,first_pic_category_STORAGE_PANTRY,first_pic_category_TERRACE,first_pic_category_THREE_D_FLOOR_PLAN,first_pic_category_TWO_D_FLOOR_PLAN,first_pic_category_UNCATEGORIZED,first_pic_category_WALK_IN_CLOSET,first_pic_category_WATER_VIEW,Prediction,Actual,Error
8351,3,2,1000.0,60.0,1050.0,7.0,50.0,0.952381,3,0,...,0,0,0,0,0,0,0,70.317863,0,-70.317863
8473,1,3,1500.0,80.0,1500.0,4.0,150.0,1.000000,3,0,...,0,0,0,0,0,0,0,246.824814,561,314.175186
6213,2,4,1000.0,110.0,1800.0,11.0,100.0,0.555556,3,0,...,0,0,0,0,0,0,0,373.769745,85,-288.769745
546,6,1,950.0,55.0,950.0,9.0,80.0,1.000000,4,0,...,0,0,0,0,0,0,0,-33.591091,0,33.591091
12419,6,2,1000.0,42.0,1000.0,18.0,80.0,1.000000,2,0,...,0,0,0,0,0,0,0,283.142517,0,-283.142517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7678,3,3,1000.0,90.0,2400.0,14.0,150.0,0.416667,3,0,...,0,0,0,0,0,0,0,399.639526,209,-190.639526
12367,9,5,3300.0,101.0,1800.0,12.0,200.0,1.833333,1,0,...,0,0,0,0,0,0,0,384.256165,1367,982.743835
9775,2,1,1100.0,36.0,1100.0,21.0,150.0,1.000000,3,0,...,0,0,0,0,0,0,0,1667.378540,127,-1540.378540
10897,1,2,800.0,25.0,1265.0,20.0,59.0,0.632411,2,0,...,0,0,0,0,1,0,0,407.986298,263,-144.986298
