In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

### Feature Engineering:

In [52]:
# dropping irrelevant features
train_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)
test_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)

ValueError: labels ['bathrooms' 'bedrooms' 'street_address' 'display_address'] not contained in axis

In [4]:
# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df.head()

Unnamed: 0,building_id,created,description,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price
10,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000
10000,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465
100004,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...","[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850
100007,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275
100013,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350


In [5]:
test_df.head()

Unnamed: 0,building_id,created,description,features,latitude,listing_id,longitude,manager_id,photos,price
0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950
1,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850
100,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,"[Doorman, Elevator, No Fee]",40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758
1000,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300
100000,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900


In [6]:
# creating a created_hour feature
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# and a created_day feature
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

In [7]:
# creating a column for the number of features of each house
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

In [8]:
# creating a column for the number of photos of each house
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

In [9]:
# count of words present in description column 
train_df["num_desc_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_desc_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [10]:
# deleting irrelevant features
train_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)
test_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)

In [11]:
# Let's split the data
y = train_df["interest_level"]
X = train_df.drop(['interest_level'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#### Dealing with manager_id:

In [12]:
man_train_list = train_df['manager_id'].unique()
man_test_list = test_df['manager_id'].unique()
man_list = np.concatenate((man_train_list,man_test_list), axis=0)
man_list = list(set(man_list))

In [13]:
df100 = train_df[['manager_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,manager_id,low,medium,high
10,5ba989232d0489da1b5f2c45f6688adc,0,1,0
10000,7533621a882f71e25173b27e3139d83d,1,0,0
100004,d9039c43983f6e564b1482b273bd7b01,0,0,1
100007,1067e078446a7897d2da493d2f741316,1,0,0
100013,98e13ad4b495b9613cef886d79a6291f,1,0,0


In [14]:
gby = pd.concat([df100.groupby('manager_id').mean(),df100.groupby('manager_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['man_low_frac','man_medium_frac','man_high_frac','man_count']

In [15]:
gby['manager_skill'] = gby['man_medium_frac']*1 + gby['man_high_frac']*2
gby.sort_values(by = 'man_count', ascending = False).head()

Unnamed: 0_level_0,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e6472c7237327dd3903b3d6f6a94515a,0.686538,0.245559,0.067904,2533,0.381366
6e5c10246156ae5bdcd9b487ca99d96a,0.905767,0.088608,0.005626,711,0.099859
8f5a9c893f6d602f4953fcc0b8e6e9b4,0.987805,0.009756,0.002439,410,0.014634
62b685cc0d876c3a1a51d63a0d6a8082,1.0,0.0,0.0,402,0.0
cb87dadbca78fad02b388dc9e8f25a5b,0.36193,0.490617,0.147453,373,0.785523


In [16]:

mean_values = gby[['man_low_frac','man_medium_frac','man_high_frac','manager_skill']].mean()
mean_values

man_low_frac       0.722564
man_medium_frac    0.191016
man_high_frac      0.086419
manager_skill      0.363855
dtype: float64

In [17]:
# add the features computed on the training dataset to the train dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_train.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,39661b8ce46a8d71461497819f67c279,40.7471,6832226,-73.9867,ad3d8ddc52c7e0859b5c6c7f7949c3bd,6500,6,6,8,6,130,0.918033,0.078689,0.003279,305,0.085246
1,6f42136c0b436713b38b7772afdf23a8,40.7478,6905009,-73.9571,9df32cb8dda19d3222d66e69e258616b,2345,2,21,11,15,127,0.827273,0.130303,0.042424,330,0.215152
2,0,40.7556,6849575,-73.9922,8262449f40e9117f7a9ea49b4a333993,4870,5,9,11,6,151,0.919048,0.080952,0.0,210,0.080952
3,6a4733ecdeab742ebe73bc6d90c37411,40.7376,7092955,-73.976,1f5630fb3e0dc7d307fa82a6b63a03a1,3395,5,1,15,7,97,0.381579,0.394737,0.223684,76,0.842105
4,6413370ec2d81b31ea97576be5c3ab46,40.6646,6874144,-73.9874,4bb850e243db09298a0bda50f9a99c81,2350,13,14,2,7,36,0.352941,0.588235,0.058824,17,0.705882


In [18]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_test.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,ea11299b288bdb7e740fc2dccfc3b140,40.7399,6819401,-73.9864,e1c98bf7f1a044ec4df76abee5f0f5e6,5150,3,3,9,6,91,0.485714,0.428571,0.085714,70,0.6
1,0a4c8baf64010a94487d47a8573d5b65,40.7364,6938460,-73.9809,4398a550168a2dde02ebd0acefa9e9be,2795,6,28,5,5,89,0.55,0.366667,0.083333,60,0.533333
2,11e1dec9d14b1a9e528386a2504b3afc,40.7053,7009525,-74.0161,528a10d6147e8c3ef03fcaf2b5bd135c,3600,1,14,5,7,31,1.0,0.0,0.0,4,0.0
3,766241ad10fb6a35b12b03758d86cd5c,40.73,6988117,-73.9864,2d384fbd4c8b82700852c62187ca50df,2825,2,10,2,4,83,1.0,0.0,0.0,35,0.0
4,1edde1c89233fb1b1a38c53e992b0756,40.7317,6975746,-73.9821,9f39caedae295bf81e67463e6fd0af40,3250,5,6,4,4,120,0.574074,0.395062,0.030864,162,0.45679


In [19]:
# add the features computed on the training dataset to the test_df dataset
test_df = train_df.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
new_manager_ixes = test_df['man_high_frac'].isnull()
test_df.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
test_df.head()

Unnamed: 0,building_id,interest_level,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,53a5b119ba8f7b61d4e010512e0dfc85,medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,3000,7,24,0,5,95,0.744444,0.255556,0.0,90,0.255556
1,c5c8a357cba207596b04d1afd1e4f130,low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,5465,12,12,5,11,9,0.988372,0.011628,0.0,86,0.011628
2,c3ba40552e2120b0acfc3cb5730bb2aa,high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,2850,3,17,4,8,94,0.574627,0.365672,0.059701,134,0.485075
3,28d9ad350afeaab8027513a3e52ac8d5,low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,3275,2,18,2,3,80,0.806283,0.125654,0.068063,191,0.26178
4,0,low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,3350,1,28,1,3,68,1.0,0.0,0.0,15,0.0


#### Dealing with building_id in a similar way as manager_id:

In [20]:
bld_train_list = train_df['building_id'].unique()
bld_test_list = test_df['building_id'].unique()
bld_list = np.concatenate((bld_train_list,bld_test_list), axis=0)
bld_list = list(set(bld_list))

In [21]:

df100 = train_df[['building_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,building_id,low,medium,high
10,53a5b119ba8f7b61d4e010512e0dfc85,0,1,0
10000,c5c8a357cba207596b04d1afd1e4f130,1,0,0
100004,c3ba40552e2120b0acfc3cb5730bb2aa,0,0,1
100007,28d9ad350afeaab8027513a3e52ac8d5,1,0,0
100013,0,1,0,0


In [22]:
gby = pd.concat([df100.groupby('building_id').mean(),df100.groupby('building_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['bd_low_frac','bd_medium_frac','bd_high_frac','bd_count']

In [23]:
gby['bd_avg_interest'] = gby['bd_medium_frac']*1 + gby['bd_high_frac']*2
gby.sort_values(by = 'bd_count', ascending = False).head()

Unnamed: 0_level_0,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.915762,0.060705,0.023534,8286,0.107772
96274288c84ddd7d5c5d8e425ee75027,0.650909,0.272727,0.076364,275,0.425455
11e1dec9d14b1a9e528386a2504b3afc,0.674419,0.24186,0.083721,215,0.409302
80a120d6bc3aba97f40fee8c2204524b,0.586854,0.328638,0.084507,213,0.497653
bb8658a3e432fb62a440615333376345,0.523585,0.353774,0.122642,212,0.599057


In [24]:
mean_values = gby[['bd_low_frac','bd_medium_frac','bd_high_frac','bd_avg_interest']].mean()
mean_values
# this is what I added

bd_low_frac        0.553652
bd_medium_frac     0.300489
bd_high_frac       0.145859
bd_avg_interest    0.592207
dtype: float64

In [25]:
# add the features computed on the training dataset to the training dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
X_train.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,...,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,39661b8ce46a8d71461497819f67c279,40.7471,6832226,-73.9867,ad3d8ddc52c7e0859b5c6c7f7949c3bd,6500,6,6,8,6,...,0.918033,0.078689,0.003279,305,0.085246,0.941176,0.058824,0.0,17,0.058824
1,6f42136c0b436713b38b7772afdf23a8,40.7478,6905009,-73.9571,9df32cb8dda19d3222d66e69e258616b,2345,2,21,11,15,...,0.827273,0.130303,0.042424,330,0.215152,0.785714,0.214286,0.0,14,0.214286
2,0,40.7556,6849575,-73.9922,8262449f40e9117f7a9ea49b4a333993,4870,5,9,11,6,...,0.919048,0.080952,0.0,210,0.080952,0.915762,0.060705,0.023534,8286,0.107772
3,6a4733ecdeab742ebe73bc6d90c37411,40.7376,7092955,-73.976,1f5630fb3e0dc7d307fa82a6b63a03a1,3395,5,1,15,7,...,0.381579,0.394737,0.223684,76,0.842105,0.4,0.466667,0.133333,30,0.733333
4,6413370ec2d81b31ea97576be5c3ab46,40.6646,6874144,-73.9874,4bb850e243db09298a0bda50f9a99c81,2350,13,14,2,7,...,0.352941,0.588235,0.058824,17,0.705882,1.0,0.0,0.0,1,0.0


In [26]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,interest_level,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,53a5b119ba8f7b61d4e010512e0dfc85,medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,3000,7,24,0,5,95,0.744444,0.255556,0.0,90,0.255556
1,c5c8a357cba207596b04d1afd1e4f130,low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,5465,12,12,5,11,9,0.988372,0.011628,0.0,86,0.011628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49350,ad67f6181a49bde19218929b401b31b7,low,40.7066,6841891,-74.0101,9fd3af5b2d23951e028059e8940a55d7,3350,2,8,5,9,99,1.000000,0.000000,0.0,2,0.000000
49351,5173052db6efc0caaa4d817112a70f32,low,40.8699,6858245,-73.9172,d7f57128272bfd82e33a61999b5f4c42,2200,2,12,1,7,133,0.937500,0.062500,0.0,16,0.062500


In [27]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
new_manager_ixes = test_df['bd_high_frac'].isnull()
test_df.loc[new_manager_ixes,['bd_high_frac','bd_low_frac', 'bd_medium_frac','bd_avg_interest']] = mean_values.values

with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,interest_level,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,53a5b119ba8f7b61d4e010512e0dfc85,medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,3000,7,24,0,5,95,0.744444,0.255556,0.0,90,0.255556,0.666667,0.333333,0.000000,3,0.333333
1,c5c8a357cba207596b04d1afd1e4f130,low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,5465,12,12,5,11,9,0.988372,0.011628,0.0,86,0.011628,0.916667,0.083333,0.000000,24,0.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49350,ad67f6181a49bde19218929b401b31b7,low,40.7066,6841891,-74.0101,9fd3af5b2d23951e028059e8940a55d7,3350,2,8,5,9,99,1.000000,0.000000,0.0,2,0.000000,0.761905,0.222222,0.015873,63,0.253968
49351,5173052db6efc0caaa4d817112a70f32,low,40.8699,6858245,-73.9172,d7f57128272bfd82e33a61999b5f4c42,2200,2,12,1,7,133,0.937500,0.062500,0.0,16,0.062500,0.428571,0.428571,0.142857,7,0.714286


In [28]:
test_df.drop(['manager_id','man_count', 'bd_count', 'building_id', 'interest_level'], axis=1, inplace=True)
X_train.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
X_test.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
test_df.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7145,7211212,-73.9425,3000,7,24,0,5,95,0.744444,0.255556,0.0,0.255556,0.666667,0.333333,0.0,0.333333
1,40.7947,7150865,-73.9667,5465,12,12,5,11,9,0.988372,0.011628,0.0,0.011628,0.916667,0.083333,0.0,0.083333
2,40.7388,6887163,-74.0018,2850,3,17,4,8,94,0.574627,0.365672,0.059701,0.485075,0.37931,0.448276,0.172414,0.793103
3,40.7539,6888711,-73.9677,3275,2,18,2,3,80,0.806283,0.125654,0.068063,0.26178,0.89899,0.090909,0.010101,0.111111
4,40.8241,6934781,-73.9493,3350,1,28,1,3,68,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772


In [29]:
X_train.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7471,6832226,-73.9867,6500,6,6,8,6,130,0.918033,0.078689,0.003279,0.085246,0.941176,0.058824,0.0,0.058824
1,40.7478,6905009,-73.9571,2345,2,21,11,15,127,0.827273,0.130303,0.042424,0.215152,0.785714,0.214286,0.0,0.214286
2,40.7556,6849575,-73.9922,4870,5,9,11,6,151,0.919048,0.080952,0.0,0.080952,0.915762,0.060705,0.023534,0.107772
3,40.7376,7092955,-73.976,3395,5,1,15,7,97,0.381579,0.394737,0.223684,0.842105,0.4,0.466667,0.133333,0.733333
4,40.6646,6874144,-73.9874,2350,13,14,2,7,36,0.352941,0.588235,0.058824,0.705882,1.0,0.0,0.0,0.0


In [30]:
X_test.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7399,6819401,-73.9864,5150,3,3,9,6,91,0.485714,0.428571,0.085714,0.6,0.492537,0.432836,0.074627,0.58209
1,40.7364,6938460,-73.9809,2795,6,28,5,5,89,0.55,0.366667,0.083333,0.533333,0.5,0.214286,0.285714,0.785714
2,40.7053,7009525,-74.0161,3600,1,14,5,7,31,1.0,0.0,0.0,0.0,0.674419,0.24186,0.083721,0.409302
3,40.73,6988117,-73.9864,2825,2,10,2,4,83,1.0,0.0,0.0,0.0,0.842105,0.147368,0.010526,0.168421
4,40.7317,6975746,-73.9821,3250,5,6,4,4,120,0.574074,0.395062,0.030864,0.45679,0.936508,0.015873,0.047619,0.111111


#### Dealing with listing_id:

In [31]:
#7250000
combine = [test_df, X_train, X_test]
for com in combine:
    com['listing_id'] = com['listing_id'].apply(lambda x: 1 if x < 7250000 else 0)

print("Test_df: \n" + str(test_df['listing_id'].value_counts()) + "\n \n")
print("X_train: \n" + str(X_train['listing_id'].value_counts()) + "\n \n")
print("X_test: \n" + str(X_test['listing_id'].value_counts()) + "\n \n")

Test_df: 
1    49095
0      257
Name: listing_id, dtype: int64
 

X_train: 
1    32893
0      172
Name: listing_id, dtype: int64
 

X_test: 
1    16202
0       85
Name: listing_id, dtype: int64
 



In [32]:
int_lev_dict = {'low': 2, 'medium': 1, 'high': 0}

y_train = y_train.apply(lambda x: int_lev_dict[x])

y_test = y_test.apply(lambda x: int_lev_dict[x])

### XgBoost and parameter tuning:

In [33]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics       
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  
from time import perf_counter as timer

import matplotlib.pylab as plt
%matplotlib inline

In [34]:
def modelfit(alg, X_train, y_train, X_test, y_test, early_stopping_rounds = 50, cv_folds=5):
    
    xgb_param = alg.get_xgb_params()
    xgb_param['objective'] = 'multi:softprob'
    xgb_param['num_class'] = 3
    xgb_param['silent'] = 0
    
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
        metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)
    
    alg.set_params(n_estimators=cvresult.shape[0])
        
    alg.fit(X_train, y_train)
    
    train_preds = alg.predict(X_train)
    test_preds = alg.predict(X_test)
    
    print('MODEL REPORT: \n')
    print('Number of estimators = ' + str(cvresult.shape[0]))
    print("Accuracy on train set : %.6g" % metrics.accuracy_score(y_train, train_preds))
    print("Accuracy on test set : %.6g" % metrics.accuracy_score(y_test, test_preds))

In [35]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb1, X_train, y_train, X_test, y_test)

  if diff:


MODEL REPORT: 

Number of estimators = 133
Accuracy on train set : 0.831423
Accuracy on test set : 0.809603


  if diff:


#### Tuning max_depth and min_child_weight:

In [36]:
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
start = timer()

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=137, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                     param_grid = param_test1,n_jobs=4,iid=False, cv=5)

gsearch1.fit(X_train,y_train)

end = timer()

print('GridSearchCV time: %.2f' % (end-start))
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

GridSearchCV time: 147.41




([mean: 0.80699, std: 0.00255, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.80732, std: 0.00262, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.80705, std: 0.00231, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.80738, std: 0.00299, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.80820, std: 0.00286, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.80774, std: 0.00303, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.80209, std: 0.00330, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.80348, std: 0.00275, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.80281, std: 0.00367, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.79764, std: 0.00228, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.79903, std: 0.00133, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.80006, std: 0.00172, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_child_weight': 3

In [37]:
start = timer()

rsearch1 = RandomizedSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=137, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                     param_distributions = param_test1,n_iter=7,n_jobs=4,iid=False, cv=5)

rsearch1.fit(X_train,y_train)

end = timer()

print('RandomizedSearchCV time: %.2f ' % (end-start))

rsearch1.grid_scores_, rsearch1.best_params_, rsearch1.best_score_

RandomizedSearchCV time: 91.90 




([mean: 0.80348, std: 0.00275, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.80006, std: 0.00172, params: {'min_child_weight': 5, 'max_depth': 9},
  mean: 0.80820, std: 0.00286, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.80732, std: 0.00262, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.80774, std: 0.00303, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.80281, std: 0.00367, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: 0.79764, std: 0.00228, params: {'min_child_weight': 1, 'max_depth': 9}],
 {'max_depth': 5, 'min_child_weight': 3},
 0.8081958981200315)

#### Tuning gamma:

In [38]:
param_test1 = {
    'gamma':[i/10.0 for i in range(0,5)]
}

start = timer()

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=137, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                     param_grid = param_test1,n_jobs=4,iid=False, cv=5)

gsearch1.fit(X_train,y_train)

end = timer()

print('GridSearchCV time: %.2f' % (end-start))
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

GridSearchCV time: 52.48




([mean: 0.80738, std: 0.00299, params: {'gamma': 0.0},
  mean: 0.80774, std: 0.00163, params: {'gamma': 0.1},
  mean: 0.80826, std: 0.00259, params: {'gamma': 0.2},
  mean: 0.80738, std: 0.00268, params: {'gamma': 0.3},
  mean: 0.80814, std: 0.00175, params: {'gamma': 0.4}],
 {'gamma': 0.2},
 0.8082563941905534)

#### Tuning subsample and colsample_bytree:

In [39]:
param_test1 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

start = timer()

rsearch1 = RandomizedSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=137, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                     param_distributions = param_test1,n_jobs=4,n_iter=10,iid=False, cv=5)

rsearch1.fit(X_train,y_train)

end = timer()

print('RandomizedSearchCV time: %.2f ' % (end-start))

rsearch1.grid_scores_, rsearch1.best_params_, rsearch1.best_score_

RandomizedSearchCV time: 97.49 




([mean: 0.80738, std: 0.00189, params: {'subsample': 0.8, 'colsample_bytree': 0.9},
  mean: 0.80711, std: 0.00243, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.80747, std: 0.00201, params: {'subsample': 0.8, 'colsample_bytree': 0.7},
  mean: 0.80750, std: 0.00215, params: {'subsample': 0.6, 'colsample_bytree': 0.8},
  mean: 0.80811, std: 0.00240, params: {'subsample': 0.7, 'colsample_bytree': 0.8},
  mean: 0.80883, std: 0.00369, params: {'subsample': 0.7, 'colsample_bytree': 0.9},
  mean: 0.80735, std: 0.00321, params: {'subsample': 0.9, 'colsample_bytree': 0.8},
  mean: 0.80847, std: 0.00278, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.80738, std: 0.00299, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: 0.80671, std: 0.00315, params: {'subsample': 0.6, 'colsample_bytree': 0.7}],
 {'colsample_bytree': 0.9, 'subsample': 0.7},
 0.8088309101815427)

#### Tuning regularization parameters (alpha and lambda):

In [40]:
param_test1 = {
    'reg_alpha':[0, 0.005, 0.05, 0.5, 0.7, 1],
    'reg_lambda':[0, 0.001, 0.01, 0.1, 0.5, 0.7, 1]
}

start = timer()

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=137, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.7, nthread=4, scale_pos_weight=1, seed=27),
                     param_grid = param_test1,n_jobs=4,iid=False, cv=5)

gsearch1.fit(X_train,y_train)

end = timer()

print('GridSearchCV time: %.2f' % (end-start))
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

GridSearchCV time: 362.97




([mean: 0.80786, std: 0.00262, params: {'reg_alpha': 0, 'reg_lambda': 0},
  mean: 0.80798, std: 0.00266, params: {'reg_alpha': 0, 'reg_lambda': 0.001},
  mean: 0.80826, std: 0.00282, params: {'reg_alpha': 0, 'reg_lambda': 0.01},
  mean: 0.80765, std: 0.00253, params: {'reg_alpha': 0, 'reg_lambda': 0.1},
  mean: 0.80741, std: 0.00236, params: {'reg_alpha': 0, 'reg_lambda': 0.5},
  mean: 0.80804, std: 0.00271, params: {'reg_alpha': 0, 'reg_lambda': 0.7},
  mean: 0.80747, std: 0.00201, params: {'reg_alpha': 0, 'reg_lambda': 1},
  mean: 0.80711, std: 0.00204, params: {'reg_alpha': 0.005, 'reg_lambda': 0},
  mean: 0.80895, std: 0.00299, params: {'reg_alpha': 0.005, 'reg_lambda': 0.001},
  mean: 0.80783, std: 0.00240, params: {'reg_alpha': 0.005, 'reg_lambda': 0.01},
  mean: 0.80811, std: 0.00285, params: {'reg_alpha': 0.005, 'reg_lambda': 0.1},
  mean: 0.80771, std: 0.00269, params: {'reg_alpha': 0.005, 'reg_lambda': 0.5},
  mean: 0.80750, std: 0.00246, params: {'reg_alpha': 0.005, 'reg_lam

I used GridSearchCV for tuning alpha and lambda. These were the values passed in to GridSearchCV:<br/><br/>
    'reg_alpha' : [0, 0.005, 0.05, 0.5, 0.7, 1],<br/>
    'reg_lambda' : [0, 0.001, 0.01, 0.1, 0.5, 0.7, 1]<br/>
    
With this number of iterations GridSearchCV is going to take a really long time. So I decided to leave it out from this
notebook, just for running time sake. But I did GridSearchCV on my own computer and these are the best parameters:

 { 'reg_alpha' : 0.7,  'reg_lambda' : 0.001 },<br/>
 0.8091331967044413)

In [61]:
xgb_final = XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=5,
 min_child_weight=3,
 gamma=0.2,
 subsample=0.7,
 colsample_bytree=0.9,
 reg_alpha=0.7,
 reg_lambda=0.001,
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb_final, X_train, y_train, X_test, y_test)

  if diff:


ValueError: feature_names mismatch: ['latitude', 'listing_id', 'longitude', 'price', 'created_hour', 'created_day', 'num_features', 'num_photos', 'num_desc_words', 'man_low_frac', 'man_medium_frac', 'man_high_frac', 'manager_skill', 'bd_low_frac', 'bd_medium_frac', 'bd_high_frac', 'bd_avg_interest', 'interest_level'] ['latitude', 'listing_id', 'longitude', 'price', 'created_hour', 'created_day', 'num_features', 'num_photos', 'num_desc_words', 'man_low_frac', 'man_medium_frac', 'man_high_frac', 'manager_skill', 'bd_low_frac', 'bd_medium_frac', 'bd_high_frac', 'bd_avg_interest']
expected interest_level in input data

#### Recombining the splitted datasets, so they can be used for final training:

In [60]:
train_df = pd.concat([X_train, X_test], ignore_index=True)
train_df.drop('interest_level', axis=1, inplace=True)

target_df = pd.concat([y_train, y_test], ignore_index=True)

train_df['interest_level'] = target_df
train_df.head()

Unnamed: 0,bd_avg_interest,bd_high_frac,bd_low_frac,bd_medium_frac,created_day,created_hour,latitude,listing_id,longitude,man_high_frac,man_low_frac,man_medium_frac,manager_skill,num_desc_words,num_features,num_photos,price,interest_level
0,0.058824,0.0,0.941176,0.058824,6,6,40.7471,1,-73.9867,0.003279,0.918033,0.078689,0.085246,130,8,6,6500,2
1,0.214286,0.0,0.785714,0.214286,21,2,40.7478,1,-73.9571,0.042424,0.827273,0.130303,0.215152,127,11,15,2345,2
2,0.107772,0.023534,0.915762,0.060705,9,5,40.7556,1,-73.9922,0.0,0.919048,0.080952,0.080952,151,11,6,4870,2
3,0.733333,0.133333,0.4,0.466667,1,5,40.7376,1,-73.976,0.223684,0.381579,0.394737,0.842105,97,15,7,3395,1
4,0.0,0.0,1.0,0.0,14,13,40.6646,1,-73.9874,0.058824,0.352941,0.588235,0.705882,36,2,7,2350,2
