In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

In [3]:
test_df.shape

(74659, 14)

### Feature Engineering:

In [4]:
test_df_id = test_df['listing_id']

In [5]:
# dropping irrelevant features
train_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address','listing_id'], axis=1, inplace=True)
test_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address','listing_id'], axis=1, inplace=True)

In [6]:
test_df.shape

(74659, 9)

In [7]:
test_df.head()

Unnamed: 0,building_id,created,description,features,latitude,longitude,manager_id,photos,price
0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950
1,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850
100,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,"[Doorman, Elevator, No Fee]",40.7306,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758
1000,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300
100000,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900


In [8]:
# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df.head()

Unnamed: 0,building_id,created,description,features,interest_level,latitude,longitude,manager_id,photos,price
10,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,[],medium,40.7145,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000
10000,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465
100004,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...","[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850
100007,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,"[Hardwood Floors, No Fee]",low,40.7539,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275
100013,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,[Pre-War],low,40.8241,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350


In [9]:
test_df.head()

Unnamed: 0,building_id,created,description,features,latitude,longitude,manager_id,photos,price
0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950
1,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850
100,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,"[Doorman, Elevator, No Fee]",40.7306,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758
1000,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300
100000,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900


In [10]:
# creating a created_hour feature
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# and a created_day feature
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

In [11]:
# creating a column for the number of features of each house
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

In [12]:
# creating a column for the number of photos of each house
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

In [13]:
# count of words present in description column 
train_df["num_desc_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_desc_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [14]:
# deleting irrelevant features
train_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)
test_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)

In [15]:
# Let's split the data
y = train_df["interest_level"]
X = train_df.drop(['interest_level'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

#### Dealing with manager_id:

In [16]:
man_train_list = train_df['manager_id'].unique()
man_test_list = test_df['manager_id'].unique()
man_list = np.concatenate((man_train_list,man_test_list), axis=0)
man_list = list(set(man_list))

In [17]:
df100 = train_df[['manager_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,manager_id,low,medium,high
10,5ba989232d0489da1b5f2c45f6688adc,0,1,0
10000,7533621a882f71e25173b27e3139d83d,1,0,0
100004,d9039c43983f6e564b1482b273bd7b01,0,0,1
100007,1067e078446a7897d2da493d2f741316,1,0,0
100013,98e13ad4b495b9613cef886d79a6291f,1,0,0


In [18]:
test_df.shape

(74659, 10)

In [19]:
gby = pd.concat([df100.groupby('manager_id').mean(),df100.groupby('manager_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['man_low_frac','man_medium_frac','man_high_frac','man_count']

In [20]:
gby['manager_skill'] = gby['man_medium_frac']*1 + gby['man_high_frac']*2
gby.sort_values(by = 'man_count', ascending = False).head()

Unnamed: 0_level_0,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e6472c7237327dd3903b3d6f6a94515a,0.686538,0.245559,0.067904,2533,0.381366
6e5c10246156ae5bdcd9b487ca99d96a,0.905767,0.088608,0.005626,711,0.099859
8f5a9c893f6d602f4953fcc0b8e6e9b4,0.987805,0.009756,0.002439,410,0.014634
62b685cc0d876c3a1a51d63a0d6a8082,1.0,0.0,0.0,402,0.0
cb87dadbca78fad02b388dc9e8f25a5b,0.36193,0.490617,0.147453,373,0.785523


In [21]:

mean_values = gby[['man_low_frac','man_medium_frac','man_high_frac','manager_skill']].mean()
mean_values

man_low_frac       0.722564
man_medium_frac    0.191016
man_high_frac      0.086419
manager_skill      0.363855
dtype: float64

In [22]:
# add the features computed on the training dataset to the train dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_train.head()

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,9fc5b7d391cc9c2e36102e496344e29f,40.7479,-73.9739,e6472c7237327dd3903b3d6f6a94515a,3050,3,28,11,3,92,0.686538,0.245559,0.067904,2533,0.381366
1,7967a1280bf3f7644500fc79d2696b0e,40.7601,-73.99,50a6b80a34192d1f0ab1ef363a7e0a3e,2099,5,24,9,4,117,0.354167,0.520833,0.125,48,0.770833
2,57ef86c28a8ae482dc3a3c3af28e8e48,40.7593,-73.9959,6e5c10246156ae5bdcd9b487ca99d96a,5067,2,15,8,6,114,0.905767,0.088608,0.005626,711,0.099859
3,8b778539c56f5a60023b1305a708a76b,40.7315,-74.0025,95dc9b4f3c3ee921254be49230a9c680,3700,3,7,3,4,24,0.672131,0.245902,0.081967,61,0.409836
4,0,40.7298,-73.9816,94d8d6e86aa95eccf7aeac049f866109,2950,1,22,3,0,64,1.0,0.0,0.0,5,0.0


In [23]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_test.head()

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,f68bf347f99df026f4faad43cc604048,40.7436,-73.9727,09703fead6059562ab26a3522f738f87,3482,2,29,4,6,35,1.0,0.0,0.0,15,0.0
1,7cd1d31bbc9bec35b5ce545e268a1c02,40.7378,-74.0084,7926a6d7110757ff84fd6c692f8761fe,4700,5,17,5,3,89,0.571429,0.357143,0.071429,14,0.5
2,c40e3e74475a91aae1928541be8df678,40.796,-73.9742,e6472c7237327dd3903b3d6f6a94515a,4800,4,22,9,6,145,0.686538,0.245559,0.067904,2533,0.381366
3,26b7ab18fb7c34b2aa6c77a4e5c5a327,40.781,-73.9529,cb87dadbca78fad02b388dc9e8f25a5b,2700,5,15,3,5,112,0.36193,0.490617,0.147453,373,0.785523
4,0,40.732,-73.9745,332f095045d8ec4f442ad943e1a01d2f,3325,5,3,5,0,1,1.0,0.0,0.0,19,0.0


In [24]:
gby.shape

(3481, 5)

In [25]:
test_df.shape

(74659, 10)

In [26]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

new_manager_ixes = test_df['man_high_frac'].isnull()
test_df.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
test_df.head()

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,79780be1514f645d7e6be99a3de696c5,40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.75
1,0,40.7278,-74.0,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.0,0.0,0.0,9.0,0.0
2,3dbbb69fd52e0d25131aa1cd459c87eb,40.7306,-73.989,9ca6f3baa475c37a3b3521a394d65467,3758,4,3,3,6,333,0.0,1.0,0.0,1.0,1.0
3,783d21d013a7e655bddc4ed0d461cc5e,40.7109,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,3300,6,11,10,6,204,0.245902,0.393443,0.360656,61.0,1.114754
4,6134e7c4dd1a98d9aee36623c9872b49,40.765,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,4900,5,12,14,7,174,0.597222,0.319444,0.083333,72.0,0.486111


In [27]:
test_df.shape

(74659, 15)

#### Dealing with building_id in a similar way as manager_id:

In [28]:
bld_train_list = train_df['building_id'].unique()
bld_test_list = test_df['building_id'].unique()
bld_list = np.concatenate((bld_train_list,bld_test_list), axis=0)
bld_list = list(set(bld_list))

In [29]:

df100 = train_df[['building_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,building_id,low,medium,high
10,53a5b119ba8f7b61d4e010512e0dfc85,0,1,0
10000,c5c8a357cba207596b04d1afd1e4f130,1,0,0
100004,c3ba40552e2120b0acfc3cb5730bb2aa,0,0,1
100007,28d9ad350afeaab8027513a3e52ac8d5,1,0,0
100013,0,1,0,0


In [30]:
gby = pd.concat([df100.groupby('building_id').mean(),df100.groupby('building_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['bd_low_frac','bd_medium_frac','bd_high_frac','bd_count']

In [31]:
gby['bd_avg_interest'] = gby['bd_medium_frac']*1 + gby['bd_high_frac']*2
gby.sort_values(by = 'bd_count', ascending = False).head()

Unnamed: 0_level_0,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.915762,0.060705,0.023534,8286,0.107772
96274288c84ddd7d5c5d8e425ee75027,0.650909,0.272727,0.076364,275,0.425455
11e1dec9d14b1a9e528386a2504b3afc,0.674419,0.24186,0.083721,215,0.409302
80a120d6bc3aba97f40fee8c2204524b,0.586854,0.328638,0.084507,213,0.497653
bb8658a3e432fb62a440615333376345,0.523585,0.353774,0.122642,212,0.599057


In [32]:
mean_values = gby[['bd_low_frac','bd_medium_frac','bd_high_frac','bd_avg_interest']].mean()
mean_values

bd_low_frac        0.553652
bd_medium_frac     0.300489
bd_high_frac       0.145859
bd_avg_interest    0.592207
dtype: float64

In [33]:
# add the features computed on the training dataset to the training dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
X_train.head()

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,9fc5b7d391cc9c2e36102e496344e29f,40.7479,-73.9739,e6472c7237327dd3903b3d6f6a94515a,3050,3,28,11,3,92,0.686538,0.245559,0.067904,2533,0.381366,0.659091,0.295455,0.045455,44,0.386364
1,7967a1280bf3f7644500fc79d2696b0e,40.7601,-73.99,50a6b80a34192d1f0ab1ef363a7e0a3e,2099,5,24,9,4,117,0.354167,0.520833,0.125,48,0.770833,0.646617,0.293233,0.06015,133,0.413534
2,57ef86c28a8ae482dc3a3c3af28e8e48,40.7593,-73.9959,6e5c10246156ae5bdcd9b487ca99d96a,5067,2,15,8,6,114,0.905767,0.088608,0.005626,711,0.099859,0.647799,0.308176,0.044025,159,0.396226
3,8b778539c56f5a60023b1305a708a76b,40.7315,-74.0025,95dc9b4f3c3ee921254be49230a9c680,3700,3,7,3,4,24,0.672131,0.245902,0.081967,61,0.409836,0.3,0.6,0.1,10,0.8
4,0,40.7298,-73.9816,94d8d6e86aa95eccf7aeac049f866109,2950,1,22,3,0,64,1.0,0.0,0.0,5,0.0,0.915762,0.060705,0.023534,8286,0.107772


In [34]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,79780be1514f645d7e6be99a3de696c5,40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.750000
1,0,40.7278,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.000000,0.000000,0.000000,9.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,67ab535b820c8292ac59cfcffd8974e3,40.7792,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,15,16,3,3,203,0.684211,0.236842,0.078947,38.0,0.394737
74658,be97e14c554ba6a01d26243ca5eefb82,40.7145,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,2,26,3,2,81,0.888889,0.111111,0.000000,9.0,0.111111


In [35]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
new_manager_ixes = test_df['bd_high_frac'].isnull()
test_df.loc[new_manager_ixes,['bd_high_frac','bd_low_frac', 'bd_medium_frac','bd_avg_interest']] = mean_values.values

with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,latitude,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,79780be1514f645d7e6be99a3de696c5,40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.750000,0.000000,1.000000,0.000000,3.0,1.000000
1,0,40.7278,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.000000,0.000000,0.000000,9.0,0.000000,0.915762,0.060705,0.023534,8286.0,0.107772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,67ab535b820c8292ac59cfcffd8974e3,40.7792,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,15,16,3,3,203,0.684211,0.236842,0.078947,38.0,0.394737,1.000000,0.000000,0.000000,3.0,0.000000
74658,be97e14c554ba6a01d26243ca5eefb82,40.7145,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,2,26,3,2,81,0.888889,0.111111,0.000000,9.0,0.111111,0.800000,0.200000,0.000000,5.0,0.200000


In [36]:
test_df.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
X_train.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
X_test.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
test_df.head()

Unnamed: 0,latitude,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7185,-73.9865,2950,5,11,6,8,78,0.458333,0.333333,0.208333,0.75,0.0,1.0,0.0,1.0
1,40.7278,-74.0,2850,6,24,3,3,35,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772
2,40.7306,-73.989,3758,4,3,3,6,333,0.0,1.0,0.0,1.0,0.333333,0.666667,0.0,0.666667
3,40.7109,-73.9571,3300,6,11,10,6,204,0.245902,0.393443,0.360656,1.114754,0.300489,0.145859,0.553652,0.592207
4,40.765,-73.9845,4900,5,12,14,7,174,0.597222,0.319444,0.083333,0.486111,0.59434,0.349057,0.056604,0.462264


In [37]:
X_train.head()

Unnamed: 0,latitude,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7479,-73.9739,3050,3,28,11,3,92,0.686538,0.245559,0.067904,0.381366,0.659091,0.295455,0.045455,0.386364
1,40.7601,-73.99,2099,5,24,9,4,117,0.354167,0.520833,0.125,0.770833,0.646617,0.293233,0.06015,0.413534
2,40.7593,-73.9959,5067,2,15,8,6,114,0.905767,0.088608,0.005626,0.099859,0.647799,0.308176,0.044025,0.396226
3,40.7315,-74.0025,3700,3,7,3,4,24,0.672131,0.245902,0.081967,0.409836,0.3,0.6,0.1,0.8
4,40.7298,-73.9816,2950,1,22,3,0,64,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772


In [38]:
X_test.head()

Unnamed: 0,latitude,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7436,-73.9727,3482,2,29,4,6,35,1.0,0.0,0.0,0.0,0.60733,0.319372,0.073298,0.465969
1,40.7378,-74.0084,4700,5,17,5,3,89,0.571429,0.357143,0.071429,0.5,0.8,0.2,0.0,0.2
2,40.796,-73.9742,4800,4,22,9,6,145,0.686538,0.245559,0.067904,0.381366,0.648148,0.342593,0.009259,0.361111
3,40.781,-73.9529,2700,5,15,3,5,112,0.36193,0.490617,0.147453,0.785523,0.423077,0.461538,0.115385,0.692308
4,40.732,-73.9745,3325,5,3,5,0,1,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772


In [39]:
int_lev_dict = {'low': 2, 'medium': 1, 'high': 0}

y_train = y_train.apply(lambda x: int_lev_dict[x])

y_test = y_test.apply(lambda x: int_lev_dict[x])

### XgBoost and parameter tuning:
Parameter optimization using GridSearchCV and RandomizedSearchCV was left out from this notebook,
because it takes a very long time. I implemented it in another notebook, which I will provide you with.

In [40]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics       
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  
from time import perf_counter as timer
from sklearn.metrics import log_loss

import matplotlib.pylab as plt
%matplotlib inline

In [41]:
def modelfit(params, X_train, y_train, X_test, y_test, early_stopping_rounds = 20, cv_folds=5):
    
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    
    xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
    
    cvresult = xgb.cv(params, xgtrain, num_boost_round=1000, nfold=cv_folds, metrics='merror',
                      early_stopping_rounds=early_stopping_rounds)
    
    eval_set =[(X_test, y_test)]
    
    model = xgb.train(params,xgtrain,num_boost_round=cvresult.shape[0],
                      evals=[(xgtrain,'train'),(xgtest,'test')], early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
    
    train_preds = model.predict(xgtrain)
    test_preds = model.predict(xgtest)
    
    print('MODEL REPORT: \n')
    print('Optimal number of boosting rounds = ' + str(cvresult.shape[0]))
    print("Log loss on train set : %.6g" % log_loss(y_train, train_preds))
    print("Log loss on test set : %.6g" % log_loss(y_test, test_preds))
    #print("Model evaluation history: \n" + str(cvresult))
    
    return cvresult.shape[0]

In [42]:
params_1 = {
 'learning_rate':0.1,
 'max_depth':5,
 'min_child_weight':1,
 'gamma':0,
 'subsample':0.8,
 'colsample_bytree':0.8,
 'nthread':4,
 'objective':'multi:softprob',
 'num_class':3,
 'scale_pos_weight':1,
 'seed':27}

num_boost_rounds = modelfit(params_1, X_train, y_train, X_test, y_test)

[0]	train-merror:0.196979	test-merror:0.198947
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 20 rounds.
[1]	train-merror:0.195763	test-merror:0.199554
[2]	train-merror:0.195245	test-merror:0.199149
[3]	train-merror:0.195245	test-merror:0.199352
[4]	train-merror:0.195335	test-merror:0.198744
[5]	train-merror:0.195448	test-merror:0.199554
[6]	train-merror:0.194637	test-merror:0.200972
[7]	train-merror:0.194524	test-merror:0.199149
[8]	train-merror:0.194007	test-merror:0.197731
[9]	train-merror:0.194097	test-merror:0.198541
[10]	train-merror:0.192948	test-merror:0.197123
[11]	train-merror:0.193084	test-merror:0.197528
[12]	train-merror:0.192498	test-merror:0.195502
[13]	train-merror:0.192115	test-merror:0.195097
[14]	train-merror:0.192453	test-merror:0.1953
[15]	train-merror:0.192363	test-merror:0.194287
[16]	train-merror:0.192296	test-merror:0.194489
[17]	train-merror:0.192273	test-merror:0.194895
[

##### Running XgBoost in the cell below is going to take between 5 and 10 minutes depending on your machine.

In [43]:
params_2 = {
 'learning_rate':0.01,
 'max_depth':5,
 'min_child_weight':3,
 'gamma':0.2,
 'subsample':0.7,
 'colsample_bytree':0.9,
 'reg_alpha':0.7,
 'reg_lambda':0.001,
 'nthread':4,
 'objective':'multi:softprob',
 'num_class':3,
 'scale_pos_weight':1,
 'max_delta_step':1,
 'seed':27}

num_boost_rounds = modelfit(params_2, X_train, y_train, X_test, y_test)

[0]	train-merror:0.196889	test-merror:0.198744
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 20 rounds.
[1]	train-merror:0.195673	test-merror:0.199554
[2]	train-merror:0.195785	test-merror:0.198947
[3]	train-merror:0.195965	test-merror:0.199959
[4]	train-merror:0.195177	test-merror:0.198947
[5]	train-merror:0.194952	test-merror:0.199554
[6]	train-merror:0.194705	test-merror:0.198541
[7]	train-merror:0.19466	test-merror:0.197731
[8]	train-merror:0.194479	test-merror:0.197528
[9]	train-merror:0.194164	test-merror:0.197731
[10]	train-merror:0.194142	test-merror:0.196515
[11]	train-merror:0.194187	test-merror:0.197528
[12]	train-merror:0.194434	test-merror:0.197528
MODEL REPORT: 

Optimal number of boosting rounds = 13
Log loss on train set : 1.00503
Log loss on test set : 1.00524


### Final training and predicting:

#### Recombining the splitted datasets, so they can be used for final training:

In [44]:
xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)

xgtest = xgb.DMatrix(test_df.values)
    
xgeval = xgb.DMatrix(X_test.values, label=y_test.values)

In [45]:
model = xgb.train(params_1,xgtrain,num_boost_round=num_boost_rounds,
                    evals=[(xgtrain,'train'),(xgeval,'test')], early_stopping_rounds=20, verbose_eval=True)

preds = model.predict(xgtest)

[0]	train-merror:0.196979	test-merror:0.198947
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 20 rounds.
[1]	train-merror:0.195763	test-merror:0.199554
[2]	train-merror:0.195245	test-merror:0.199149
[3]	train-merror:0.195245	test-merror:0.199352
[4]	train-merror:0.195335	test-merror:0.198744
[5]	train-merror:0.195448	test-merror:0.199554
[6]	train-merror:0.194637	test-merror:0.200972
[7]	train-merror:0.194524	test-merror:0.199149
[8]	train-merror:0.194007	test-merror:0.197731
[9]	train-merror:0.194097	test-merror:0.198541
[10]	train-merror:0.192948	test-merror:0.197123
[11]	train-merror:0.193084	test-merror:0.197528
[12]	train-merror:0.192498	test-merror:0.195502


In [46]:

out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]

out_df["listing_id"] = test_df_id.values
out_df = out_df[['listing_id','high','medium','low']]

out_df.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.10941,0.786737,0.103853
1,7210040,0.105862,0.105097,0.789041
2,7103890,0.144745,0.658873,0.196383
3,7143442,0.48588,0.275811,0.238309
4,6860601,0.169763,0.365862,0.464375


In [47]:
out_df.to_csv("xgb_starter2.csv", index=False)