In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

In [3]:
test_df.shape

(74659, 14)

### Feature Engineering:

In [4]:
# dropping irrelevant features
train_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)
test_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)

In [5]:
test_df.shape

(74659, 10)

In [6]:
test_df.head()

Unnamed: 0,building_id,created,description,features,latitude,listing_id,longitude,manager_id,photos,price
0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950
1,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850
100,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,"[Doorman, Elevator, No Fee]",40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758
1000,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300
100000,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900


In [7]:
# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df.head()

Unnamed: 0,building_id,created,description,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price
10,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000
10000,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465
100004,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...","[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850
100007,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275
100013,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350


In [8]:
test_df.head()

Unnamed: 0,building_id,created,description,features,latitude,listing_id,longitude,manager_id,photos,price
0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950
1,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850
100,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,"[Doorman, Elevator, No Fee]",40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758
1000,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300
100000,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900


In [9]:
# creating a created_hour feature
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# and a created_day feature
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

In [10]:
# creating a column for the number of features of each house
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

In [11]:
# creating a column for the number of photos of each house
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

In [12]:
# count of words present in description column 
train_df["num_desc_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_desc_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [13]:
# deleting irrelevant features
train_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)
test_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)

In [14]:
# Let's split the data
y = train_df["interest_level"]
X = train_df.drop(['interest_level'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#### Dealing with manager_id:

In [15]:
man_train_list = train_df['manager_id'].unique()
man_test_list = test_df['manager_id'].unique()
man_list = np.concatenate((man_train_list,man_test_list), axis=0)
man_list = list(set(man_list))

In [16]:
df100 = train_df[['manager_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,manager_id,low,medium,high
10,5ba989232d0489da1b5f2c45f6688adc,0,1,0
10000,7533621a882f71e25173b27e3139d83d,1,0,0
100004,d9039c43983f6e564b1482b273bd7b01,0,0,1
100007,1067e078446a7897d2da493d2f741316,1,0,0
100013,98e13ad4b495b9613cef886d79a6291f,1,0,0


In [17]:
test_df.shape

(74659, 11)

In [18]:
gby = pd.concat([df100.groupby('manager_id').mean(),df100.groupby('manager_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['man_low_frac','man_medium_frac','man_high_frac','man_count']

In [19]:
gby['manager_skill'] = gby['man_medium_frac']*1 + gby['man_high_frac']*2
gby.sort_values(by = 'man_count', ascending = False).head()

Unnamed: 0_level_0,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e6472c7237327dd3903b3d6f6a94515a,0.686538,0.245559,0.067904,2533,0.381366
6e5c10246156ae5bdcd9b487ca99d96a,0.905767,0.088608,0.005626,711,0.099859
8f5a9c893f6d602f4953fcc0b8e6e9b4,0.987805,0.009756,0.002439,410,0.014634
62b685cc0d876c3a1a51d63a0d6a8082,1.0,0.0,0.0,402,0.0
cb87dadbca78fad02b388dc9e8f25a5b,0.36193,0.490617,0.147453,373,0.785523


In [20]:

mean_values = gby[['man_low_frac','man_medium_frac','man_high_frac','manager_skill']].mean()
mean_values

man_low_frac       0.722564
man_medium_frac    0.191016
man_high_frac      0.086419
manager_skill      0.363855
dtype: float64

In [21]:
# add the features computed on the training dataset to the train dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_train.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,0,40.724,6872356,-74.0035,151f60a5367a01a69260b3542570ace8,12500,4,14,15,8,248,1.0,0.0,0.0,9,0.0
1,a975110cd3606fb8cc37eeba4b3db8dc,40.7519,7201929,-73.9947,ec72d4f9fbf500ceb746683a3444bf87,3275,13,22,8,5,56,0.0,1.0,0.0,1,1.0
2,e9a5e5ae1a5a8d28dd9b1dc44cb2c3bb,40.778,6813996,-73.9563,9d32b720e26a351b951c8f78f72f2fec,2100,2,2,6,3,69,0.890244,0.103659,0.006098,164,0.115854
3,ba6bef9d045edcdc04bce07cd86f125b,40.7796,7006399,-73.9493,c1a6598437b7db560cde66e5a297a53f,2200,4,13,4,5,70,0.511628,0.302326,0.186047,86,0.674419
4,9c926bb8e5354ae22f3ebaaa3d1ee282,40.7337,6996929,-73.9897,be563466c0c0a5b295db3822c1c5e289,3725,6,11,7,4,179,0.65,0.3,0.05,60,0.4


In [22]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_test.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,c02b292660a08fb0f1a3d49af7a94341,40.7036,6913796,-73.8631,e6472c7237327dd3903b3d6f6a94515a,2699,15,22,1,8,32,0.686538,0.245559,0.067904,2533,0.381366
1,300c5229c48b49cd3c8cf80fe61b853b,40.7356,7234346,-73.9828,25d4ea3f8ec14332bcf177e416c6747d,3850,17,29,8,4,25,0.75,0.25,0.0,4,0.25
2,91f3b911fe7b76d6089b6ef04b9bf23e,40.6941,7019835,-73.8621,860769c49aadbc9dda267bc9a80d2eda,1700,5,16,6,9,46,0.552632,0.289474,0.157895,38,0.605263
3,1848e582bc4599c1048fe99e3b250b94,40.7611,7069646,-73.9504,0e256ff13bc97e3cf7557f51870bb213,2450,2,26,5,7,106,0.25,0.25,0.5,12,1.25
4,0,40.7645,7096672,-73.984,0f57223e3bbd5222d9881a442aba0d03,4100,3,2,7,6,92,1.0,0.0,0.0,85,0.0


In [23]:
gby.shape

(3481, 5)

In [24]:
test_df.shape

(74659, 11)

In [25]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

new_manager_ixes = test_df['man_high_frac'].isnull()
test_df.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
test_df.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,79780be1514f645d7e6be99a3de696c5,40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.75
1,0,40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.0,0.0,0.0,9.0,0.0
2,3dbbb69fd52e0d25131aa1cd459c87eb,40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,3758,4,3,3,6,333,0.0,1.0,0.0,1.0,1.0
3,783d21d013a7e655bddc4ed0d461cc5e,40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,3300,6,11,10,6,204,0.245902,0.393443,0.360656,61.0,1.114754
4,6134e7c4dd1a98d9aee36623c9872b49,40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,4900,5,12,14,7,174,0.597222,0.319444,0.083333,72.0,0.486111


In [26]:
test_df.shape

(74659, 16)

#### Dealing with building_id in a similar way as manager_id:

In [27]:
bld_train_list = train_df['building_id'].unique()
bld_test_list = test_df['building_id'].unique()
bld_list = np.concatenate((bld_train_list,bld_test_list), axis=0)
bld_list = list(set(bld_list))

In [28]:

df100 = train_df[['building_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,building_id,low,medium,high
10,53a5b119ba8f7b61d4e010512e0dfc85,0,1,0
10000,c5c8a357cba207596b04d1afd1e4f130,1,0,0
100004,c3ba40552e2120b0acfc3cb5730bb2aa,0,0,1
100007,28d9ad350afeaab8027513a3e52ac8d5,1,0,0
100013,0,1,0,0


In [29]:
gby = pd.concat([df100.groupby('building_id').mean(),df100.groupby('building_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['bd_low_frac','bd_medium_frac','bd_high_frac','bd_count']

In [30]:
gby['bd_avg_interest'] = gby['bd_medium_frac']*1 + gby['bd_high_frac']*2
gby.sort_values(by = 'bd_count', ascending = False).head()

Unnamed: 0_level_0,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.915762,0.060705,0.023534,8286,0.107772
96274288c84ddd7d5c5d8e425ee75027,0.650909,0.272727,0.076364,275,0.425455
11e1dec9d14b1a9e528386a2504b3afc,0.674419,0.24186,0.083721,215,0.409302
80a120d6bc3aba97f40fee8c2204524b,0.586854,0.328638,0.084507,213,0.497653
bb8658a3e432fb62a440615333376345,0.523585,0.353774,0.122642,212,0.599057


In [31]:
mean_values = gby[['bd_low_frac','bd_medium_frac','bd_high_frac','bd_avg_interest']].mean()
mean_values

bd_low_frac        0.553652
bd_medium_frac     0.300489
bd_high_frac       0.145859
bd_avg_interest    0.592207
dtype: float64

In [32]:
# add the features computed on the training dataset to the training dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
X_train.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,...,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,0,40.724,6872356,-74.0035,151f60a5367a01a69260b3542570ace8,12500,4,14,15,8,...,1.0,0.0,0.0,9,0.0,0.915762,0.060705,0.023534,8286,0.107772
1,a975110cd3606fb8cc37eeba4b3db8dc,40.7519,7201929,-73.9947,ec72d4f9fbf500ceb746683a3444bf87,3275,13,22,8,5,...,0.0,1.0,0.0,1,1.0,0.0,1.0,0.0,1,1.0
2,e9a5e5ae1a5a8d28dd9b1dc44cb2c3bb,40.778,6813996,-73.9563,9d32b720e26a351b951c8f78f72f2fec,2100,2,2,6,3,...,0.890244,0.103659,0.006098,164,0.115854,0.411765,0.470588,0.117647,17,0.705882
3,ba6bef9d045edcdc04bce07cd86f125b,40.7796,7006399,-73.9493,c1a6598437b7db560cde66e5a297a53f,2200,4,13,4,5,...,0.511628,0.302326,0.186047,86,0.674419,0.5,0.375,0.125,8,0.625
4,9c926bb8e5354ae22f3ebaaa3d1ee282,40.7337,6996929,-73.9897,be563466c0c0a5b295db3822c1c5e289,3725,6,11,7,4,...,0.65,0.3,0.05,60,0.4,0.768293,0.170732,0.060976,82,0.292683


In [33]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,79780be1514f645d7e6be99a3de696c5,40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.750000
1,0,40.7278,7210040,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.000000,0.000000,0.000000,9.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,67ab535b820c8292ac59cfcffd8974e3,40.7792,6884758,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,15,16,3,3,203,0.684211,0.236842,0.078947,38.0,0.394737
74658,be97e14c554ba6a01d26243ca5eefb82,40.7145,6924212,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,2,26,3,2,81,0.888889,0.111111,0.000000,9.0,0.111111


In [34]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
new_manager_ixes = test_df['bd_high_frac'].isnull()
test_df.loc[new_manager_ixes,['bd_high_frac','bd_low_frac', 'bd_medium_frac','bd_avg_interest']] = mean_values.values

with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,79780be1514f645d7e6be99a3de696c5,40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.458333,0.333333,0.208333,24.0,0.750000,0.000000,1.000000,0.000000,3.0,1.000000
1,0,40.7278,7210040,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,1.000000,0.000000,0.000000,9.0,0.000000,0.915762,0.060705,0.023534,8286.0,0.107772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,67ab535b820c8292ac59cfcffd8974e3,40.7792,6884758,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,15,16,3,3,203,0.684211,0.236842,0.078947,38.0,0.394737,1.000000,0.000000,0.000000,3.0,0.000000
74658,be97e14c554ba6a01d26243ca5eefb82,40.7145,6924212,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,2,26,3,2,81,0.888889,0.111111,0.000000,9.0,0.111111,0.800000,0.200000,0.000000,5.0,0.200000


In [35]:
test_df.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
X_train.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
X_test.drop(['manager_id','man_count', 'bd_count', 'building_id'], axis=1, inplace=True)
test_df.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7185,7142618,-73.9865,2950,5,11,6,8,78,0.458333,0.333333,0.208333,0.75,0.0,1.0,0.0,1.0
1,40.7278,7210040,-74.0,2850,6,24,3,3,35,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772
2,40.7306,7103890,-73.989,3758,4,3,3,6,333,0.0,1.0,0.0,1.0,0.333333,0.666667,0.0,0.666667
3,40.7109,7143442,-73.9571,3300,6,11,10,6,204,0.245902,0.393443,0.360656,1.114754,0.300489,0.145859,0.553652,0.592207
4,40.765,6860601,-73.9845,4900,5,12,14,7,174,0.597222,0.319444,0.083333,0.486111,0.59434,0.349057,0.056604,0.462264


In [36]:
X_train.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.724,6872356,-74.0035,12500,4,14,15,8,248,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772
1,40.7519,7201929,-73.9947,3275,13,22,8,5,56,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,40.778,6813996,-73.9563,2100,2,2,6,3,69,0.890244,0.103659,0.006098,0.115854,0.411765,0.470588,0.117647,0.705882
3,40.7796,7006399,-73.9493,2200,4,13,4,5,70,0.511628,0.302326,0.186047,0.674419,0.5,0.375,0.125,0.625
4,40.7337,6996929,-73.9897,3725,6,11,7,4,179,0.65,0.3,0.05,0.4,0.768293,0.170732,0.060976,0.292683


In [37]:
X_test.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.7036,6913796,-73.8631,2699,15,22,1,8,32,0.686538,0.245559,0.067904,0.381366,1.0,0.0,0.0,0.0
1,40.7356,7234346,-73.9828,3850,17,29,8,4,25,0.75,0.25,0.0,0.25,0.5,0.5,0.0,0.5
2,40.6941,7019835,-73.8621,1700,5,16,6,9,46,0.552632,0.289474,0.157895,0.605263,0.666667,0.333333,0.0,0.333333
3,40.7611,7069646,-73.9504,2450,2,26,5,7,106,0.25,0.25,0.5,1.25,0.470588,0.294118,0.235294,0.764706
4,40.7645,7096672,-73.984,4100,3,2,7,6,92,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772


In [38]:
int_lev_dict = {'low': 2, 'medium': 1, 'high': 0}

y_train = y_train.apply(lambda x: int_lev_dict[x])

y_test = y_test.apply(lambda x: int_lev_dict[x])

### XgBoost and parameter tuning:
Parameter optimization using GridSearchCV and RandomizedSearchCV was left out from this notebook,
because it takes a very long time. I implemented it in another notebook, which I will provide you with.

In [39]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics       
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  
from time import perf_counter as timer

import matplotlib.pylab as plt
%matplotlib inline

In [40]:
def modelfit(alg, X_train, y_train, X_test, y_test, early_stopping_rounds = 50, cv_folds=5):
    
    xgb_param = alg.get_xgb_params()
    xgb_param['objective'] = 'multi:softprob'
    xgb_param['num_class'] = 3
    xgb_param['silent'] = 0
    
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
        metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)
    
    alg.set_params(n_estimators=cvresult.shape[0])
        
    alg.fit(X_train, y_train)
    
    train_preds = alg.predict(X_train)
    test_preds = alg.predict(X_test)
    
    print('MODEL REPORT: \n')
    print('Number of estimators = ' + str(cvresult.shape[0]))
    print("Accuracy on train set : %.6g" % metrics.accuracy_score(y_train, train_preds))
    print("Accuracy on test set : %.6g" % metrics.accuracy_score(y_test, test_preds))

In [41]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb1, X_train, y_train, X_test, y_test)

  if diff:


MODEL REPORT: 

Number of estimators = 137
Accuracy on train set : 0.834901
Accuracy on test set : 0.806164


  if diff:


##### Running XgBoost in the cell below is going to take between 5 and 10 minutes depending on your machine.

In [42]:
xgb_final = XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=5,
 min_child_weight=3,
 gamma=0.2,
 subsample=0.7,
 colsample_bytree=0.9,
 reg_alpha=0.7,
 reg_lambda=0.001,
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb_final, X_train, y_train, X_test, y_test)

  if diff:


MODEL REPORT: 

Number of estimators = 1267
Accuracy on train set : 0.834387
Accuracy on test set : 0.808866


  if diff:


In [43]:
params ={
 'eta':0.01,
 'max_depth':5,
 'n_estimators':5000,
 'min_child_weight':3,
 'gamma':0.2,
 'nthread':4,
 'subsample':0.7,
 'colsample_bytree':0.9,
 'reg_alpha':0.7,
 'reg_lambda':0.001,
 'scale_pos_weight':1,
 'objective':'multi:softprob',
 'num_class':3,
 'seed':27}

### Final training and predicting:

#### Recombining the splitted datasets, so they can be used for final training:

In [44]:
test_df.shape

(74659, 17)

In [45]:
train_df = pd.concat([X_train, X_test], ignore_index=True)

target_df = pd.concat([y_train, y_test], ignore_index=True)

train_df.head()

Unnamed: 0,latitude,listing_id,longitude,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_low_frac,man_medium_frac,man_high_frac,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_avg_interest
0,40.724,6872356,-74.0035,12500,4,14,15,8,248,1.0,0.0,0.0,0.0,0.915762,0.060705,0.023534,0.107772
1,40.7519,7201929,-73.9947,3275,13,22,8,5,56,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,40.778,6813996,-73.9563,2100,2,2,6,3,69,0.890244,0.103659,0.006098,0.115854,0.411765,0.470588,0.117647,0.705882
3,40.7796,7006399,-73.9493,2200,4,13,4,5,70,0.511628,0.302326,0.186047,0.674419,0.5,0.375,0.125,0.625
4,40.7337,6996929,-73.9897,3725,6,11,7,4,179,0.65,0.3,0.05,0.4,0.768293,0.170732,0.060976,0.292683


In [48]:

out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]

out_df["listing_id"] = test_list_id.values
out_df = out_df[['listing_id','high','medium','low']]

out_df.head()
out_df.shape

(74659, 4)

In [49]:
out_df.to_csv("xgb_starter2.csv", index=False)