In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

### Feature Engineering:

In [3]:
# dropping irrelevant features
train_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)
test_df.drop(['bathrooms', 'bedrooms', 'street_address', 'display_address'], axis=1, inplace=True)

In [4]:
# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df.head()

Unnamed: 0,building_id,created,description,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price
10,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000
10000,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465
100004,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...","[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850
100007,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275
100013,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350


In [5]:
# creating a created_hour feature
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# and a created_day feature
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

In [6]:
# creating a column for the number of features of each house
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

In [7]:
# creating a column for the number of photos of each house
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

In [8]:
# count of words present in description column 
train_df["num_desc_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_desc_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [9]:
# deleting irrelevant features
train_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)
test_df.drop(['photos', 'features', 'created', 'description'], axis=1, inplace=True)

In [10]:
# Let's split the data
y = train_df["interest_level"]
X = train_df.drop(['interest_level'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#### Dealing with manager_id:

In [11]:
temp = pd.concat([X_train['manager_id'],pd.get_dummies(y_train)], axis = 1).groupby('manager_id').mean()
temp.columns = ['man_high_frac','man_low_frac', 'man_medium_frac']
temp['man_count'] = X_train.groupby('manager_id').count().iloc[:,1]
temp.tail(10)

Unnamed: 0_level_0,man_high_frac,man_low_frac,man_medium_frac,man_count
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ff50149b1ec6a10fbb386d761c7a4bc5,0.0,0.0,1.0,1
ff52574a198d8a349f41d57bd374e8e9,0.666667,0.166667,0.166667,6
ff564542a40d3e9b970a494df91b4cee,0.0,1.0,0.0,1
ff5a3509ebce238dfa4857839ddd24ce,0.0,1.0,0.0,1
ff707b7f0a2ea94a26c784f06c9fec69,0.0,0.75,0.25,4
ff810b43aecfacd00f20a2912ff1e288,0.0,1.0,0.0,1
ff8f1175d95c5a866ef73399cd54a1a1,0.0,1.0,0.0,7
ffb9eceeaa4d60da7b41cc8f0954cd50,0.0,1.0,0.0,1
ffd3c60281949a5cd806030203a79558,0.0,0.75,0.25,4
ffd7307e4924587b30ec014a95db1362,0.036585,0.670732,0.292683,82


In [12]:
# compute skill
temp['manager_skill'] = temp['man_high_frac']*2 + temp['man_medium_frac']

# get ixes for unranked managers...
unranked_managers_ixes = temp['man_count']<20

# ... and ranked ones
ranked_managers_ixes = ~unranked_managers_ixes

In [13]:
# compute mean values from ranked managers and assign them to unranked ones
mean_values = temp.loc[ranked_managers_ixes, ['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']].mean()
print(mean_values)

#temp.loc[unranked_managers_ixes,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values
temp.tail(10)

man_high_frac      0.082822
man_low_frac       0.664291
man_medium_frac    0.252887
manager_skill      0.418531
dtype: float64


Unnamed: 0_level_0,man_high_frac,man_low_frac,man_medium_frac,man_count,manager_skill
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ff50149b1ec6a10fbb386d761c7a4bc5,0.0,0.0,1.0,1,1.0
ff52574a198d8a349f41d57bd374e8e9,0.666667,0.166667,0.166667,6,1.5
ff564542a40d3e9b970a494df91b4cee,0.0,1.0,0.0,1,0.0
ff5a3509ebce238dfa4857839ddd24ce,0.0,1.0,0.0,1,0.0
ff707b7f0a2ea94a26c784f06c9fec69,0.0,0.75,0.25,4,0.25
ff810b43aecfacd00f20a2912ff1e288,0.0,1.0,0.0,1,0.0
ff8f1175d95c5a866ef73399cd54a1a1,0.0,1.0,0.0,7,0.0
ffb9eceeaa4d60da7b41cc8f0954cd50,0.0,1.0,0.0,1,0.0
ffd3c60281949a5cd806030203a79558,0.0,0.75,0.25,4,0.25
ffd7307e4924587b30ec014a95db1362,0.036585,0.670732,0.292683,82,0.365854


In [14]:
X_train = X_train.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_train.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_high_frac,man_low_frac,man_medium_frac,man_count,manager_skill
0,8d83f6be6675419b19a6fc29a6ce6cee,40.7701,6930401,-73.955,d13ddb71931663b61d2c105617d0c9cf,5300,3,27,4,11,138,0.133333,0.733333,0.133333,15,0.4
1,e243886a4cac69881c07c6a1f38185bf,40.7655,6843131,-73.9785,381040570cc863b743a9fce1a6ae9724,3400,3,8,3,6,76,0.025641,0.905983,0.068376,117,0.119658
2,daa8602f26a65e59d7bf579b06342860,40.7565,7062871,-73.9973,3fc15c3a17757c4cfd76120ad9c86583,2600,2,25,13,4,133,0.0,0.678571,0.321429,28,0.321429
3,0,40.7604,7169616,-73.987,5ed231677a1f24402d5670af8bb9f46b,3330,4,16,5,3,80,0.0,1.0,0.0,5,0.0
4,8e38b5a741f7e7415e8286063ffd73dc,40.743,7074188,-73.9937,2eb205ff26404c7eaacc71b2b71198f5,5495,2,27,7,6,97,0.111111,0.733333,0.155556,45,0.377778


In [26]:
# add the features computed on the training dataset to the validation dataset
X_test = X_test.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
#new_manager_ixes = X_test['man_high_frac'].isnull()
#X_test.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
X_test.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,...,man_high_frac_x,man_low_frac_x,man_medium_frac_x,man_count_x,manager_skill_x,man_high_frac_y,man_low_frac_y,man_medium_frac_y,man_count_y,manager_skill_y
0,0,40.8612,6835670,-73.9287,037cb431f3c661b2f7ed4b889f95f3a9,1800,1,7,3,0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,79fe30ac190d7b72de8ab14e04221068,40.7562,7150634,-73.9964,7533621a882f71e25173b27e3139d83d,4770,12,12,3,12,...,0.0,0.981818,0.018182,55.0,0.018182,0.0,0.981818,0.018182,55.0,0.018182
2,9511903feae56d0e8537696f1ed1a8ad,40.7081,7201589,-74.0013,1fb46c4a72bcf764ac35fc23f394760d,3740,6,22,11,6,...,0.214286,0.367347,0.418367,196.0,0.846939,0.214286,0.367347,0.418367,196.0,0.846939
3,6b3fbd289f786a2fdac1bf8aed04060e,40.7555,7211410,-73.9666,29615d444c968e27d56581c1f94cf82d,4000,8,24,5,4,...,0.066667,0.333333,0.6,15.0,0.733333,0.066667,0.333333,0.6,15.0,0.733333
4,61e8954b2236d9a685396c8656e3c59c,40.7457,6990310,-73.9785,d1762ef0af965cfb5946ba0e209cc1c5,3750,4,10,4,0,...,0.0,0.985915,0.014085,71.0,0.014085,0.0,0.985915,0.014085,71.0,0.014085


In [16]:
# add the features computed on the training dataset to the validation dataset
test_df = test_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
new_manager_ixes = test_df['man_high_frac'].isnull()
test_df.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
test_df.head()

Unnamed: 0,building_id,latitude,listing_id,longitude,manager_id,price,created_hour,created_day,num_features,num_photos,num_desc_words,man_high_frac,man_low_frac,man_medium_frac,man_count,manager_skill
0,79780be1514f645d7e6be99a3de696c5,40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,5,11,6,8,78,0.235294,0.411765,0.352941,17.0,0.823529
1,0,40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,2850,6,24,3,3,35,0.0,1.0,0.0,4.0,0.0
2,3dbbb69fd52e0d25131aa1cd459c87eb,40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,3758,4,3,3,6,333,0.0,0.0,1.0,1.0,1.0
3,783d21d013a7e655bddc4ed0d461cc5e,40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,3300,6,11,10,6,204,0.361702,0.212766,0.425532,47.0,1.148936
4,6134e7c4dd1a98d9aee36623c9872b49,40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,4900,5,12,14,7,174,0.069767,0.55814,0.372093,43.0,0.511628


#### Dealing with building_id in a similar way as manager_id:

In [17]:
bld_train_list = train_df.building_id.unique()
bld_test_list = test_df.building_id.unique()
bld_list = np.concatenate((bld_train_list,bld_test_list), axis=0)
bld_list = list(set(bld_list))

In [18]:

df100 = train_df[['building_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,building_id,low,medium,high
10,53a5b119ba8f7b61d4e010512e0dfc85,0,1,0
10000,c5c8a357cba207596b04d1afd1e4f130,1,0,0
100004,c3ba40552e2120b0acfc3cb5730bb2aa,0,0,1
100007,28d9ad350afeaab8027513a3e52ac8d5,1,0,0
100013,0,1,0,0


In [19]:
gby = pd.concat([df100.groupby('building_id').mean(),df100.groupby('building_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['bd_low_freq','bd_medium_freq','bd_high_freq','count']

In [20]:
gby['bd_avg_interest'] = gby['bd_medium_freq']*1 + gby['bd_high_freq']*2
gby.sort_values(by = 'count', ascending = False).head()

Unnamed: 0_level_0,bd_low_freq,bd_medium_freq,bd_high_freq,count,bd_avg_interest
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.915762,0.060705,0.023534,8286,0.107772
96274288c84ddd7d5c5d8e425ee75027,0.650909,0.272727,0.076364,275,0.425455
11e1dec9d14b1a9e528386a2504b3afc,0.674419,0.24186,0.083721,215,0.409302
80a120d6bc3aba97f40fee8c2204524b,0.586854,0.328638,0.084507,213,0.497653
bb8658a3e432fb62a440615333376345,0.523585,0.353774,0.122642,212,0.599057


In [25]:
mean_values = gby[['bd_low_freq','bd_medium_freq','bd_high_freq','bd_avg_interest']].mean()
mean_values

bd_low_freq        0.553652
bd_medium_freq     0.300489
bd_high_freq       0.145859
bd_avg_interest    0.592207
dtype: float64

In [None]:
# add the features computed on the training dataset to the validation dataset
X_test = X_test.merge(temp.reset_index(),how='left', left_on='building_id', right_on='building_id')
new_manager_ixes = X_test['man_high_frac'].isnull()
X_test.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
X_test.head()