In [1]:
import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics, preprocessing 
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

In [3]:
test_df.shape

(74659, 14)

### Feature Engineering:

In [4]:
test_df_id = test_df['listing_id']

In [5]:
# dropping irrelevant features
train_df.drop(['listing_id'], axis=1, inplace=True)
test_df.drop(['listing_id'], axis=1, inplace=True)

In [6]:
test_df.shape

(74659, 13)

In [7]:
test_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
100,1.0,1,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,101 East 10th Street,"[Doorman, Elevator, No Fee]",40.7306,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758,101 East 10th Street
1000,1.0,2,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,South Third Street\r,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300,251 South Third Street\r
100000,2.0,2,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","Midtown West, 8th Ave","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900,260 West 54th Street


In [8]:
# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [9]:
test_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
100,1.0,1,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,101 East 10th Street,"[Doorman, Elevator, No Fee]",40.7306,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758,101 East 10th Street
1000,1.0,2,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,South Third Street\r,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300,251 South Third Street\r
100000,2.0,2,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","Midtown West, 8th Ave","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900,260 West 54th Street


In [10]:
# creating a created_hour feature
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# and a created_day feature
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

In [11]:
# creating a column for the number of features of each house
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

In [12]:
# creating a column for the number of photos of each house
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

In [13]:
# count of words present in description column 
train_df["num_desc_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_desc_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [14]:
# deleting irrelevant features
train_df.drop(['photos', 'created', 'description'], axis=1, inplace=True)
test_df.drop(['photos', 'created', 'description'], axis=1, inplace=True)

In [15]:
categorical = ["display_address", "street_address"]

for f in categorical:
    if train_df[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
    
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

In [16]:
feature_transform = CountVectorizer(stop_words='english', max_features=150)

train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))

feature_transform.fit(list(train_df['features']) + list(test_df['features']))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
def transform_data(X):
    
    feat_sparse = feature_transform.transform(X["features"])
    vocabulary = feature_transform.vocabulary_
    del X['features']
    
    X1 = pd.DataFrame([ pd.Series(feat_sparse[i].toarray().ravel()) for i in np.arange(feat_sparse.shape[0]) ])
    X1.columns = list(sorted(vocabulary.keys()))
    X = pd.concat([X.reset_index(), X1.reset_index()], axis = 1)
    del X['index']
    return X

In [18]:
train_df = transform_data(train_df)
test_df = transform_data(test_df)

In [19]:
# Let's split the data
y = train_df["interest_level"]
X = train_df.drop(['interest_level'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

#### Dealing with manager_id:

In [20]:
man_train_list = train_df['manager_id'].unique()
man_test_list = test_df['manager_id'].unique()
man_list = np.concatenate((man_train_list,man_test_list), axis=0)
man_list = list(set(man_list))

In [21]:
df100 = train_df[['manager_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,manager_id,low,medium,high
0,5ba989232d0489da1b5f2c45f6688adc,0,1,0
1,7533621a882f71e25173b27e3139d83d,1,0,0
2,d9039c43983f6e564b1482b273bd7b01,0,0,1
3,1067e078446a7897d2da493d2f741316,1,0,0
4,98e13ad4b495b9613cef886d79a6291f,1,0,0


In [22]:
test_df.shape

(74659, 164)

In [23]:
X_train.shape

(37014, 164)

In [24]:
gby = pd.concat([df100.groupby('manager_id').mean(),df100.groupby('manager_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['man_low_frac','man_medium_frac','man_high_frac','man_count']
gby.shape

(3481, 4)

In [25]:
gby['manager_skill'] = gby['man_medium_frac']*1 + gby['man_high_frac']*2
gby.sort_values(by = 'man_count', ascending = False).head()

Unnamed: 0_level_0,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
manager_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e6472c7237327dd3903b3d6f6a94515a,0.686538,0.245559,0.067904,2533,0.381366
6e5c10246156ae5bdcd9b487ca99d96a,0.905767,0.088608,0.005626,711,0.099859
8f5a9c893f6d602f4953fcc0b8e6e9b4,0.987805,0.009756,0.002439,410,0.014634
62b685cc0d876c3a1a51d63a0d6a8082,1.0,0.0,0.0,402,0.0
cb87dadbca78fad02b388dc9e8f25a5b,0.36193,0.490617,0.147453,373,0.785523


In [26]:
mean_values = gby[['man_low_frac','man_medium_frac','man_high_frac','manager_skill']].mean()
mean_values

man_low_frac       0.722564
man_medium_frac    0.191016
man_high_frac      0.086419
manager_skill      0.363855
dtype: float64

In [27]:
# add the features computed on the training dataset to the train dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_train.tail()

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,...,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
37009,1.0,2,0,14991,40.8002,-73.9539,09cfe06945cd849badee1063e942f28e,3000,3076,3,...,0,0,0,0,0,0.857143,0.142857,0.0,21,0.142857
37010,1.0,1,10b5eee3f20713045d63d6032c8a3a91,14349,40.7687,-73.9867,de09725e4303496d757ed0b77c0baee2,2600,17449,4,...,0,0,0,0,0,0.726027,0.246575,0.027397,73,0.30137
37011,1.0,1,0,11488,40.7074,-74.007,e6472c7237327dd3903b3d6f6a94515a,3640,6270,5,...,0,0,0,0,0,0.686538,0.245559,0.067904,2533,0.381366
37012,1.0,2,0,12372,40.7198,-73.9964,0d16d096ef1256ba3fdd6df87f8e98fe,3600,5022,1,...,0,0,0,0,0,1.0,0.0,0.0,56,0.0
37013,2.0,3,d0933b0c9fb2ecdb0d6e551c41c60a6c,11070,40.7742,-73.9475,8f5a9c893f6d602f4953fcc0b8e6e9b4,5100,19454,7,...,0,0,0,0,0,0.987805,0.009756,0.002439,410,0.014634


In [28]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
X_test.tail()

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,...,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
12333,1.0,2,c5c8a357cba207596b04d1afd1e4f130,9080,40.7947,-73.9667,835a254786118d88c822b4ab68c92e31,5400,23680,5,...,0,0,0,0,0,0.983871,0.016129,0.0,62,0.016129
12334,1.0,1,0,11000,40.771,-73.9591,38e613cd90ba43943211be10168ee175,2850,6883,1,...,0,0,0,0,0,1.0,0.0,0.0,30,0.0
12335,1.0,2,0,15306,40.7609,-73.9921,0f1be9ba74fde58b799ab67255ede9a4,3500,17320,5,...,0,0,0,0,0,0.766667,0.1,0.133333,30,0.366667
12336,1.0,2,ea11299b288bdb7e740fc2dccfc3b140,12690,40.7399,-73.9864,cf36aeba79d8e830f8353e30ab551520,4225,11700,3,...,0,0,0,0,0,0.875,0.125,0.0,32,0.125
12337,1.0,1,7259f2d484b9103a1bf9f78c015e969d,9875,40.7617,-73.9617,23869bdcccdc74cb89cdbb91dabded3b,2400,14808,4,...,0,0,0,0,0,0.972973,0.027027,0.0,37,0.027027


In [29]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

new_manager_ixes = test_df['man_low_frac'].isnull()
test_df.loc[new_manager_ixes,['man_high_frac','man_low_frac', 'man_medium_frac','manager_skill']] = mean_values.values
test_df.tail()

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,...,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
74654,1.0,2,0,7182,40.7267,-73.8569,e8be88fd4ed2b9a12dca618d5e850ae9,2000,24702,1,...,0,0,0,0,0,0.733333,0.166667,0.1,30.0,0.366667
74655,1.0,1,eb4a89fa59ccddd7ff88fa63d2848291,8784,40.7061,-74.0111,8f5a9c893f6d602f4953fcc0b8e6e9b4,3649,10248,18,...,0,0,0,0,0,0.987805,0.009756,0.002439,410.0,0.014634
74656,1.0,0,dec7c3a848a6149cb78cc36357b27886,15356,40.7661,-73.9859,13ddb55a3a9d896b4bcdd538ee0ec067,2195,15281,3,...,0,0,0,0,0,0.525641,0.320513,0.153846,78.0,0.628205
74657,1.0,1,67ab535b820c8292ac59cfcffd8974e3,11108,40.7792,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,14244,15,...,0,0,0,0,0,0.684211,0.236842,0.078947,38.0,0.394737
74658,1.0,2,be97e14c554ba6a01d26243ca5eefb82,12282,40.7145,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,24425,2,...,0,0,0,0,0,0.888889,0.111111,0.0,9.0,0.111111


In [30]:
test_df.shape

(74659, 169)

#### Dealing with building_id in a similar way as manager_id:

In [31]:
bld_train_list = train_df['building_id'].unique()
bld_test_list = test_df['building_id'].unique()
bld_list = np.concatenate((bld_train_list,bld_test_list), axis=0)
bld_list = list(set(bld_list))

In [32]:
df100 = train_df[['building_id','interest_level']]
interest_dummies = pd.get_dummies(df100['interest_level'])
df100 = pd.concat([df100,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
df100.head()

Unnamed: 0,building_id,low,medium,high
0,53a5b119ba8f7b61d4e010512e0dfc85,0,1,0
1,c5c8a357cba207596b04d1afd1e4f130,1,0,0
2,c3ba40552e2120b0acfc3cb5730bb2aa,0,0,1
3,28d9ad350afeaab8027513a3e52ac8d5,1,0,0
4,0,1,0,0


In [33]:
gby = pd.concat([df100.groupby('building_id').mean(),df100.groupby('building_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['bd_low_frac','bd_medium_frac','bd_high_frac','bd_count']

In [34]:
gby['bd_avg_interest'] = gby['bd_medium_frac']*1 + gby['bd_high_frac']*2
gby.sort_values(by = 'bd_count', ascending = False).head()

Unnamed: 0_level_0,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.915762,0.060705,0.023534,8286,0.107772
96274288c84ddd7d5c5d8e425ee75027,0.650909,0.272727,0.076364,275,0.425455
11e1dec9d14b1a9e528386a2504b3afc,0.674419,0.24186,0.083721,215,0.409302
80a120d6bc3aba97f40fee8c2204524b,0.586854,0.328638,0.084507,213,0.497653
bb8658a3e432fb62a440615333376345,0.523585,0.353774,0.122642,212,0.599057


In [35]:
mean_values = gby[['bd_low_frac','bd_medium_frac','bd_high_frac','bd_avg_interest']].mean()
mean_values

bd_low_frac        0.553652
bd_medium_frac     0.300489
bd_high_frac       0.145859
bd_avg_interest    0.592207
dtype: float64

In [36]:
# add the features computed on the training dataset to the training dataset
X_train = X_train.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')
X_train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,...,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,1.0,1,413c7622c62f616b15ca521890effc71,11295,40.8152,-73.9471,136b844f49c4eae017d6cabe8ca03b9d,2150,10235,2,...,0.727273,0.181818,0.090909,66,0.363636,0.0,1.0,0.0,4,1.0
1,1.0,2,44c1ee5d0dcc2c9ee7c3e9e53aab4276,12929,40.7216,-73.9927,41735645e0f8f13993c42894023f8e58,3250,23531,2,...,0.661538,0.284615,0.053846,130,0.392308,0.8125,0.1875,0.0,16,0.1875
2,1.0,1,b352a208c3719bc21ac217d788a8f0f9,12740,40.7354,-74.0056,3bf7bc10ba7d7520f86cbf914d24c323,2689,24893,6,...,0.752294,0.211009,0.036697,109,0.284404,0.705882,0.235294,0.058824,17,0.352941
3,1.0,1,660758853b9dc281603471a2b5e8d662,14473,40.7805,-73.9788,a4e0096175a7f299d0f6e67edf143891,2400,4092,6,...,0.6,0.316667,0.083333,60,0.483333,0.0,1.0,0.0,1,1.0
4,2.0,2,65ce068d10ade85cc097134c0f6836c2,14377,40.7721,-73.9861,d2bce61e0e0079ebdc8c281e415e045b,8000,4538,6,...,0.804348,0.192029,0.003623,276,0.199275,0.869565,0.130435,0.0,46,0.130435


In [37]:
# add the features computed on the training dataset to the validation dataset obtained by train_test_split
X_test = X_test.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')

with pd.option_context('display.max_rows', 5, 'display.max_columns', 100):
    display(test_df)

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,created_day,num_features,num_photos,num_desc_words,24,_dishwasher_,_dryer,_pets_ok_,_photos,actual_apt,air_conditioning,backyard,balcony,basement_storage,bike_room,brownstone,building,business_center,cats_allowed,central_a,central_ac,children,childrens_playroom,common,common_outdoor_space,common_parking,common_roof_deck,common_terrace,concierge,courtyard,deck,decorative_fireplace,dining_room,dishwasher,dogs_allowed,doorman,dryer,dryer_in_building,dryer_in_unit,duplex,...,private_terrace,publicoutdoor,reduced_fee,renovated,residents_garden,residents_lounge,roof,roof_deck,roofdeck,s_kitchen_,s_playroom,sauna,shares_ok,short_term_allowed,simplex,site_garage,site_laundry,site_parking,site_parking_lot,site_super,space,speed_internet,stainless_steel_appliances,storage,sublet,subway,swimming_pool,terrace,time_doorman,unit_washer,valet,valet_parking,video_intercom,view,virtual_doorman,walk,walk_in_closet,war,washer,washer_,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill
0,1.0,1,79780be1514f645d7e6be99a3de696c5,13274,40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,24898,5,11,6,8,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.458333,0.333333,0.208333,24.0,0.750000
1,1.0,2,0,13391,40.7278,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,5492,6,24,3,3,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.000000,0.000000,0.000000,9.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,1.0,1,67ab535b820c8292ac59cfcffd8974e3,11108,40.7792,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,14244,15,16,3,3,203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.684211,0.236842,0.078947,38.0,0.394737
74658,1.0,2,be97e14c554ba6a01d26243ca5eefb82,12282,40.7145,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,24425,2,26,3,2,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.888889,0.111111,0.000000,9.0,0.111111


In [38]:
# add the features computed on the training dataset to the test_df dataset
test_df = test_df.merge(gby.reset_index(),how='left', left_on='building_id', right_on='building_id')

new_manager_ixes = test_df['bd_low_frac'].isnull()

test_df.loc[new_manager_ixes,['bd_high_frac','bd_low_frac', 'bd_medium_frac','bd_avg_interest']] = mean_values.values

with pd.option_context('display.max_rows', 5, 'display.max_columns', 200):
    display(test_df)

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,created_hour,created_day,num_features,num_photos,num_desc_words,24,_dishwasher_,_dryer,_pets_ok_,_photos,actual_apt,air_conditioning,backyard,balcony,basement_storage,bike_room,brownstone,building,business_center,cats_allowed,central_a,central_ac,children,childrens_playroom,common,common_outdoor_space,common_parking,common_roof_deck,common_terrace,concierge,courtyard,deck,decorative_fireplace,dining_room,dishwasher,dogs_allowed,doorman,dryer,dryer_in_building,dryer_in_unit,duplex,eat_in_kitchen,elevator,exclusive,exposed_brick,fireplace,fitness,fitness_center,flex,ft_doorman,full_service_garage,furnished,garage,garden,granite_kitchen,green_building,gym,gym_in_building,hardwood,hardwood_floors,health_club,hi_rise,high,high_ceiling,high_ceilings,high_speed_internet,highrise,in_super,in_superintendent,indoor_pool,laundry,laundry_in_building,laundry_in_unit,laundry_room,level,light,live,live_in_super,loft,lounge,lounge_room,lowrise,luxury_building,marble_bath,microwave,midrise,multi,new_construction,newly_renovated,no_fee,no_pets,outdoor,outdoor_areas,outdoor_entertainment_space,outdoor_space,parking,parking_space,patio,pet_friendly,pets_on_approval,playroom,pool,post,post_war,pre,prewar,private,private_backyard,private_balcony,private_outdoor_space,private_terrace,publicoutdoor,reduced_fee,renovated,residents_garden,residents_lounge,roof,roof_deck,roofdeck,s_kitchen_,s_playroom,sauna,shares_ok,short_term_allowed,simplex,site_garage,site_laundry,site_parking,site_parking_lot,site_super,space,speed_internet,stainless_steel_appliances,storage,sublet,subway,swimming_pool,terrace,time_doorman,unit_washer,valet,valet_parking,video_intercom,view,virtual_doorman,walk,walk_in_closet,war,washer,washer_,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,1.0,1,79780be1514f645d7e6be99a3de696c5,13274,40.7185,-73.9865,b1b1852c416d78d7765d746cb1b8921f,2950,24898,5,11,6,8,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.458333,0.333333,0.208333,24.0,0.750000,0.000000,1.000000,0.000000,3.0,1.000000
1,1.0,2,0,13391,40.7278,-74.0000,d0b5648017832b2427eeb9956d966a14,2850,5492,6,24,3,3,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.000000,0.000000,0.000000,9.0,0.000000,0.915762,0.060705,0.023534,8286.0,0.107772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74657,1.0,1,67ab535b820c8292ac59cfcffd8974e3,11108,40.7792,-73.9484,614d589dc9b706652ebc2f86d31e19ba,1775,14244,15,16,3,3,203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.684211,0.236842,0.078947,38.0,0.394737,1.000000,0.000000,0.000000,3.0,0.000000
74658,1.0,2,be97e14c554ba6a01d26243ca5eefb82,12282,40.7145,-73.9383,62c2e57109eb335cad8b03f84975e3d1,2850,24425,2,26,3,2,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.888889,0.111111,0.000000,9.0,0.111111,0.800000,0.200000,0.000000,5.0,0.200000


In [39]:
test_df.drop(['manager_id','building_id'], axis=1, inplace=True)
X_train.drop(['manager_id','building_id'], axis=1, inplace=True)
X_test.drop(['manager_id','building_id'], axis=1, inplace=True)
test_df.tail()

Unnamed: 0,bathrooms,bedrooms,display_address,latitude,longitude,price,street_address,created_hour,created_day,num_features,...,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
74654,1.0,2,7182,40.7267,-73.8569,2000,24702,1,16,3,...,0.733333,0.166667,0.1,30.0,0.366667,0.915762,0.060705,0.023534,8286.0,0.107772
74655,1.0,1,8784,40.7061,-74.0111,3649,10248,18,6,0,...,0.987805,0.009756,0.002439,410.0,0.014634,0.911765,0.088235,0.0,68.0,0.088235
74656,1.0,0,15356,40.7661,-73.9859,2195,15281,3,16,6,...,0.525641,0.320513,0.153846,78.0,0.628205,0.75,0.166667,0.083333,12.0,0.333333
74657,1.0,1,11108,40.7792,-73.9484,1775,14244,15,16,3,...,0.684211,0.236842,0.078947,38.0,0.394737,1.0,0.0,0.0,3.0,0.0
74658,1.0,2,12282,40.7145,-73.9383,2850,24425,2,26,3,...,0.888889,0.111111,0.0,9.0,0.111111,0.8,0.2,0.0,5.0,0.2


In [40]:
X_train.tail()

Unnamed: 0,bathrooms,bedrooms,display_address,latitude,longitude,price,street_address,created_hour,created_day,num_features,...,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
37009,1.0,2,14991,40.8002,-73.9539,3000,3076,3,10,8,...,0.857143,0.142857,0.0,21,0.142857,0.915762,0.060705,0.023534,8286,0.107772
37010,1.0,1,14349,40.7687,-73.9867,2600,17449,4,18,3,...,0.726027,0.246575,0.027397,73,0.30137,0.894737,0.052632,0.052632,19,0.157895
37011,1.0,1,11488,40.7074,-74.007,3640,6270,5,13,0,...,0.686538,0.245559,0.067904,2533,0.381366,0.915762,0.060705,0.023534,8286,0.107772
37012,1.0,2,12372,40.7198,-73.9964,3600,5022,1,6,3,...,1.0,0.0,0.0,56,0.0,0.915762,0.060705,0.023534,8286,0.107772
37013,2.0,3,11070,40.7742,-73.9475,5100,19454,7,6,3,...,0.987805,0.009756,0.002439,410,0.014634,0.904762,0.047619,0.047619,21,0.142857


In [41]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', 200):
    display(X_train.head(20))

Unnamed: 0,bathrooms,bedrooms,display_address,latitude,longitude,price,street_address,created_hour,created_day,num_features,num_photos,num_desc_words,24,_dishwasher_,_dryer,_pets_ok_,_photos,actual_apt,air_conditioning,backyard,balcony,basement_storage,bike_room,brownstone,building,business_center,cats_allowed,central_a,central_ac,children,childrens_playroom,common,common_outdoor_space,common_parking,common_roof_deck,common_terrace,concierge,courtyard,deck,decorative_fireplace,dining_room,dishwasher,dogs_allowed,doorman,dryer,dryer_in_building,dryer_in_unit,duplex,eat_in_kitchen,elevator,exclusive,exposed_brick,fireplace,fitness,fitness_center,flex,ft_doorman,full_service_garage,furnished,garage,garden,granite_kitchen,green_building,gym,gym_in_building,hardwood,hardwood_floors,health_club,hi_rise,high,high_ceiling,high_ceilings,high_speed_internet,highrise,in_super,in_superintendent,indoor_pool,laundry,laundry_in_building,laundry_in_unit,laundry_room,level,light,live,live_in_super,loft,lounge,lounge_room,lowrise,luxury_building,marble_bath,microwave,midrise,multi,new_construction,newly_renovated,no_fee,no_pets,outdoor,outdoor_areas,outdoor_entertainment_space,outdoor_space,parking,parking_space,patio,pet_friendly,pets_on_approval,playroom,pool,post,post_war,pre,prewar,private,private_backyard,private_balcony,private_outdoor_space,private_terrace,publicoutdoor,reduced_fee,renovated,residents_garden,residents_lounge,roof,roof_deck,roofdeck,s_kitchen_,s_playroom,sauna,shares_ok,short_term_allowed,simplex,site_garage,site_laundry,site_parking,site_parking_lot,site_super,space,speed_internet,stainless_steel_appliances,storage,sublet,subway,swimming_pool,terrace,time_doorman,unit_washer,valet,valet_parking,video_intercom,view,virtual_doorman,walk,walk_in_closet,war,washer,washer_,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,man_low_frac,man_medium_frac,man_high_frac,man_count,manager_skill,bd_low_frac,bd_medium_frac,bd_high_frac,bd_count,bd_avg_interest
0,1.0,1,11295,40.8152,-73.9471,2150,10235,2,17,4,4,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.727273,0.181818,0.090909,66,0.363636,0.000000,1.000000,0.000000,4,1.000000
1,1.0,2,12929,40.7216,-73.9927,3250,23531,2,17,1,5,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.661538,0.284615,0.053846,130,0.392308,0.812500,0.187500,0.000000,16,0.187500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,1.0,1,10849,40.7521,-73.9743,4975,3783,1,19,5,9,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.611111,0.250000,0.138889,36,0.527778,0.915762,0.060705,0.023534,8286,0.107772
19,2.0,2,11147,40.7842,-73.9480,4345,7033,14,12,4,4,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.943396,0.037736,0.018868,53,0.075472,0.657895,0.315789,0.026316,38,0.368421


In [42]:
test_df.drop(['street_address'], axis=1, inplace=True)
X_train.drop(['street_address'], axis=1, inplace=True)
X_test.drop(['street_address'], axis=1, inplace=True)

In [43]:
int_lev_dict = {'low': 2, 'medium': 1, 'high': 0}

y_train = y_train.apply(lambda x: int_lev_dict[x])

y_test = y_test.apply(lambda x: int_lev_dict[x])

### XgBoost and parameter tuning:
Parameter optimization using GridSearchCV and RandomizedSearchCV was left out from this notebook,
because it takes a very long time. I implemented it in another notebook, which I will provide you with.

In [44]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier      
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  
from time import perf_counter as timer

import matplotlib.pylab as plt
%matplotlib inline

In [45]:
def modelfit(params, X_train, y_train, X_test, y_test, early_stopping_rounds = 50, cv_folds=5, num_boost_round=1000):
    
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    
    xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
    
    eval_set =[(X_test, y_test)]
    
    model = xgb.train(params,xgtrain,num_boost_round=num_boost_round,
                      evals=[(xgtrain,'train'),(xgtest,'test')], early_stopping_rounds=early_stopping_rounds)
    
    train_preds = model.predict(xgtrain)
    test_preds = model.predict(xgtest)
    
    print('MODEL REPORT: \n')
    print("Log loss on train set : %.6g" % log_loss(y_train, train_preds))
    print("Log loss on test set : %.6g" % log_loss(y_test, test_preds))

In [46]:
params_1 = {
 'eta':0.1,
 'max_depth':6,
 'min_child_weight':1,
 'eval_metric':'mlogloss',
 'colsample_bytree':0.7,
 'objective':'multi:softprob',
 'num_class':3,
 'subsample':0.7,
 'scale_pos_weight':1,
 'seed':15}

modelfit(params_1, X_train, y_train, X_test, y_test)

[0]	train-mlogloss:1.01107	test-mlogloss:1.01211
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:0.938588	test-mlogloss:0.9405
[2]	train-mlogloss:0.877813	test-mlogloss:0.880336
[3]	train-mlogloss:0.826042	test-mlogloss:0.82919
[4]	train-mlogloss:0.779046	test-mlogloss:0.782909
[5]	train-mlogloss:0.738707	test-mlogloss:0.743376
[6]	train-mlogloss:0.703184	test-mlogloss:0.708524
[7]	train-mlogloss:0.672337	test-mlogloss:0.678371
[8]	train-mlogloss:0.644677	test-mlogloss:0.651422
[9]	train-mlogloss:0.620473	test-mlogloss:0.627779
[10]	train-mlogloss:0.598709	test-mlogloss:0.606495
[11]	train-mlogloss:0.580433	test-mlogloss:0.588736
[12]	train-mlogloss:0.563558	test-mlogloss:0.572448
[13]	train-mlogloss:0.547951	test-mlogloss:0.5573
[14]	train-mlogloss:0.533909	test-mlogloss:0.543832
[15]	train-mlogloss:0.521934	test-mlogloss:0.532272
[16]	train-mlogloss:0.511082	test-

[155]	train-mlogloss:0.320146	test-mlogloss:0.400602
[156]	train-mlogloss:0.319415	test-mlogloss:0.400739
[157]	train-mlogloss:0.318734	test-mlogloss:0.400761
[158]	train-mlogloss:0.31819	test-mlogloss:0.400746
[159]	train-mlogloss:0.317638	test-mlogloss:0.400682
[160]	train-mlogloss:0.316926	test-mlogloss:0.400734
[161]	train-mlogloss:0.316428	test-mlogloss:0.400819
[162]	train-mlogloss:0.315561	test-mlogloss:0.400995
[163]	train-mlogloss:0.31507	test-mlogloss:0.400962
[164]	train-mlogloss:0.314619	test-mlogloss:0.400956
[165]	train-mlogloss:0.313808	test-mlogloss:0.401015
[166]	train-mlogloss:0.313452	test-mlogloss:0.401023
[167]	train-mlogloss:0.312926	test-mlogloss:0.401099
[168]	train-mlogloss:0.312405	test-mlogloss:0.401127
[169]	train-mlogloss:0.31197	test-mlogloss:0.401176
[170]	train-mlogloss:0.311398	test-mlogloss:0.40121
[171]	train-mlogloss:0.310893	test-mlogloss:0.401216
[172]	train-mlogloss:0.31018	test-mlogloss:0.401249
[173]	train-mlogloss:0.30971	test-mlogloss:0.401286

 param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val

### Final training and predicting:

#### Recombining the splitted datasets, so they can be used for final training:

In [47]:
xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)

xgtest = xgb.DMatrix(test_df.values)
    
xgeval = xgb.DMatrix(X_test.values, label=y_test.values)

In [48]:
model = xgb.train(params_1,xgtrain,num_boost_round=5000,
                    evals=[(xgtrain,'train'),(xgeval,'test')], early_stopping_rounds=50)

preds = model.predict(xgtest)

[0]	train-mlogloss:1.01107	test-mlogloss:1.01211
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:0.938588	test-mlogloss:0.9405
[2]	train-mlogloss:0.877813	test-mlogloss:0.880336
[3]	train-mlogloss:0.826042	test-mlogloss:0.82919
[4]	train-mlogloss:0.779046	test-mlogloss:0.782909
[5]	train-mlogloss:0.738707	test-mlogloss:0.743376
[6]	train-mlogloss:0.703184	test-mlogloss:0.708524
[7]	train-mlogloss:0.672337	test-mlogloss:0.678371
[8]	train-mlogloss:0.644677	test-mlogloss:0.651422
[9]	train-mlogloss:0.620473	test-mlogloss:0.627779
[10]	train-mlogloss:0.598709	test-mlogloss:0.606495
[11]	train-mlogloss:0.580433	test-mlogloss:0.588736
[12]	train-mlogloss:0.563558	test-mlogloss:0.572448
[13]	train-mlogloss:0.547951	test-mlogloss:0.5573
[14]	train-mlogloss:0.533909	test-mlogloss:0.543832
[15]	train-mlogloss:0.521934	test-mlogloss:0.532272
[16]	train-mlogloss:0.511082	test-

[155]	train-mlogloss:0.320146	test-mlogloss:0.400602
[156]	train-mlogloss:0.319415	test-mlogloss:0.400739
[157]	train-mlogloss:0.318734	test-mlogloss:0.400761
[158]	train-mlogloss:0.31819	test-mlogloss:0.400746
[159]	train-mlogloss:0.317638	test-mlogloss:0.400682
[160]	train-mlogloss:0.316926	test-mlogloss:0.400734
[161]	train-mlogloss:0.316428	test-mlogloss:0.400819
[162]	train-mlogloss:0.315561	test-mlogloss:0.400995
[163]	train-mlogloss:0.31507	test-mlogloss:0.400962
[164]	train-mlogloss:0.314619	test-mlogloss:0.400956
[165]	train-mlogloss:0.313808	test-mlogloss:0.401015
[166]	train-mlogloss:0.313452	test-mlogloss:0.401023
[167]	train-mlogloss:0.312926	test-mlogloss:0.401099
[168]	train-mlogloss:0.312405	test-mlogloss:0.401127
[169]	train-mlogloss:0.31197	test-mlogloss:0.401176
[170]	train-mlogloss:0.311398	test-mlogloss:0.40121
[171]	train-mlogloss:0.310893	test-mlogloss:0.401216
[172]	train-mlogloss:0.31018	test-mlogloss:0.401249
[173]	train-mlogloss:0.30971	test-mlogloss:0.401286

In [49]:
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]

out_df["listing_id"] = test_df_id.values

out_df = out_df[['listing_id', 'high', 'medium', 'low']]

out_df.head()
out_df.shape

(74659, 4)

In [50]:
out_df.to_csv("xgb_starter2_2.csv", index=False)