In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, num_rounds=1000):
    """Train a 3-class softprob XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y :
        Training features and integer class labels (``num_class`` is fixed to 3).
    test_X :
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is used as an evaluation
        set and training stops early once its mlogloss has not improved for
        20 rounds. When ``None`` the model is trained for exactly
        ``num_rounds`` rounds.
    feature_names : optional
        Accepted for interface compatibility; currently unused.
    num_rounds : int, default 1000
        Maximum number of boosting rounds passed to ``xgb.train``.

    Returns
    -------
    (pred_test_y, model)
        The output of ``model.predict`` on ``test_X`` and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': 321,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        # Labelled hold-out available: monitor it and stop early.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # No labels: train for the full requested number of rounds.
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
# Load the pre-engineered train / test feature tables (zipped JSON).
train_df, test_df = (
    pd.read_json(archive_name)
    for archive_name in ('new_train.json.zip', 'new_test.json.zip')
)

In [4]:
# Initial Model: XGBClassifier with default hyper-parameters as a baseline.
feature_cols = ['bedrooms','bathrooms','price', 'latitude', 'mean_des_tdidf',
                'length_description', 'created_hour', 'closest_station', 'closest_hospital', 'mean_feature_tdidf',
                'created_day','photos_num']

X = train_df[feature_cols]
y = train_df['interest_level']

# Fix the split seed so the hold-out score below is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=321)

model = XGBClassifier()
model.fit(X_train, y_train)

# NOTE: despite the label, this is the mean 5-fold cross-validated accuracy on
# the full training table (cross_val_score fits its own copies of the estimator).
score = cross_val_score(model, X, y, cv=5)
print('Training accuracy :', np.mean(score))

# NOTE: "Test" here is the 30% hold-out split of the training data.
predictions = model.predict(X_valid)
# accuracy_score expects (y_true, y_pred).
test_accuracy = accuracy_score(y_valid, predictions)
print('Test accuracy :', test_accuracy)

Training accuracy : 0.7121103742279631
Test accuracy : 0.7123065486247212


In [17]:
X_test = test_df[feature_cols]

# Refit the baseline on the full training table before predicting.
model = model.fit(X,y)

y_pred = model.predict_proba(X_test)

# predict_proba columns follow model.classes_, which for the string labels is
# sorted order ('high', 'low', 'medium') -- NOT high/medium/low. Look each
# column up by label instead of hard-coding positions.
class_column = {label: position for position, label in enumerate(model.classes_)}

submission = pd.DataFrame({
    "listing_id": test_df["listing_id"],
    "high": y_pred[:, class_column["high"]],
    "medium": y_pred[:, class_column["medium"]],
    "low": y_pred[:, class_column["low"]]
})

titles_columns=["listing_id","high","medium","low"]
submission=submission.reindex(columns=titles_columns)
submission.to_csv('initial_xgb_submission.csv', index=False)

In [None]:
feature_cols = ['bedrooms','bathrooms','price', 'latitude', 'mean_des_tdidf',
                'length_description', 'created_hour', 'closest_station', 'closest_hospital', 'mean_feature_tdidf',
                'created_day','photos_num']

x = train_df[feature_cols]
test_X = test_df[feature_cols]

X = np.array(x)
# Integer encoding of the target; the order fixes the probability column order
# returned by runXGB (column 0 = high, 1 = medium, 2 = low).
targetMapping = {'high':0, 'medium':1, 'low':2}
y = np.array(train_df['interest_level'].apply(lambda level: targetMapping[level]))
class_labels = sorted(targetMapping.values())

kf = KFold(n_splits=5, shuffle = False)
cv_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_validation = X[train_index], X[test_index]
    y_train, y_validation = y[train_index], y[test_index]

    preds, model = runXGB(X_train, y_train, X_validation, y_validation)

    # Pass the full label set so scoring stays correct even if a fold
    # happens to miss one of the classes.
    cv_scores.append(log_loss(y_validation, preds, labels=class_labels))

# Report the result of the cross-validation instead of discarding it.
print('CV mlogloss per fold :', cv_scores)
print('Mean CV mlogloss :', np.mean(cv_scores))
        

[0]	train-mlogloss:1.04147	test-mlogloss:1.04032
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.992993	test-mlogloss:0.991126
[2]	train-mlogloss:0.952901	test-mlogloss:0.951616
[3]	train-mlogloss:0.916869	test-mlogloss:0.915978
[4]	train-mlogloss:0.885734	test-mlogloss:0.885499
[5]	train-mlogloss:0.857957	test-mlogloss:0.85747
[6]	train-mlogloss:0.834557	test-mlogloss:0.833948
[7]	train-mlogloss:0.814066	test-mlogloss:0.813821
[8]	train-mlogloss:0.79604	test-mlogloss:0.795767
[9]	train-mlogloss:0.778951	test-mlogloss:0.779025
[10]	train-mlogloss:0.763582	test-mlogloss:0.763103
[11]	train-mlogloss:0.750153	test-mlogloss:0.749712
[12]	train-mlogloss:0.739777	test-mlogloss:0.739337
[13]	train-mlogloss:0.729841	test-mlogloss:0.729395
[14]	train-mlogloss:0.720952	test-mlogloss:0.720844
[15]	train-mlogloss:0.713401	test-mlogloss:0.713761
[16]	train-mlogloss:0.705212	te

[155]	train-mlogloss:0.517435	test-mlogloss:0.588486
[156]	train-mlogloss:0.516806	test-mlogloss:0.588429
[157]	train-mlogloss:0.516276	test-mlogloss:0.588354
[158]	train-mlogloss:0.51574	test-mlogloss:0.588384
[159]	train-mlogloss:0.515238	test-mlogloss:0.588317
[160]	train-mlogloss:0.51468	test-mlogloss:0.588264
[161]	train-mlogloss:0.514312	test-mlogloss:0.588157
[162]	train-mlogloss:0.513581	test-mlogloss:0.588147
[163]	train-mlogloss:0.513007	test-mlogloss:0.588023
[164]	train-mlogloss:0.512586	test-mlogloss:0.587975
[165]	train-mlogloss:0.511927	test-mlogloss:0.587853
[166]	train-mlogloss:0.51136	test-mlogloss:0.58779
[167]	train-mlogloss:0.510954	test-mlogloss:0.587811
[168]	train-mlogloss:0.510357	test-mlogloss:0.587729
[169]	train-mlogloss:0.509712	test-mlogloss:0.587604
[170]	train-mlogloss:0.509412	test-mlogloss:0.587573
[171]	train-mlogloss:0.509039	test-mlogloss:0.587539
[172]	train-mlogloss:0.508459	test-mlogloss:0.587457
[173]	train-mlogloss:0.507949	test-mlogloss:0.5873

[32]	train-mlogloss:0.637054	test-mlogloss:0.657369
[33]	train-mlogloss:0.634646	test-mlogloss:0.655561
[34]	train-mlogloss:0.631773	test-mlogloss:0.653373
[35]	train-mlogloss:0.630088	test-mlogloss:0.652456
[36]	train-mlogloss:0.627352	test-mlogloss:0.650159
[37]	train-mlogloss:0.625512	test-mlogloss:0.64906
[38]	train-mlogloss:0.624081	test-mlogloss:0.648289
[39]	train-mlogloss:0.62172	test-mlogloss:0.646696
[40]	train-mlogloss:0.618873	test-mlogloss:0.644632
[41]	train-mlogloss:0.616899	test-mlogloss:0.643145
[42]	train-mlogloss:0.615298	test-mlogloss:0.642194
[43]	train-mlogloss:0.612569	test-mlogloss:0.640161
[44]	train-mlogloss:0.610516	test-mlogloss:0.638924
[45]	train-mlogloss:0.608525	test-mlogloss:0.637427
[46]	train-mlogloss:0.606364	test-mlogloss:0.63595
[47]	train-mlogloss:0.605046	test-mlogloss:0.635231
[48]	train-mlogloss:0.60322	test-mlogloss:0.634083
[49]	train-mlogloss:0.601269	test-mlogloss:0.632686
[50]	train-mlogloss:0.599998	test-mlogloss:0.631968
[51]	train-mlogl

[189]	train-mlogloss:0.495331	test-mlogloss:0.599124
[190]	train-mlogloss:0.494815	test-mlogloss:0.599081
[191]	train-mlogloss:0.494427	test-mlogloss:0.59908
[192]	train-mlogloss:0.494053	test-mlogloss:0.599067
[193]	train-mlogloss:0.493567	test-mlogloss:0.599037
[194]	train-mlogloss:0.493214	test-mlogloss:0.598916
[195]	train-mlogloss:0.492671	test-mlogloss:0.598823
[196]	train-mlogloss:0.492331	test-mlogloss:0.598793
[197]	train-mlogloss:0.491877	test-mlogloss:0.59871
[198]	train-mlogloss:0.491464	test-mlogloss:0.598538
[199]	train-mlogloss:0.490933	test-mlogloss:0.598426
[200]	train-mlogloss:0.49033	test-mlogloss:0.598304
[201]	train-mlogloss:0.489742	test-mlogloss:0.598188
[202]	train-mlogloss:0.489244	test-mlogloss:0.598213
[203]	train-mlogloss:0.488706	test-mlogloss:0.598152
[204]	train-mlogloss:0.488269	test-mlogloss:0.59814
[205]	train-mlogloss:0.487852	test-mlogloss:0.598057
[206]	train-mlogloss:0.48734	test-mlogloss:0.598067
[207]	train-mlogloss:0.487049	test-mlogloss:0.59812

[345]	train-mlogloss:0.430449	test-mlogloss:0.594799
[346]	train-mlogloss:0.430241	test-mlogloss:0.594773
[347]	train-mlogloss:0.429993	test-mlogloss:0.594724
[348]	train-mlogloss:0.429708	test-mlogloss:0.594745
[349]	train-mlogloss:0.429308	test-mlogloss:0.594813
[350]	train-mlogloss:0.428888	test-mlogloss:0.594914
[351]	train-mlogloss:0.428654	test-mlogloss:0.594802
[352]	train-mlogloss:0.428333	test-mlogloss:0.59475
[353]	train-mlogloss:0.427996	test-mlogloss:0.594758
[354]	train-mlogloss:0.427658	test-mlogloss:0.594728
[355]	train-mlogloss:0.427226	test-mlogloss:0.594799
[356]	train-mlogloss:0.426913	test-mlogloss:0.594719
[357]	train-mlogloss:0.42675	test-mlogloss:0.594736
[358]	train-mlogloss:0.426306	test-mlogloss:0.594695
[359]	train-mlogloss:0.425891	test-mlogloss:0.594681
[360]	train-mlogloss:0.42546	test-mlogloss:0.594709
[361]	train-mlogloss:0.425002	test-mlogloss:0.594762
[362]	train-mlogloss:0.424651	test-mlogloss:0.594705
[363]	train-mlogloss:0.424295	test-mlogloss:0.594

[105]	train-mlogloss:0.543673	test-mlogloss:0.613563
[106]	train-mlogloss:0.542955	test-mlogloss:0.61339
[107]	train-mlogloss:0.542211	test-mlogloss:0.613316
[108]	train-mlogloss:0.541821	test-mlogloss:0.613303
[109]	train-mlogloss:0.541136	test-mlogloss:0.61335
[110]	train-mlogloss:0.540507	test-mlogloss:0.613238
[111]	train-mlogloss:0.539959	test-mlogloss:0.613224
[112]	train-mlogloss:0.53909	test-mlogloss:0.612953
[113]	train-mlogloss:0.538268	test-mlogloss:0.612814
[114]	train-mlogloss:0.537476	test-mlogloss:0.612527
[115]	train-mlogloss:0.536683	test-mlogloss:0.612365
[116]	train-mlogloss:0.535971	test-mlogloss:0.612279
[117]	train-mlogloss:0.53514	test-mlogloss:0.612128
[118]	train-mlogloss:0.534247	test-mlogloss:0.611963
[119]	train-mlogloss:0.533457	test-mlogloss:0.611765
[120]	train-mlogloss:0.532567	test-mlogloss:0.611603
[121]	train-mlogloss:0.532132	test-mlogloss:0.611527
[122]	train-mlogloss:0.531479	test-mlogloss:0.611372
[123]	train-mlogloss:0.530892	test-mlogloss:0.6113

[261]	train-mlogloss:0.460071	test-mlogloss:0.606377
[262]	train-mlogloss:0.459674	test-mlogloss:0.606405
[263]	train-mlogloss:0.459255	test-mlogloss:0.60641
[264]	train-mlogloss:0.458857	test-mlogloss:0.606493
[265]	train-mlogloss:0.458384	test-mlogloss:0.606386
[266]	train-mlogloss:0.45802	test-mlogloss:0.606406
[267]	train-mlogloss:0.457546	test-mlogloss:0.606342
[268]	train-mlogloss:0.457331	test-mlogloss:0.606413
Stopping. Best iteration:
[248]	train-mlogloss:0.465634	test-mlogloss:0.605902

[0]	train-mlogloss:1.04435	test-mlogloss:1.04579
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.995972	test-mlogloss:0.999301
[2]	train-mlogloss:0.956337	test-mlogloss:0.961171
[3]	train-mlogloss:0.922433	test-mlogloss:0.928498
[4]	train-mlogloss:0.89007	test-mlogloss:0.897859
[5]	train-mlogloss:0.861557	test-mlogloss:0.870674
[6]	train-mlogloss:0.837342	test-mlogloss:0.

[145]	train-mlogloss:0.513568	test-mlogloss:0.619755
[146]	train-mlogloss:0.513262	test-mlogloss:0.619728
[147]	train-mlogloss:0.512609	test-mlogloss:0.619662
[148]	train-mlogloss:0.512106	test-mlogloss:0.619681
[149]	train-mlogloss:0.511637	test-mlogloss:0.619649
[150]	train-mlogloss:0.511085	test-mlogloss:0.619603
[151]	train-mlogloss:0.510324	test-mlogloss:0.619536
[152]	train-mlogloss:0.509805	test-mlogloss:0.619462
[153]	train-mlogloss:0.509266	test-mlogloss:0.619345
[154]	train-mlogloss:0.508928	test-mlogloss:0.619409
[155]	train-mlogloss:0.508608	test-mlogloss:0.619405
[156]	train-mlogloss:0.508048	test-mlogloss:0.619346
[157]	train-mlogloss:0.507453	test-mlogloss:0.61929
[158]	train-mlogloss:0.506887	test-mlogloss:0.619271
[159]	train-mlogloss:0.506178	test-mlogloss:0.619296
[160]	train-mlogloss:0.505729	test-mlogloss:0.619274
[161]	train-mlogloss:0.505319	test-mlogloss:0.619299
[162]	train-mlogloss:0.504931	test-mlogloss:0.619258
[163]	train-mlogloss:0.504234	test-mlogloss:0.6

[37]	train-mlogloss:0.624663	test-mlogloss:0.652086
[38]	train-mlogloss:0.623163	test-mlogloss:0.651059
[39]	train-mlogloss:0.620759	test-mlogloss:0.64953
[40]	train-mlogloss:0.617988	test-mlogloss:0.647469
[41]	train-mlogloss:0.61587	test-mlogloss:0.646107
[42]	train-mlogloss:0.614361	test-mlogloss:0.645219
[43]	train-mlogloss:0.61168	test-mlogloss:0.643225
[44]	train-mlogloss:0.609825	test-mlogloss:0.642194
[45]	train-mlogloss:0.607618	test-mlogloss:0.640878
[46]	train-mlogloss:0.605522	test-mlogloss:0.639342
[47]	train-mlogloss:0.603946	test-mlogloss:0.63864
[48]	train-mlogloss:0.60223	test-mlogloss:0.637412
[49]	train-mlogloss:0.600319	test-mlogloss:0.636192
[50]	train-mlogloss:0.599274	test-mlogloss:0.635603
[51]	train-mlogloss:0.597835	test-mlogloss:0.634757
[52]	train-mlogloss:0.596015	test-mlogloss:0.633655
[53]	train-mlogloss:0.594509	test-mlogloss:0.633088
[54]	train-mlogloss:0.593133	test-mlogloss:0.632502
[55]	train-mlogloss:0.592103	test-mlogloss:0.631924
[56]	train-mloglo

In [25]:
# Train on the full training table for a fixed 400 rounds and predict the test set.
preds, model = runXGB(x, y, test_X, num_rounds=400)

# Probability columns are taken positionally: 0 -> high, 1 -> medium, 2 -> low.
titles_columns=["listing_id","high","medium","low"]
submission = pd.DataFrame({
    "listing_id": test_df["listing_id"],
    **{class_name: preds[:, position]
       for position, class_name in enumerate(["high", "medium", "low"])},
})
submission = submission.reindex(columns=titles_columns)
submission.to_csv('improved_xgb_submission.csv', index=False)