In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=321, num_rounds=1000):
    """Train a 3-class ``multi:softprob`` XGBoost booster and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels, passed
        straight to ``xgb.DMatrix``.
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used as
        an evaluation set and training stops once its mlogloss has not
        improved for 20 rounds.
    feature_names : accepted for backward compatibility; currently unused.
    seed_val : random seed handed to XGBoost (default 321, the original value).
    num_rounds : maximum number of boosting rounds (default 1000, the
        original value).

    Returns
    -------
    (pred_test_y, model) : the class-probability predictions for ``test_X``
        (callers index them as ``[:, 0..2]``) and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        # Legacy quiet flag; newer XGBoost releases use 'verbosity' instead.
        # Kept as-is for the XGBoost version this notebook was run with.
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    best_iteration = None
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
        # Only set when early stopping was active; getattr guards versions
        # where the attribute is missing.
        best_iteration = getattr(model, 'best_iteration', None)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if best_iteration is None:
        pred_test_y = model.predict(xgtest)
    else:
        # The returned booster still contains the rounds trained after the
        # best one, so restrict prediction to the best iteration explicitly.
        try:
            pred_test_y = model.predict(xgtest, iteration_range=(0, best_iteration + 1))
        except TypeError:
            # Older XGBoost: predict() has no iteration_range keyword.
            pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    return pred_test_y, model

In [3]:
# Load the pre-engineered train/test feature tables (zipped JSON).
train_df, test_df = (
    pd.read_json(archive_path)
    for archive_path in ('new_train.json.zip', 'new_test.json.zip')
)

In [4]:
# Initial Model: XGBClassifier with default hyper-parameters as a baseline.
feature_cols = ['bedrooms','bathrooms','price', 'latitude', 'mean_des_tdidf',
                'length_description', 'created_hour', 'closest_station', 'closest_hospital', 'mean_feature_tdidf',
                'created_day','photos_num']

X = train_df[feature_cols]
y = train_df['interest_level']

# Fix the split seed so the hold-out accuracy is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=321)

model = XGBClassifier()
model.fit(X_train, y_train)

# NOTE: despite the printed label, this is the mean 5-fold cross-validated
# accuracy over all of X, not accuracy on the training rows. cross_val_score
# fits clones of the estimator, so the model fitted above is left untouched.
score = cross_val_score(model, X, y, cv=5)
print('Training accuracy :', np.mean(score))

# Accuracy on the 30% hold-out split (accuracy_score expects y_true first).
predictions = model.predict(X_valid)
test_accuracy = accuracy_score(y_valid, predictions)
print('Test accuracy :', test_accuracy)

Training accuracy : 0.7121103742279631
Test accuracy : 0.7087247415016558


In [6]:
X_test = test_df[feature_cols]

# Refit the baseline classifier on every labelled row before scoring the test set.
model = model.fit(X,y)

y_pred = model.predict_proba(X_test)

# predict_proba columns follow model.classes_, which for string labels is the
# sorted order ('high', 'low', 'medium') -- not high/medium/low. Look each
# column up by label so 'medium' and 'low' are not swapped in the submission.
class_to_col = {str(label): col for col, label in enumerate(model.classes_)}

submission = pd.DataFrame({
    "listing_id": test_df["listing_id"],
    "high": y_pred[:, class_to_col["high"]],
    "medium": y_pred[:, class_to_col["medium"]],
    "low": y_pred[:, class_to_col["low"]]
})

titles_columns=["listing_id","high","medium","low"]
submission=submission.reindex(columns=titles_columns)
submission.to_csv('initial_xgb_submission.csv', index=False)

In [7]:
feature_cols = ['bedrooms','bathrooms','price', 'latitude', 'mean_des_tdidf',
                'length_description', 'created_hour', 'closest_station', 'closest_hospital', 'mean_feature_tdidf',
                'created_day','photos_num']

x = train_df[feature_cols]
test_X = test_df[feature_cols]

X = np.array(x)
# Explicit label encoding: prediction column 0/1/2 == high/medium/low.
targetMapping = {'high':0, 'medium':1, 'low':2}
# Series.map replaces the previous lambda, whose parameter was also named `x`
# and shadowed the feature DataFrame above.
encoded_levels = train_df['interest_level'].map(targetMapping)
if encoded_levels.isnull().any():
    # map() yields NaN for unknown labels; fail loudly like the dict lookup did.
    raise ValueError('interest_level contains values missing from targetMapping')
y = np.array(encoded_levels.astype(np.int64))


kf = KFold(n_splits=5, shuffle = False)
cv_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_validation = X[train_index], X[test_index]
    y_train, y_validation = y[train_index], y[test_index]

    preds, model = runXGB(X_train, y_train, X_validation, y_validation)

    # Pass the full label set so the metric is well-defined even if an
    # unshuffled fold happens to lack one of the classes.
    cv_scores.append(log_loss(y_validation, preds, labels=sorted(targetMapping.values())))
    # Only the first fold is evaluated -- presumably to limit runtime; remove
    # the break to score all 5 folds.
    break

# Surface the validation score(s); previously they were computed but never shown.
print('Validation mlogloss per fold :', cv_scores)
        

[0]	train-mlogloss:1.04147	test-mlogloss:1.04032
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.992993	test-mlogloss:0.991126
[2]	train-mlogloss:0.952901	test-mlogloss:0.951616
[3]	train-mlogloss:0.916869	test-mlogloss:0.915978
[4]	train-mlogloss:0.885734	test-mlogloss:0.885499
[5]	train-mlogloss:0.857957	test-mlogloss:0.85747
[6]	train-mlogloss:0.834557	test-mlogloss:0.833948
[7]	train-mlogloss:0.814066	test-mlogloss:0.813821
[8]	train-mlogloss:0.79604	test-mlogloss:0.795767
[9]	train-mlogloss:0.778951	test-mlogloss:0.779025
[10]	train-mlogloss:0.763582	test-mlogloss:0.763103
[11]	train-mlogloss:0.750153	test-mlogloss:0.749712
[12]	train-mlogloss:0.739777	test-mlogloss:0.739337
[13]	train-mlogloss:0.729841	test-mlogloss:0.729395
[14]	train-mlogloss:0.720952	test-mlogloss:0.720844
[15]	train-mlogloss:0.713401	test-mlogloss:0.713761
[16]	train-mlogloss:0.705212	te

[155]	train-mlogloss:0.517435	test-mlogloss:0.588486
[156]	train-mlogloss:0.516806	test-mlogloss:0.588429
[157]	train-mlogloss:0.516276	test-mlogloss:0.588354
[158]	train-mlogloss:0.51574	test-mlogloss:0.588384
[159]	train-mlogloss:0.515238	test-mlogloss:0.588317
[160]	train-mlogloss:0.51468	test-mlogloss:0.588264
[161]	train-mlogloss:0.514312	test-mlogloss:0.588157
[162]	train-mlogloss:0.513581	test-mlogloss:0.588147
[163]	train-mlogloss:0.513007	test-mlogloss:0.588023
[164]	train-mlogloss:0.512586	test-mlogloss:0.587975
[165]	train-mlogloss:0.511927	test-mlogloss:0.587853
[166]	train-mlogloss:0.51136	test-mlogloss:0.58779
[167]	train-mlogloss:0.510954	test-mlogloss:0.587811
[168]	train-mlogloss:0.510357	test-mlogloss:0.587729
[169]	train-mlogloss:0.509712	test-mlogloss:0.587604
[170]	train-mlogloss:0.509412	test-mlogloss:0.587573
[171]	train-mlogloss:0.509039	test-mlogloss:0.587539
[172]	train-mlogloss:0.508459	test-mlogloss:0.587457
[173]	train-mlogloss:0.507949	test-mlogloss:0.5873

In [9]:
# Train on the full training set (no validation labels, so no early stopping)
# and predict class probabilities for the test listings.
preds, model = runXGB(x, y, test_X)

# Prediction columns 0, 1, 2 correspond to high, medium, low respectively.
titles_columns = ["listing_id", "high", "medium", "low"]
submission = pd.DataFrame(
    {
        "listing_id": test_df["listing_id"],
        **{label: preds[:, col] for col, label in enumerate(titles_columns[1:])},
    },
    columns=titles_columns,
)
submission.to_csv('improved_xgb_submission.csv', index=False)