In [1]:
import os,sys
import re
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from mochi import CVstatistics

In [28]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,      
           seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,     
           max_depth = 6,cv_dict = None,verbose_eval=True,
          cb=0.7,sb=0.7):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = eta
    param['max_depth'] = max_depth
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = cb
    param['colsample_bytree'] = sb
    param['seed'] = seed_val
    param['nthread'] = 4
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,        
            early_stopping_rounds=early_stop,evals_result = cv_dict,verbose_eval = verbose_eval)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
#the data path list
data_path = '/home/raku/kaggleData/2sigma/'
#model_list = ['et2000','knn4','knn8','knn16','knn32','lr4','xgb142','rf2000']
#new_model_list=['et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm']
model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','ann','lgbm'
           ,'lr4','loglr','et2000','rf2000','lgb145']

In [4]:
meta_train_temp_list=[]
meta_test_temp_list=[]
for model in model_list:
    dp = os.listdir(data_path+model+'/')
    fold_out_file = []
    for filename in dp:
        if re.match('\S+-5fold-out-\d.pickle',filename)!=None:
            fold_out_file.append(filename)
        elif re.match('\S+-bulk-out.json',filename)!=None:
            bulk_out_file = filename
            
    fold_out_file=sorted(fold_out_file)
    #print fold_out_file
    #print bulk_out_file
    #load the pickles and combine into meta_train
    #load the json bulk out into meta_test
    pickle_data = []
    json_data = pd.read_json(data_path+model+'/'+bulk_out_file)
    temp_meta_test=pd.DataFrame(json_data['high'])
    temp_meta_test.columns=[model+'_high']
    temp_meta_test[model+'_medium']=json_data['medium']
    temp_meta_test[model+'_low']=json_data['low']
    if 'listing_id' in json_data.columns:
        test_listing=json_data['listing_id']
    
    for pickle_file in fold_out_file:
        pickl_file = data_path+model+'/'+pickle_file
        fileObject = open(pickl_file,'r') 
        pickle_data.append(pd.DataFrame(pickle.load(fileObject)))   
        fileObject.close()
    temp_meta_train=pd.concat(pickle_data)
    temp_meta_train.columns=[model+'_high',model+'_medium',model+'_low']
    temp_meta_test.columns=[model+'_high',model+'_medium',model+'_low']
    meta_train_temp_list.append(temp_meta_train)
    meta_test_temp_list.append(temp_meta_test)
    
meta_train=pd.concat(meta_train_temp_list,axis=1)
meta_test=pd.concat(meta_test_temp_list,axis=1)

In [5]:
y_data = []
for pickle_file in ['y-5fold-out-0.pickle','y-5fold-out-1.pickle','y-5fold-out-2.pickle','y-5fold-out-3.pickle','y-5fold-out-4.pickle']:
    pickl_file = data_path+'/'+pickle_file
    fileObject = open(pickl_file,'r') 
    y_data.append(pd.DataFrame(pickle.load(fileObject)))   
    fileObject.close()
meta_train_y=np.array(pd.concat(y_data)[0])

In [24]:
test_listing

0        7142618
1        7210040
10       6832604
100      6830595
1000     6843709
10000    7232076
10001    6832266
10002    6931714
10003    7316357
10004    6839137
10005    6879707
10006    6866122
10007    7154141
10008    7194059
10009    6849409
1001     6880861
10010    6850837
10011    6915788
10012    6922125
10013    6816508
10014    6915345
10015    7107945
10016    6841711
10017    6832076
10018    6868588
10019    6855095
1002     6927246
10020    7185889
10021    6851285
10022    6911912
          ...   
9972     6812689
9973     6945375
9974     6838369
9975     6876359
9976     6871970
9977     6911618
9978     6879762
9979     6834041
998      6835259
9980     6937182
9981     6936112
9982     6859557
9983     7127420
9984     6889376
9985     6874327
9986     6870171
9987     6942777
9988     6858105
9989     6933079
999      6878793
9990     6850266
9991     6924419
9992     6868652
9993     6883509
9994     6832931
9995     6915633
9996     7100256
9997     69455

In [6]:
highs=meta_train.filter(like='_high')
coeff_matrix={}
for features in highs.columns:
    temp_matrix={}
    for another_feat in highs.columns:
        temp_matrix[another_feat]=pearsonr(highs[features],highs[another_feat])[0]
    coeff_matrix[features]=temp_matrix
high_coeff_df = pd.DataFrame(coeff_matrix).replace(1.0,-1)
high_coeff_df.max()

ann_high            0.933713
et1000mf140_high    0.964709
et2000_high         0.944836
knn16_high          0.927173
knn32_high          0.927173
knn4_high           0.846329
knn8_high           0.890040
lgb145_high         0.996205
lgbm_high           0.996205
loglrC03_high       0.998727
loglr_high          0.998727
lr4_high            0.998918
lrl1C1_high         0.996338
lrl2C3_high         0.998918
rf1000mf70_high     0.973423
rf2000_high         0.973423
xgb142_high         0.978066
dtype: float64

In [7]:
high_coeff_df

Unnamed: 0,ann_high,et1000mf140_high,et2000_high,knn16_high,knn32_high,knn4_high,knn8_high,lgb145_high,lgbm_high,loglrC03_high,loglr_high,lr4_high,lrl1C1_high,lrl2C3_high,rf1000mf70_high,rf2000_high,xgb142_high
ann_high,-1.0,0.783801,0.744929,0.723163,0.768169,0.562121,0.65265,0.801473,0.797217,0.933713,0.933024,0.930097,0.9308,0.923855,0.805705,0.81009,0.797396
et1000mf140_high,0.783801,-1.0,0.944836,0.671221,0.700832,0.556976,0.622569,0.828525,0.827946,0.748515,0.746597,0.762986,0.765281,0.762447,0.95415,0.964709,0.842248
et2000_high,0.744929,0.944836,-1.0,0.624637,0.655123,0.513843,0.576704,0.762313,0.762964,0.718713,0.716768,0.718594,0.721633,0.716591,0.882602,0.937338,0.780305
knn16_high,0.723163,0.671221,0.624637,-1.0,0.927173,0.753741,0.89004,0.638831,0.634207,0.713997,0.710163,0.70456,0.704489,0.699562,0.703869,0.713161,0.631619
knn32_high,0.768169,0.700832,0.655123,0.927173,-1.0,0.698403,0.822283,0.669413,0.663734,0.761322,0.757097,0.748948,0.749143,0.74327,0.736366,0.746184,0.660193
knn4_high,0.562121,0.556976,0.513843,0.753741,0.698403,-1.0,0.846329,0.515898,0.513133,0.552377,0.549731,0.550288,0.550636,0.547383,0.580875,0.587759,0.516235
knn8_high,0.65265,0.622569,0.576704,0.89004,0.822283,0.846329,-1.0,0.587051,0.58357,0.643003,0.639919,0.636677,0.637135,0.632795,0.652416,0.659898,0.582678
lgb145_high,0.801473,0.828525,0.762313,0.638831,0.669413,0.515898,0.587051,-1.0,0.996205,0.746534,0.745697,0.807313,0.8061,0.811686,0.879079,0.845978,0.976357
lgbm_high,0.797217,0.827946,0.762964,0.634207,0.663734,0.513133,0.58357,0.996205,-1.0,0.741386,0.740706,0.802943,0.801731,0.807414,0.877088,0.844601,0.978066
loglrC03_high,0.933713,0.748515,0.718713,0.713997,0.761322,0.552377,0.643003,0.746534,0.741386,-1.0,0.998727,0.934713,0.9359,0.923461,0.7687,0.782385,0.740111


In [26]:
mediums=meta_train.filter(like='_medium')
coeff_matrix={}
for features in mediums.columns:
    temp_matrix={}
    for another_feat in mediums.columns:
        temp_matrix[another_feat]=pearsonr(mediums[features],mediums[another_feat])[0]
    coeff_matrix[features]=temp_matrix
medium_coeff_df = pd.DataFrame(coeff_matrix).replace(1.0,-1)
medium_coeff_df.max()

et1000mf140_medium    0.953708
et2000_medium         0.936937
knn16_medium          0.934426
knn32_medium          0.934426
knn4_medium           0.849902
knn8_medium           0.894516
lgbm_medium           0.977196
loglrC03_medium       0.999508
loglr_medium          0.999508
lr4_medium            0.999656
lrl1C1_medium         0.990533
lrl2C3_medium         0.999656
nn_medium             0.946818
rf1000mf70_medium     0.971443
rf2000_medium         0.971443
xgb142_medium         0.977196
dtype: float64

In [47]:
lows=meta_train.filter(like='_low')
coeff_matrix={}
for features in lows.columns:
    temp_matrix={}
    for another_feat in lows.columns:
        temp_matrix[another_feat]=pearsonr(lows[features],lows[another_feat])[0]
    coeff_matrix[features]=temp_matrix
low_coeff_df = pd.DataFrame(coeff_matrix).replace(1.0,-1)
low_coeff_df.max()

ann_low            0.962808
et1000mf140_low    0.970532
et2000_low         0.958324
knn16_low          0.965450
knn32_low          0.965450
knn4_low           0.905308
knn8_low           0.940407
lgb145_low         0.998522
lgbm_low           0.998522
loglrC03_low       0.999602
loglr_low          0.999602
lr4_low            0.999688
lrl1C1_low         0.994035
lrl2C3_low         0.999688
rf1000mf70_low     0.980853
rf2000_low         0.980853
xgb142_low         0.990092
dtype: float64

In [48]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
for feature in meta_train:
    for model in chosen_model:
        if model in feature:
            print model,feature
            meta_features.append(feature)

knn4 knn4_high
knn4 knn4_medium
knn4 knn4_low
knn8 knn8_high
knn8 knn8_medium
knn8 knn8_low
knn16 knn16_high
knn16 knn16_medium
knn16 knn16_low
knn32 knn32_high
knn32 knn32_medium
knn32 knn32_low
xgb142 xgb142_high
xgb142 xgb142_medium
xgb142 xgb142_low
et1000mf140 et1000mf140_high
et1000mf140 et1000mf140_medium
et1000mf140 et1000mf140_low
loglrC03 loglrC03_high
loglrC03 loglrC03_medium
loglrC03 loglrC03_low
lrl2C3 lrl2C3_high
lrl2C3 lrl2C3_medium
lrl2C3 lrl2C3_low
rf1000mf70 rf1000mf70_high
rf1000mf70 rf1000mf70_medium
rf1000mf70 rf1000mf70_low
ann ann_high
ann ann_medium
ann ann_low
lgb145 lgb145_high
lgb145 lgb145_medium
lgb145 lgb145_low


In [14]:
KF=StratifiedKFold(5,shuffle=True,random_state = 66666)

In [18]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train[meta_features].iloc[dev_index,:].as_matrix(), meta_train[meta_features].iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,\
           early_stop = 20,num_rounds=780,eta = 0.1,max_depth=3,cv_dict = result_dict,verbose_eval=100)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

[0]	train-mlogloss:1.0206	test-mlogloss:1.02084
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.492589	test-mlogloss:0.501716
Stopping. Best iteration:
[108]	train-mlogloss:0.491308	test-mlogloss:0.501646

loss for the turn 1 is 0.501890412987
[0]	train-mlogloss:1.02084	test-mlogloss:1.02054
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.493364	test-mlogloss:0.497193
Stopping. Best iteration:
[173]	train-mlogloss:0.483334	test-mlogloss:0.496629

loss for the turn 2 is 0.496654262807
[0]	train-mlogloss:1.02044	test-mlogloss:1.02015
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.492491	test-mlogloss:0.500773
Stoppi

In [20]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

173    0.496629
dtype: float64


In [22]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train.iloc[dev_index,:].as_matrix(), meta_train.iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=list(meta_train.columns),\
           early_stop = 20,num_rounds=780,eta = 0.1,max_depth=3,cv_dict = result_dict,verbose_eval=100)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

[0]	train-mlogloss:1.0208	test-mlogloss:1.02103
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.490906	test-mlogloss:0.502085
Stopping. Best iteration:
[91]	train-mlogloss:0.492516	test-mlogloss:0.502012

loss for the turn 1 is 0.502116002189
[0]	train-mlogloss:1.02117	test-mlogloss:1.02099
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.492325	test-mlogloss:0.49735
Stopping. Best iteration:
[143]	train-mlogloss:0.485708	test-mlogloss:0.496824

loss for the turn 2 is 0.496885017369
[0]	train-mlogloss:1.02092	test-mlogloss:1.02049
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.491575	test-mlogloss:0.500391
Stopping

In [24]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

161    0.496841
dtype: float64


In [25]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train[meta_features].iloc[dev_index,:].as_matrix(), meta_train[meta_features].iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,\
           early_stop = 20,num_rounds=780,eta = 0.1,max_depth=4,cv_dict = result_dict,verbose_eval=100)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

[0]	train-mlogloss:1.01991	test-mlogloss:1.02029
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.480154	test-mlogloss:0.501342
Stopping. Best iteration:
[101]	train-mlogloss:0.479939	test-mlogloss:0.501299

loss for the turn 1 is 0.501445885033
[0]	train-mlogloss:1.02031	test-mlogloss:1.02011
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.481668	test-mlogloss:0.497235
Stopping. Best iteration:
[111]	train-mlogloss:0.478996	test-mlogloss:0.497042

loss for the turn 2 is 0.497388884625
[0]	train-mlogloss:1.01987	test-mlogloss:1.01976
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.480093	test-mlogloss:0.500882
Stopp

In [27]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

111    0.503164
dtype: float64


In [29]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train[meta_features].iloc[dev_index,:].as_matrix(), meta_train[meta_features].iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,\
           early_stop = 20,num_rounds=780,eta = 0.1,max_depth=5,cv_dict = result_dict,verbose_eval=100)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

[0]	train-mlogloss:1.01891	test-mlogloss:1.01945
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.454882	test-mlogloss:0.502141
Stopping. Best iteration:
[90]	train-mlogloss:0.459796	test-mlogloss:0.501752

loss for the turn 1 is 0.502394617457
[0]	train-mlogloss:1.01911	test-mlogloss:1.01961
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.455619	test-mlogloss:0.496718
Stopping. Best iteration:
[91]	train-mlogloss:0.459829	test-mlogloss:0.496498

loss for the turn 2 is 0.497082633031
[0]	train-mlogloss:1.01878	test-mlogloss:1.01899
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.454768	test-mlogloss:0.500512
Stoppin

In [30]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

110    0.497042
dtype: float64


In [32]:
ss_list  = [0.25,0.5,0.75]
csb_list = [0.25,0.5,0.75]
another_dict = {}

for ss in ss_list:
     for csb in csb_list:
        cv_scores = []
        cv_result = []

        i=0        
        for dev_index, val_index in KF.split(meta_train,meta_train_y): 
            result_dict = {}

            dev_X, val_X = meta_train[meta_features].iloc[dev_index,:].as_matrix(), meta_train[meta_features].iloc[val_index,:].as_matrix()
            dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
            preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,
               early_stop = 20,num_rounds=780,eta = 0.1,max_depth=3,cv_dict = result_dict,verbose_eval=100,
                                cb=csb,sb=ss)

            loss = log_loss(val_y, preds)
    
            cv_scores.append(loss)
            cv_result.append(result_dict)
            i+=1
            print 'loss for the turn '+str(i)+' is '+str(loss)
    
        print 'The mean of the cv_scores is:'
        print np.mean(cv_scores) 
        another_dict[(ss,csb)]=np.mean(cv_scores)

[0]	train-mlogloss:1.02077	test-mlogloss:1.02094
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.493798	test-mlogloss:0.502639
Stopping. Best iteration:
[97]	train-mlogloss:0.494143	test-mlogloss:0.50241

loss for the turn 1 is 0.502522365694
[0]	train-mlogloss:1.02091	test-mlogloss:1.02046
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.49512	test-mlogloss:0.497614
Stopping. Best iteration:
[174]	train-mlogloss:0.485616	test-mlogloss:0.496537

loss for the turn 2 is 0.497178686073
[0]	train-mlogloss:1.02067	test-mlogloss:1.02044
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.494544	test-mlogloss:0.501161
Stopping

In [33]:
mini = 10
best_ss = 0
best_csb =0
for ss in ss_list:
    for csb in csb_list:
        if another_dict[(ss,csb)] < mini:
            mini = another_dict[(ss,csb)]
            best_ss = ss
            best_csb = csb

In [34]:
print best_ss,best_csb

0.5 0.75


In [101]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train.iloc[dev_index,:].as_matrix(), meta_train.iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,\
           early_stop = None,num_rounds=2000,eta = 0.01,max_depth=3,cv_dict = result_dict,verbose_eval=100,
                        sb=0.5,cb=0.75)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

ValueError: feature_names must have the same length as data

In [39]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

1197    0.504237
dtype: float64


In [49]:
meta_features

['knn4_high',
 'knn4_medium',
 'knn4_low',
 'knn8_high',
 'knn8_medium',
 'knn8_low',
 'knn16_high',
 'knn16_medium',
 'knn16_low',
 'knn32_high',
 'knn32_medium',
 'knn32_low',
 'xgb142_high',
 'xgb142_medium',
 'xgb142_low',
 'et1000mf140_high',
 'et1000mf140_medium',
 'et1000mf140_low',
 'loglrC03_high',
 'loglrC03_medium',
 'loglrC03_low',
 'lrl2C3_high',
 'lrl2C3_medium',
 'lrl2C3_low',
 'rf1000mf70_high',
 'rf1000mf70_medium',
 'rf1000mf70_low',
 'ann_high',
 'ann_medium',
 'ann_low',
 'lgb145_high',
 'lgb145_medium',
 'lgb145_low']

In [50]:
train_X, test_X = meta_train[meta_features].as_matrix(), meta_test[meta_features].as_matrix()

preds, model = runXGB(train_X, meta_train_y, test_X,
num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100,
                     sb=0.5,cb=0.75)

out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
#out_df.to_json('stack-.json')
out_df["listing_id"] = test_listing.values
out_df.to_csv("stack-beta-0.01eta-3mdsb5cb7.csv", index=False)

In [42]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
#chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglr','lrl1C1','rf1000mf70','ann','lgbm']
for feature in meta_train:
    for model in chosen_model:
        if model in feature and 'low' not in feature:
            print model,feature
            meta_features.append(feature)

knn4 knn4_high
knn4 knn4_medium
knn8 knn8_high
knn8 knn8_medium
knn16 knn16_high
knn16 knn16_medium
knn32 knn32_high
knn32 knn32_medium
xgb142 xgb142_high
xgb142 xgb142_medium
et1000mf140 et1000mf140_high
et1000mf140 et1000mf140_medium
loglrC03 loglrC03_high
loglrC03 loglrC03_medium
lrl2C3 lrl2C3_high
lrl2C3 lrl2C3_medium
rf1000mf70 rf1000mf70_high
rf1000mf70 rf1000mf70_medium
ann ann_high
ann ann_medium
lgb145 lgb145_high
lgb145 lgb145_medium


In [43]:
cv_scores = []
cv_result = []

i=0        
for dev_index, val_index in KF.split(meta_train,meta_train_y): 
    result_dict = {}

    dev_X, val_X = meta_train[meta_features].iloc[dev_index,:].as_matrix(), meta_train[meta_features].iloc[val_index,:].as_matrix()
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]
    
    preds,model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=meta_features,\
           early_stop = 20,num_rounds=780,eta = 0.1,max_depth=3,cv_dict = result_dict,verbose_eval=100)

    loss = log_loss(val_y, preds)
    
    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    print 'loss for the turn '+str(i)+' is '+str(loss)
    
print 'The mean of the cv_scores is:'
print np.mean(cv_scores)

[0]	train-mlogloss:1.02146	test-mlogloss:1.02165
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.49262	test-mlogloss:0.502082
Stopping. Best iteration:
[146]	train-mlogloss:0.48598	test-mlogloss:0.50165

loss for the turn 1 is 0.501907494194
[0]	train-mlogloss:1.02133	test-mlogloss:1.02106
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.493241	test-mlogloss:0.497715
Stopping. Best iteration:
[162]	train-mlogloss:0.484375	test-mlogloss:0.497247

loss for the turn 2 is 0.497248538329
[0]	train-mlogloss:1.02136	test-mlogloss:1.021
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.492657	test-mlogloss:0.501183
Stopping. 

In [44]:
cvResult = CVstatistics(cv_result,'mlogloss')

meanTestError = cvResult.result.filter(like='test').mean(axis=1)

print meanTestError[meanTestError==np.min(meanTestError)]

181    0.497268
dtype: float64


In [45]:
np.sum(out_df,axis=0)

high          5.874475e+03
medium        1.707263e+04
low           5.171307e+04
listing_id    5.244067e+11
dtype: float32

In [51]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features_explow = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
#chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglr','lrl1C1','rf1000mf70','ann','lgbm']
for feature in meta_train:
    for model in chosen_model:
        if model in feature and 'low' not in feature:
            print model,feature
            meta_features.append(feature)

knn4 knn4_high
knn4 knn4_medium
knn8 knn8_high
knn8 knn8_medium
knn16 knn16_high
knn16 knn16_medium
knn32 knn32_high
knn32 knn32_medium
xgb142 xgb142_high
xgb142 xgb142_medium
et1000mf140 et1000mf140_high
et1000mf140 et1000mf140_medium
loglrC03 loglrC03_high
loglrC03 loglrC03_medium
lrl2C3 lrl2C3_high
lrl2C3 lrl2C3_medium
rf1000mf70 rf1000mf70_high
rf1000mf70 rf1000mf70_medium
ann ann_high
ann ann_medium
lgb145 lgb145_high
lgb145 lgb145_medium


In [52]:
train_X, test_X = meta_train[meta_features].as_matrix(), meta_test[meta_features].as_matrix()

preds, model = runXGB(train_X, meta_train_y, test_X,
num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

out_df_explow = pd.DataFrame(preds)
out_df_explow.columns = ["high", "medium", "low"]
#out_df.to_json('stack-.json')
out_df_explow["listing_id"] = test_listing.values
#out_df.to_csv("stack-beta-0.01eta-3mdsb5cb7.csv", index=False)

In [53]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
#chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglr','lrl1C1','rf1000mf70','ann','lgbm']
for feature in meta_train:
    for model in chosen_model:
        if model in feature and 'medium' not in feature:
            print model,feature
            meta_features.append(feature)

knn4 knn4_high
knn4 knn4_low
knn8 knn8_high
knn8 knn8_low
knn16 knn16_high
knn16 knn16_low
knn32 knn32_high
knn32 knn32_low
xgb142 xgb142_high
xgb142 xgb142_low
et1000mf140 et1000mf140_high
et1000mf140 et1000mf140_low
loglrC03 loglrC03_high
loglrC03 loglrC03_low
lrl2C3 lrl2C3_high
lrl2C3 lrl2C3_low
rf1000mf70 rf1000mf70_high
rf1000mf70 rf1000mf70_low
ann ann_high
ann ann_low
lgb145 lgb145_high
lgb145 lgb145_low


In [54]:
train_X, test_X = meta_train[meta_features].as_matrix(), meta_test[meta_features].as_matrix()

preds, model = runXGB(train_X, meta_train_y, test_X,
num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

out_df_expmed = pd.DataFrame(preds)
out_df_expmed.columns = ["high", "medium", "low"]
#out_df.to_json('stack-.json')
out_df_expmed["listing_id"] = test_listing.values
#out_df.to_csv("stack-beta-0.01eta-3mdsb5cb7.csv", index=False)

In [55]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
#chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglr','lrl1C1','rf1000mf70','ann','lgbm']
for feature in meta_train:
    for model in chosen_model:
        if model in feature and 'high' not in feature:
            print model,feature
            meta_features.append(feature)

knn4 knn4_medium
knn4 knn4_low
knn8 knn8_medium
knn8 knn8_low
knn16 knn16_medium
knn16 knn16_low
knn32 knn32_medium
knn32 knn32_low
xgb142 xgb142_medium
xgb142 xgb142_low
et1000mf140 et1000mf140_medium
et1000mf140 et1000mf140_low
loglrC03 loglrC03_medium
loglrC03 loglrC03_low
lrl2C3 lrl2C3_medium
lrl2C3 lrl2C3_low
rf1000mf70 rf1000mf70_medium
rf1000mf70 rf1000mf70_low
ann ann_medium
ann ann_low
lgb145 lgb145_medium
lgb145 lgb145_low


In [56]:
train_X, test_X = meta_train[meta_features].as_matrix(), meta_test[meta_features].as_matrix()

preds, model = runXGB(train_X, meta_train_y, test_X,
num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

out_df_exphi = pd.DataFrame(preds)
out_df_exphi.columns = ["high", "medium", "low"]
#out_df.to_json('stack-.json')
out_df_exphi["listing_id"] = test_listing.values
#out_df.to_csv("stack-beta-0.01eta-3mdsb5cb7.csv", index=False)

In [60]:
out_df_avg=pd.DataFrame()

In [59]:
pearsonr(out_df_exphi['low'],out_df_expmed['low'])

(0.99776381, 0.0)

In [61]:
out_df_avg['high']=out_df_exphi['high']+out_df_expmed['high']+out_df_explow['high']
out_df_avg['medium']=out_df_exphi['medium']+out_df_expmed['medium']+out_df_explow['medium']
out_df_avg['low']=out_df_exphi['low']+out_df_expmed['low']+out_df_explow['low']
out_df_avg["listing_id"] = test_listing.values
out_df_avg.to_csv("stack-beta-0.01eta-3mdsb5cb7-avg.csv", index=False)

In [62]:
interest_levels = ['low', 'medium', 'high']

tau = {
    'low': 0.69195995, 
    'medium': 0.23108864,
    'high': 0.07695141, 
}

def correct(df):
    y = df[interest_levels].mean()
    a = [tau[k] / y[k]  for k in interest_levels]
    print a

    def f(p):
        for k in range(len(interest_levels)):
            p[k] *= a[k]
        return p / p.sum()

    df_correct = df.copy()
    df_correct[interest_levels] = df_correct[interest_levels].apply(f, axis=1)

    y = df_correct[interest_levels].mean()
    a = [tau[k] / y[k]  for k in interest_levels]
    print a

    return df_correct

In [63]:
out_df = correct(out_df_avg)
out_df.to_csv("stack-beta-0.01eta-3mdsb5cb7-avg.csv", index=False)

[0.33296415556942704, 0.33679921014048969, 0.32652470184604393]
[1.0000620707379397, 1.0008657237176575, 0.99667484535127282]


In [65]:
#model_list=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl1C1','lrl2C3','rf1000mf70','nn','lgbm'
#           ,'lr4','loglr','et2000','rf2000','lgb145']
meta_features_explow = []
meta_features_expmed = []
meta_features_exphi = []
chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglrC03','lrl2C3','rf1000mf70','ann','lgb145']
#chosen_model=['knn4','knn8','knn16','knn32','xgb142','et1000mf140','loglr','lrl1C1','rf1000mf70','ann','lgbm']
for feature in meta_train:
    for model in chosen_model:
        if model in feature and 'low' not in feature:
            meta_features_explow.append(feature)
        if model in feature and 'medium' not in feature:
            meta_features_expmed.append(feature)
        if model in feature and 'high' not in feature:
            meta_features_exphi.append(feature)

In [78]:
TKF=StratifiedKFold(5,shuffle=True,random_state = 3212)
out_dfs=[]
for train_id,test_id in TKF.split(meta_train,meta_train_y): 
    
    train_X, test_X = meta_train[meta_features_explow].iloc[train_id].as_matrix(), meta_test[meta_features_explow].as_matrix()

    preds, model = runXGB(train_X, meta_train_y[train_id], test_X,
    num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

    out_df_explow = pd.DataFrame(preds)
    out_df_explow.columns = ["high", "medium", "low"]
    out_df_explow["listing_id"] = test_listing.values
    
    out_dfs.append(out_df_explow)
    
    train_X, test_X = meta_train[meta_features_expmed].iloc[train_id].as_matrix(), meta_test[meta_features_expmed].as_matrix()

    preds, model = runXGB(train_X, meta_train_y[train_id],test_X,
    num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

    out_df_expmed = pd.DataFrame(preds)
    out_df_expmed.columns = ["high", "medium", "low"]
    out_df_expmed["listing_id"] = test_listing.values
    
    out_dfs.append(out_df_expmed)
    
    train_X, test_X = meta_train[meta_features_exphi].iloc[train_id].as_matrix(), meta_test[meta_features_exphi].as_matrix()

    preds, model = runXGB(train_X, meta_train_y[train_id], test_X,
    num_rounds = 1200, eta = 0.01,max_depth = 3,verbose_eval=100
                     ,sb=0.5,cb=0.75)

    out_df_exphi = pd.DataFrame(preds)
    out_df_exphi.columns = ["high", "medium", "low"]
    out_df_exphi["listing_id"] = test_listing.values
    
    out_dfs.append(out_df_exphi)

In [81]:
len(out_dfs)

15

In [85]:
len([0 for i in range(len(out_dfs[0]))])

74659

In [93]:
out_df_avg2=pd.DataFrame()
out_df_avg2['high'] =[0 for i in range(len(out_dfs[0]))]
out_df_avg2['medium'] =[0 for i in range(len(out_dfs[0]))]
out_df_avg2['low'] =[0 for i in range(len(out_dfs[0]))]



In [95]:
for i in range(len(out_dfs)):
    out_df_avg2['high']+=out_dfs[0]['high']
    out_df_avg2['medium']+=out_dfs[0]['medium']
    out_df_avg2['low']+=out_dfs[0]['low']

In [96]:
out_df_avg2['high']=out_df_avg2['high']/15
out_df_avg2['medium']=out_df_avg2['medium']/15
out_df_avg2['low']=out_df_avg2['low']/15

In [98]:
out_df_avg2["listing_id"] = test_listing.values


In [99]:
out_df_avg2.to_csv("stack-beta-0.01eta-3mdsb5cb7-bagging.csv", index=False)