In [1]:
import os,sys
import re
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from mochi import CVstatistics

In [5]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=0, early_stop=20, num_rounds=10000, eta=0.1,
           max_depth=6, cv_dict=None, verbose_eval=True):
    """Fit a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : data and labels wrapped in the training ``xgb.DMatrix``.
    test_X : data wrapped in the prediction ``xgb.DMatrix``.
    test_y : optional labels for ``test_X``. When given, training is monitored
        on a (train, test) watchlist with early stopping; when ``None`` the
        model is trained for exactly ``num_rounds`` rounds with no watchlist.
    feature_names : forwarded to every ``xgb.DMatrix``.
    seed_val : value of the ``seed`` booster parameter.
    early_stop : ``early_stopping_rounds`` (only used when ``test_y`` is given).
    num_rounds : number of boosting rounds requested from ``xgb.train``.
    eta, max_depth : booster parameters of the same name.
    cv_dict : forwarded as ``evals_result`` (only used when ``test_y`` is given).
    verbose_eval : forwarded to ``xgb.train`` (only used when ``test_y`` is given).

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict`` on the test matrix
        and the trained booster.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': eta,
        'max_depth': max_depth,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.3,
        'seed': seed_val,
        'nthread': 4,
    }
    param_pairs = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    # No labels for the test set: plain fixed-length training, then predict.
    if test_y is None:
        dtest = xgb.DMatrix(test_X, feature_names=feature_names)
        booster = xgb.train(param_pairs, dtrain, num_rounds)
        return booster.predict(dtest), booster

    # Labelled test set: watch both sets and stop early on the last one listed.
    dtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
    eval_sets = [(dtrain, 'train'), (dtest, 'test')]
    booster = xgb.train(param_pairs, dtrain, num_rounds, eval_sets,
                        early_stopping_rounds=early_stop,
                        evals_result=cv_dict, verbose_eval=verbose_eval)
    # NOTE(review): predict() is called on the booster exactly as returned;
    # whether that corresponds to the best early-stopped round depends on the
    # installed xgboost version -- confirm before comparing scores.
    return booster.predict(dtest), booster

In [2]:
# Root directory of the stacking inputs; the loading cells read one
# sub-folder per base model from here, plus the y-5fold-out-*.pickle labels.
data_path = '/home/raku/kaggleData/2sigma/'

# Base models to stack. Each name is used both as a sub-folder of data_path
# and as the prefix of that model's meta-feature columns.
model_list = [
    'et2000',
    'knn4', 'knn8', 'knn16', 'knn32',
    'lr4',
    'xgb142',
    'rf2000',
]

In [3]:
# Build the level-2 (stacking) feature matrices.
# For every base model this reads
#   * its "*-5fold-out-<k>.pickle" files  -> rows of meta_train
#   * its "*-bulk-out.json" file           -> rows of meta_test
# and names the three probability columns <model>_high/_medium/_low.
meta_train_temp_list = []
meta_test_temp_list = []

# Raw strings with the dot escaped, so "." only matches a literal dot.
fold_out_pattern = re.compile(r'\S+-5fold-out-\d\.pickle')
bulk_out_pattern = re.compile(r'\S+-bulk-out\.json')

for model in model_list:
    model_dir = os.path.join(data_path, model)

    # Reset per model so a folder without a bulk file cannot silently reuse
    # the file name found for the previous model.
    fold_out_file = []
    bulk_out_file = None
    for filename in os.listdir(model_dir):
        if fold_out_pattern.match(filename) is not None:
            fold_out_file.append(filename)
        elif bulk_out_pattern.match(filename) is not None:
            bulk_out_file = filename

    if bulk_out_file is None:
        raise IOError('no *-bulk-out.json file found in ' + model_dir)
    if not fold_out_file:
        raise IOError('no *-5fold-out-<k>.pickle files found in ' + model_dir)

    # Sort so the folds are concatenated in fold-number order.
    fold_out_file = sorted(fold_out_file)

    # Test-set predictions of this model.
    json_data = pd.read_json(os.path.join(model_dir, bulk_out_file))
    temp_meta_test = json_data[['high', 'medium', 'low']].copy()
    temp_meta_test.columns = [model + '_high', model + '_medium', model + '_low']
    # NOTE(review): overwritten on every iteration, so only the last model's
    # listing ids survive the loop; this presumes every bulk file lists the
    # test rows in the same order -- confirm.
    test_listing = json_data['listing_id']

    # Per-fold predictions of this model on the training set.
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    pickle_data = []
    for pickle_file in fold_out_file:
        # Binary mode is required for pickles; "with" closes the handle even
        # when loading fails.
        with open(os.path.join(model_dir, pickle_file), 'rb') as fileObject:
            pickle_data.append(pd.DataFrame(pickle.load(fileObject)))
    temp_meta_train = pd.concat(pickle_data)
    temp_meta_train.columns = [model + '_high', model + '_medium', model + '_low']

    meta_train_temp_list.append(temp_meta_train)
    meta_test_temp_list.append(temp_meta_test)

# One block of three columns per model, side by side.
meta_train = pd.concat(meta_train_temp_list, axis=1)
meta_test = pd.concat(meta_test_temp_list, axis=1)

In [4]:
# Load the training labels, stored as one pickle per fold under data_path,
# and stack them in fold order into a single 1-D array.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
y_data = []
for fold in range(5):
    pickl_file = os.path.join(data_path, 'y-5fold-out-%d.pickle' % fold)
    # Binary mode is required for pickles; "with" closes the handle even when
    # loading fails.
    with open(pickl_file, 'rb') as fileObject:
        y_data.append(pd.DataFrame(pickle.load(fileObject)))
# Column 0 of the concatenated frame holds the label values.
meta_train_y = np.array(pd.concat(y_data)[0])

In [27]:
# Pairwise Pearson correlation between the base models' "high" probabilities.
highs = meta_train.filter(like='_high')
coeff_matrix = {}
for features in highs.columns:
    coeff_matrix[features] = {
        another_feat: pearsonr(highs[features], highs[another_feat])[0]
        for another_feat in highs.columns
    }
high_coeff_df = pd.DataFrame(coeff_matrix)
# Mask each model's correlation with itself by label rather than by matching
# the value 1.0: a self-correlation that rounds to 0.9999... is still masked,
# and a distinct pair that correlates at exactly 1.0 stays visible.
for features in highs.columns:
    high_coeff_df.loc[features, features] = -1
# Highest correlation of each model with any *other* model.
high_coeff_df.max()

et2000_high    0.937338
knn16_high     0.927173
knn32_high     0.927173
knn4_high      0.846329
knn8_high      0.890040
lr4_high       0.800894
rf2000_high    0.937338
xgb142_high    0.857448
dtype: float64

In [28]:
# Pairwise Pearson correlation between the base models' "medium" probabilities.
mediums = meta_train.filter(like='_medium')
coeff_matrix = {}
for features in mediums.columns:
    coeff_matrix[features] = {
        another_feat: pearsonr(mediums[features], mediums[another_feat])[0]
        for another_feat in mediums.columns
    }
medium_coeff_df = pd.DataFrame(coeff_matrix)
# Mask each model's correlation with itself by label rather than by matching
# the value 1.0: a self-correlation that rounds to 0.9999... is still masked,
# and a distinct pair that correlates at exactly 1.0 stays visible.
for features in mediums.columns:
    medium_coeff_df.loc[features, features] = -1
# Highest correlation of each model with any *other* model.
medium_coeff_df.max()

et2000_medium    0.927758
knn16_medium     0.934426
knn32_medium     0.934426
knn4_medium      0.849902
knn8_medium      0.894516
lr4_medium       0.861432
rf2000_medium    0.927758
xgb142_medium    0.862118
dtype: float64

In [29]:
# Pairwise Pearson correlation between the base models' "low" probabilities.
lows = meta_train.filter(like='_low')
coeff_matrix = {}
for features in lows.columns:
    coeff_matrix[features] = {
        another_feat: pearsonr(lows[features], lows[another_feat])[0]
        for another_feat in lows.columns
    }
low_coeff_df = pd.DataFrame(coeff_matrix)
# Mask each model's correlation with itself by label rather than by matching
# the value 1.0: a self-correlation that rounds to 0.9999... is still masked,
# and a distinct pair that correlates at exactly 1.0 stays visible.
for features in lows.columns:
    low_coeff_df.loc[features, features] = -1
# Highest correlation of each model with any *other* model.
low_coeff_df.max()

et2000_low    0.954456
knn16_low     0.965450
knn32_low     0.965450
knn4_low      0.905308
knn8_low      0.940407
lr4_low       0.903785
rf2000_low    0.954456
xgb142_low    0.904685
dtype: float64

In [86]:
# Cross-validate the level-2 XGBoost model on the stacked meta-features.
# cv_scores collects the validation log-loss of each fold; cv_result collects
# the per-round evaluation history that xgboost writes into result_dict.
cv_scores = []
cv_result = []

# NOTE(review): KF is not defined in any cell visible here. It must be a
# splitter exposing .split(X, y) (StratifiedKFold is imported at the top);
# define it in an earlier cell so the notebook survives Restart & Run All.
for i, (dev_index, val_index) in enumerate(KF.split(meta_train, meta_train_y), 1):
    result_dict = {}

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    dev_X = meta_train.iloc[dev_index, :].values
    val_X = meta_train.iloc[val_index, :].values
    dev_y, val_y = meta_train_y[dev_index], meta_train_y[val_index]

    preds, model = runXGB(dev_X, dev_y, val_X, val_y,
                          feature_names=list(meta_train.columns),
                          early_stop=20, num_rounds=780, eta=0.02, max_depth=3,
                          cv_dict=result_dict, verbose_eval=100)

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loss for the turn ' + str(i) + ' is ' + str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[0]	train-mlogloss:1.08276	test-mlogloss:1.08281
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.566998	test-mlogloss:0.567776
[200]	train-mlogloss:0.513483	test-mlogloss:0.515286
[300]	train-mlogloss:0.502975	test-mlogloss:0.506824
[400]	train-mlogloss:0.497905	test-mlogloss:0.504258
[500]	train-mlogloss:0.494523	test-mlogloss:0.503364
[600]	train-mlogloss:0.4917	test-mlogloss:0.502913
Stopping. Best iteration:
[583]	train-mlogloss:0.49214	test-mlogloss:0.502901

loss for the turn 1 is 0.502910325005
[0]	train-mlogloss:1.08281	test-mlogloss:1.08278
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.567949	test-mlogloss:0.565155
[200]	train-mlogloss:0.514432	test-mlogloss:0.511628
[300]	train-mlogloss:0.504091	test-mlogloss:0.502522
[400]	tr

In [87]:
# Summarise the evaluation histories gathered during cross-validation.
cvResult = CVstatistics(cv_result, 'mlogloss')

# Row-wise mean over every column whose label contains 'test'.
# NOTE(review): presumably one column per fold and one row per boosting
# round -- confirm against mochi.CVstatistics.
meanTestError = cvResult.result.filter(like='test').mean(axis=1)

# Show the row(s) where the averaged test error is lowest.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(meanTestError[meanTestError == np.min(meanTestError)])

676    0.498815
677    0.498815
dtype: float64


In [108]:
# Fit the level-2 model on all meta-features and write the submission file.
# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
train_X, test_X = meta_train.values, meta_test.values

# No test labels are passed, so runXGB trains for exactly num_rounds rounds
# without a watchlist (verbose_eval is not used on that path).
# NOTE(review): 670 rounds is presumably chosen from the CV minimum reported
# above (rounds 676/677) -- confirm.
preds, model = runXGB(train_X, meta_train_y, test_X,
                      num_rounds=670, eta=0.02, max_depth=3, verbose_eval=100)

# The three probability columns are labelled in the same high/medium/low
# order used for every base model's meta-features.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
# NOTE(review): test_listing comes from the last model read by the loading
# loop; this presumes meta_test rows follow the same order -- confirm.
out_df["listing_id"] = test_listing.values
out_df.to_csv("stack-alpah-0.02eta-3md.csv", index=False)

In [112]:
# Column totals of the submission frame (sanity check on the predicted class
# mass; the listing_id total carries no meaning).
out_df.sum(axis=0)

high          5.866707e+03
medium        1.708426e+04
low           5.170623e+04
listing_id    5.244067e+11
dtype: float32