In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold
import pickle
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb





In [2]:
from mochi import CVstatistics

In [3]:
# LightGBM multiclass training helper

def runLGBM(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=0, num_rounds=10000,watch_dict = None,max_bin=50000,
           max_depth=4,early_stop=64,verbose=True,eta=0.1):
    """Train a 3-class LightGBM model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is also
        used as a validation set (named ``'test'``) next to the training set
        (named ``'train'``), and early stopping / evaluation logging are enabled.
    feature_names : forwarded to ``lgb.Dataset`` as ``feature_name``.
    seed_val : accepted for interface compatibility; not used by this function.
    num_rounds : number of boosting rounds passed to ``lgb.train``.
    watch_dict : dict forwarded as ``evals_result`` (only when ``test_y`` is given).
    max_bin : forwarded to ``lgb.Dataset``.
    max_depth, eta : set as ``max_depth`` and ``learning_rate`` in the params.
    early_stop : forwarded as ``early_stopping_rounds`` (only when ``test_y``
        is given); ``None`` disables early stopping.
    verbose : forwarded as ``verbose_eval`` (only when ``test_y`` is given).

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict(test_X)`` and the
        trained booster.
    """
    param = {'learning_rate': eta,
             'max_depth': max_depth,
             'application': 'multiclass',
             'num_class': 3,
             'metric': 'multi_logloss',
             'num_threads': 4}

    lgbtrain = lgb.Dataset(train_X, label=train_y, max_bin=max_bin,
                           feature_name=feature_names)

    if test_y is None:
        # No labels for test_X: plain fixed-length training, no validation set.
        model = lgb.train(param, lgbtrain, num_rounds)
        return model.predict(test_X), model

    lgbtest = lgb.Dataset(test_X, label=test_y, max_bin=max_bin,
                          feature_name=feature_names)
    model = lgb.train(param, lgbtrain, num_rounds,
                      valid_sets=[lgbtrain, lgbtest],
                      valid_names=['train', 'test'],
                      early_stopping_rounds=early_stop,
                      evals_result=watch_dict,
                      verbose_eval=verbose)

    # When early stopping was requested and the booster recorded a best round,
    # predict with exactly that many trees. Some LightGBM releases default to
    # using every trained tree, which would include the rounds after the best one.
    best_iteration = getattr(model, 'best_iteration', None)
    if early_stop and best_iteration is not None and best_iteration > 0:
        pred_test_y = model.predict(test_X, num_iteration=best_iteration)
    else:
        pred_test_y = model.predict(test_X)
    return pred_test_y, model

In [4]:
# Loading data: the train/test frames and the pickled list of feature columns
# all live under data_path; `store` is where this notebook writes its outputs.
data_path = "/home/raku/kaggleData/2sigma/xgb145/"
store = "/home/raku/kaggleData/2sigma/lgb145/"
train_file = data_path + "xgb1.45-train.json"
test_file = data_path + "xgb1.45-test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)

feature_file = data_path+'xgb145features.pickle'
# Pickles are binary data: open in 'rb' and let the context manager close the
# handle even if unpickling fails.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(feature_file, 'rb') as fileObject:
    features = pickle.load(fileObject)

(49352, 297)
(74659, 296)


In [5]:
# Integer class id for each interest level.
target_num_map = dict(high=0, medium=1, low=2)

# Encode the training labels with the mapping above.
train_y = np.array(
    train_df['interest_level'].apply(lambda level: target_num_map[level])
)

# Shuffled, stratified 5-fold split of the training labels with a fixed seed.
KF = StratifiedKFold(train_y, n_folds=5, shuffle=True, random_state=2333)



In [6]:
# Cross-validation pass with a fixed 3500 rounds and early stopping disabled.
# cv_scores collects the final validation log-loss of each fold; cv_result
# collects each fold's dict, which runLGBM forwards to lgb.train as evals_result.
cv_scores = []
cv_result = []
for fold, (dev_index, val_index) in enumerate(KF, 1):
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
    # filter the features
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y, feature_names=features,
                           verbose=100, eta=0.02, early_stop=None,
                           num_rounds=3500, watch_dict=result_dict)

    # Out-of-fold predictions are deliberately not saved in this pass.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    print('loss for the turn '+str(fold)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[100]	train's multi_logloss: 0.636527	test's multi_logloss: 0.649748
[200]	train's multi_logloss: 0.567904	test's multi_logloss: 0.588765
[300]	train's multi_logloss: 0.539923	test's multi_logloss: 0.567937
[400]	train's multi_logloss: 0.521996	test's multi_logloss: 0.555618
[500]	train's multi_logloss: 0.508262	test's multi_logloss: 0.54795
[600]	train's multi_logloss: 0.497234	test's multi_logloss: 0.542569
[700]	train's multi_logloss: 0.488024	test's multi_logloss: 0.538455
[800]	train's multi_logloss: 0.480249	test's multi_logloss: 0.535246
[900]	train's multi_logloss: 0.473323	test's multi_logloss: 0.532662
[1000]	train's multi_logloss: 0.46691	test's multi_logloss: 0.530731
[1100]	train's multi_logloss: 0.461034	test's multi_logloss: 0.529325
[1200]	train's multi_logloss: 0.455492	test's multi_logloss: 0.528104
[1300]	train's multi_logloss: 0.450171	test's multi_logloss: 0.527149
[1400]	train's multi_logloss: 0.444904	test's multi_logloss: 0.526491
[1500]	train's multi_logloss: 0

In [7]:
# Summarise the per-fold evaluation histories collected in cv_result.
# NOTE(review): CVstatistics comes from the external `mochi` module; its
# `.result` is presumably a DataFrame with one column per fold/dataset - confirm.
cvResult = CVstatistics(cv_result, 'multi_logloss')

# Average every column whose name contains 'test' across axis 1.
test_columns = cvResult.result.filter(like='test')
meanTestError = test_columns.mean(axis=1)

# Display the entry (or entries) where the averaged test error is lowest.
meanTestError[meanTestError == np.min(meanTestError)]

2576    0.516129
dtype: float64

In [8]:
# Cross-validation pass with a fixed 2500 rounds and early stopping disabled.
# Each fold's validation predictions are pickled under `store`.
cv_scores = []
cv_result = []
for fold, (dev_index, val_index) in enumerate(KF):
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
    # filter the features
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y, feature_names=features,
                           verbose=100, eta=0.02, early_stop=None,
                           num_rounds=2500, watch_dict=result_dict, max_depth=4)

    # Save the out-of-fold predictions for future use; file names are 0-based.
    # The context manager closes the file even if pickling fails.
    pickl_file = store+'lgbm145-5fold-out-'+str(fold)+'.pickle'
    with open(pickl_file, 'wb') as fileObject:
        pickle.dump(preds, fileObject)

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    # The printed turn number is 1-based.
    print('loss for the turn '+str(fold + 1)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[100]	train's multi_logloss: 0.636527	test's multi_logloss: 0.649748
[200]	train's multi_logloss: 0.567904	test's multi_logloss: 0.588765
[300]	train's multi_logloss: 0.539923	test's multi_logloss: 0.567937
[400]	train's multi_logloss: 0.521996	test's multi_logloss: 0.555618
[500]	train's multi_logloss: 0.508262	test's multi_logloss: 0.54795
[600]	train's multi_logloss: 0.497234	test's multi_logloss: 0.542569
[700]	train's multi_logloss: 0.488024	test's multi_logloss: 0.538455
[800]	train's multi_logloss: 0.480249	test's multi_logloss: 0.535246
[900]	train's multi_logloss: 0.473323	test's multi_logloss: 0.532662
[1000]	train's multi_logloss: 0.46691	test's multi_logloss: 0.530731
[1100]	train's multi_logloss: 0.461034	test's multi_logloss: 0.529325
[1200]	train's multi_logloss: 0.455492	test's multi_logloss: 0.528104
[1300]	train's multi_logloss: 0.450171	test's multi_logloss: 0.527149
[1400]	train's multi_logloss: 0.444904	test's multi_logloss: 0.526491
[1500]	train's multi_logloss: 0

In [10]:
# Fit on the full training set and predict class probabilities for the test set.
# `.values` replaces `.as_matrix()`, which newer pandas no longer provides.
train_X, test_X = train_df[features].values, test_df[features].values

# No test_y is passed, so runLGBM trains for exactly num_rounds without a
# validation set; `verbose` has no effect on that code path.
preds, model = runLGBM(train_X, train_y, test_X,
                       feature_names=features,
                       num_rounds=2500, eta=0.02, max_depth=4, verbose=100)

# Column order follows target_num_map: 0=high, 1=medium, 2=low.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df.to_json(store+'lgbm145-bulk-out.json')
# NOTE(review): listing_id is attached after to_json, so the saved JSON holds
# only the three probability columns. Confirm this is intended.
out_df["listing_id"] = test_df.listing_id.values
#out_df.to_csv("xgb_beta1point42-0.02.csv", index=False)

In [None]:
#lgbm using 145
