In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold
import pickle
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb



In [30]:
from mochi import CVstatistics

In [19]:
#try lightgbm

def runLGBM(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=0, num_rounds=10000,watch_dict = None,max_bin=50000,
           max_depth=4,early_stop=64,verbose=True,eta=0.1):
    """Train a 3-class LightGBM model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and labels, wrapped in an lgb.Dataset.
    test_X : features to predict on; also used as the validation set when
        ``test_y`` is given.
    test_y : optional labels for ``test_X``. When provided, training is
        monitored on both the train and test sets (named 'train' and 'test').
    feature_names : forwarded to lgb.Dataset as ``feature_name``.
    seed_val : accepted for interface compatibility; the body does not use it.
    num_rounds : number of boosting rounds handed to lgb.train.
    watch_dict : optional dict forwarded as ``evals_result`` (validation mode only).
    max_bin : forwarded to lgb.Dataset.
    max_depth, eta : stored in the parameter dict as 'max_depth' / 'learning_rate'.
    early_stop : forwarded as ``early_stopping_rounds`` (validation mode only).
    verbose : forwarded as ``verbose_eval`` (validation mode only).

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict`` on ``test_X`` and
        the trained model.
    """
    param = {'learning_rate':eta,
             'max_depth':max_depth,
             'application':'multiclass',
             'num_class':3,
             'metric':'multi_logloss',
              'num_threads':4}

    lgbtrain = lgb.Dataset(train_X, label=train_y,max_bin=max_bin,feature_name=feature_names)

    if test_y is not None:
        lgbtest = lgb.Dataset(test_X, label=test_y,max_bin=max_bin,feature_name=feature_names)
        watchlist = [lgbtrain,lgbtest]
        watchlist_name=['train','test']
        model = lgb.train(param, lgbtrain, num_rounds, watchlist,watchlist_name,
                          early_stopping_rounds=early_stop,
                          evals_result = watch_dict,verbose_eval=verbose)
    else:
        model = lgb.train(param, lgbtrain, num_rounds)

    # When early stopping recorded a best iteration, predict with exactly that
    # many rounds. Older LightGBM releases would otherwise score with every
    # tree, including the rounds trained after the validation optimum.
    best_iter = getattr(model, 'best_iteration', None)
    if best_iter is not None and best_iter > 0:
        pred_test_y = model.predict(test_X, num_iteration=best_iter)
    else:
        pred_test_y = model.predict(test_X)
    return pred_test_y, model

In [4]:
# loading data
data_path = "/home/raku/kaggleData/2sigma/xgb142/"
store = "/home/raku/kaggleData/2sigma/lgbm/"
train_file = data_path + "xgb1.42-train.json"
test_file = data_path + "xgb1.42-test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)

# The feature list is stored as a pickle. Pickle streams are binary, so the
# file is opened in 'rb' mode, and the with-block closes it even if loading fails.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted source.
feature_file = data_path+'xgb142features.pickle'
with open(feature_file,'rb') as fileObject:
    features = pickle.load(fileObject)

(49352, 293)
(74659, 292)


In [5]:
# Integer code for each interest level.
target_num_map = dict(high=0, medium=1, low=2)

# Label vector: every 'interest_level' entry translated through the map above.
train_y = np.array(
    train_df['interest_level'].apply(lambda level: target_num_map[level])
)

# Five shuffled, stratified folds with a fixed seed; the fold object is built
# from the labels and iterated directly by the cross-validation cells.
KF = StratifiedKFold(train_y, n_folds=5, shuffle=True, random_state=2333)



In [26]:
# 5-fold CV at eta=0.02 for 3000 rounds, with early stopping disabled so the
# full per-round evaluation history of every fold is recorded.
cv_scores=[]
cv_result=[]
i=0
for dev_index, val_index in KF:
    # Fresh dict per fold; runLGBM forwards it to LightGBM as evals_result.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:]
    # keep only the selected feature columns
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,feature_names=features,verbose=100,eta=0.02,
                          early_stop=None,num_rounds=3000,watch_dict=result_dict)

    # Disabled: save the fold predictions for future use.
    # pickl_file = store+'rf2000-5fold-out-'+str(i)+'.pickle'
    # fileObject = open(pickl_file,'wb')
    # pickle.dump(preds,fileObject)
    # fileObject.close()

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[100]	train's multi_logloss: 0.636503	test's multi_logloss: 0.649845
[200]	train's multi_logloss: 0.56796	test's multi_logloss: 0.58866
[300]	train's multi_logloss: 0.540214	test's multi_logloss: 0.567894
[400]	train's multi_logloss: 0.52217	test's multi_logloss: 0.555502
[500]	train's multi_logloss: 0.508789	test's multi_logloss: 0.547945
[600]	train's multi_logloss: 0.497688	test's multi_logloss: 0.542587
[700]	train's multi_logloss: 0.488546	test's multi_logloss: 0.538502
[800]	train's multi_logloss: 0.480775	test's multi_logloss: 0.53532
[900]	train's multi_logloss: 0.473766	test's multi_logloss: 0.532694
[1000]	train's multi_logloss: 0.46725	test's multi_logloss: 0.530681
[1100]	train's multi_logloss: 0.461307	test's multi_logloss: 0.529177
[1200]	train's multi_logloss: 0.455817	test's multi_logloss: 0.528073
[1300]	train's multi_logloss: 0.450436	test's multi_logloss: 0.527115
[1400]	train's multi_logloss: 0.445314	test's multi_logloss: 0.526373
[1500]	train's multi_logloss: 0.44

In [44]:
# Summarise the per-fold evaluation histories for the 'multi_logloss' metric.
cvResult = CVstatistics(cv_result, 'multi_logloss')

# Row-wise mean over the columns whose name contains 'test'.
meanTestError = (
    cvResult.result
    .filter(like='test')
    .mean(axis=1)
)

# Show the row(s) at which that mean is smallest.
meanTestError.loc[meanTestError == meanTestError.min()]

2805    0.516073
dtype: float64

In [53]:
# 5-fold CV at eta=0.1 for 600 rounds, with early stopping disabled so the
# full per-round evaluation history of every fold is recorded.
cv_scores=[]
cv_result=[]
i=0
for dev_index, val_index in KF:
    # Fresh dict per fold; runLGBM forwards it to LightGBM as evals_result.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:]
    # keep only the selected feature columns
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,feature_names=features,verbose=100,eta=0.1,
                          early_stop=None,num_rounds=600,watch_dict=result_dict)

    # Disabled: save the fold predictions for future use.
    # pickl_file = store+'rf2000-5fold-out-'+str(i)+'.pickle'
    # fileObject = open(pickl_file,'wb')
    # pickle.dump(preds,fileObject)
    # fileObject.close()

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[100]	train's multi_logloss: 0.507801	test's multi_logloss: 0.547764
[200]	train's multi_logloss: 0.466074	test's multi_logloss: 0.530546
[300]	train's multi_logloss: 0.437966	test's multi_logloss: 0.525532
[400]	train's multi_logloss: 0.413967	test's multi_logloss: 0.523429
[500]	train's multi_logloss: 0.393274	test's multi_logloss: 0.522885
[600]	train's multi_logloss: 0.374306	test's multi_logloss: 0.523549
loss for the turn 1 is 0.522707556516
[100]	train's multi_logloss: 0.510451	test's multi_logloss: 0.539206
[200]	train's multi_logloss: 0.468151	test's multi_logloss: 0.52004
[300]	train's multi_logloss: 0.439215	test's multi_logloss: 0.513042
[400]	train's multi_logloss: 0.414608	test's multi_logloss: 0.510413
[500]	train's multi_logloss: 0.393972	test's multi_logloss: 0.509464
[600]	train's multi_logloss: 0.375713	test's multi_logloss: 0.509674
loss for the turn 2 is 0.509744063282
[100]	train's multi_logloss: 0.508182	test's multi_logloss: 0.544172
[200]	train's multi_logloss:

In [54]:
# Summarise the per-fold evaluation histories for the 'multi_logloss' metric.
cvResult = CVstatistics(cv_result, 'multi_logloss')

# Row-wise mean over the columns whose name contains 'test'.
meanTestError = (
    cvResult.result
    .filter(like='test')
    .mean(axis=1)
)

# Show the row(s) at which that mean is smallest.
meanTestError.loc[meanTestError == meanTestError.min()]

520    0.516471
dtype: float64

In [57]:
# 5-fold CV at eta=0.02 for 2800 rounds; each fold's validation predictions
# are pickled under `store`.
cv_scores=[]
cv_result=[]
i=0
for dev_index, val_index in KF:
    # Fresh dict per fold; runLGBM forwards it to LightGBM as evals_result.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:]
    # keep only the selected feature columns
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,feature_names=features,verbose=100,eta=0.02,
                          early_stop=None,num_rounds=2800,watch_dict=result_dict,max_depth=4)

    # Save the fold predictions for future use. The file name uses the
    # zero-based fold index, because i is only incremented further down.
    pickl_file = store+'lgbm-5fold-out-'+str(i)+'.pickle'
    with open(pickl_file,'wb') as fileObject:
        pickle.dump(preds,fileObject)

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    i+=1
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

[100]	train's multi_logloss: 0.636503	test's multi_logloss: 0.649845
[200]	train's multi_logloss: 0.56796	test's multi_logloss: 0.58866
[300]	train's multi_logloss: 0.540214	test's multi_logloss: 0.567894
[400]	train's multi_logloss: 0.52217	test's multi_logloss: 0.555502
[500]	train's multi_logloss: 0.508789	test's multi_logloss: 0.547945
[600]	train's multi_logloss: 0.497688	test's multi_logloss: 0.542587
[700]	train's multi_logloss: 0.488546	test's multi_logloss: 0.538502
[800]	train's multi_logloss: 0.480775	test's multi_logloss: 0.53532
[900]	train's multi_logloss: 0.473766	test's multi_logloss: 0.532694
[1000]	train's multi_logloss: 0.46725	test's multi_logloss: 0.530681
[1100]	train's multi_logloss: 0.461307	test's multi_logloss: 0.529177
[1200]	train's multi_logloss: 0.455817	test's multi_logloss: 0.528073
[1300]	train's multi_logloss: 0.450436	test's multi_logloss: 0.527115
[1400]	train's multi_logloss: 0.445314	test's multi_logloss: 0.526373
[1500]	train's multi_logloss: 0.44

In [59]:
# Fit on the full training set and predict the test set.
# .values replaces the deprecated DataFrame.as_matrix().
train_X, test_X = train_df[features].values, test_df[features].values

# No test_y is passed, so runLGBM trains without a validation set.
preds, model = runLGBM(train_X, train_y, test_X,
                       feature_names=features,
                       num_rounds = 2800, eta = 0.02,max_depth = 4,verbose=100)

out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
# The JSON is written before listing_id is attached, so the stored file holds
# only the three prediction columns.
out_df.to_json(store+'lgbm142-bulk-out.json')
out_df["listing_id"] = test_df.listing_id.values
#out_df.to_csv("xgb_beta1point42-0.02.csv", index=False)

In [None]:
#lgbm using 145
