In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold
import pickle
from mochi import *



In [2]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC

In [3]:
# Directory that holds the preprocessed train/test JSON files and all
# artefacts written by later cells.
data_path = "/data/kaggleData/2sigma/"

# Resolve both input locations first, then load each file into a DataFrame.
train_file = "".join([data_path, "processed_train.json"])
test_file = "".join([data_path, "processed_test.json"])
train_df, test_df = pd.read_json(train_file), pd.read_json(test_file)

# Quick sanity check: print (rows, columns) of each frame.
print(train_df.shape)
print(test_df.shape)



(49352, 292)
(74659, 291)


In [4]:
# Load the pickled mapping of feature-set name -> collection of column names
# and flatten it into one list of model input columns.
# NOTE(review): the same column listed in two feature sets would appear twice
# in `features` -- confirm the sets are disjoint.
feature_dict = pd.read_pickle(data_path+'feature_set_dict.pkl')

features = []
for feature_set, set_columns in feature_dict.items():
    # Append this set's columns in the dict's own iteration order.
    features += set_columns

In [5]:
# Integer class label assigned to each interest level.
target_num_map = {'high':0, 'medium':1, 'low':2}

# Encode the target column; a level missing from the mapping raises KeyError.
encoded_levels = train_df['interest_level'].apply(lambda level: target_num_map[level])
train_y = np.array(encoded_levels)

# Stratified 5-fold splitter over the encoded labels, shuffled with a fixed
# seed so that every model in this notebook is evaluated on the same folds.
KF = StratifiedKFold(train_y, 5, shuffle=True, random_state=2333)

# Replace missing values with the sentinel -1 in both frames.
train_df, test_df = train_df.fillna(-1), test_df.fillna(-1)

In [None]:
# 5-fold cross-validation of an extra-trees classifier (1000 trees, 140
# candidate features per split). For every fold the validation-set class
# probabilities are pickled under `store`, and the per-fold and mean
# multi-class log loss are printed.
# NOTE(review): `store` is assumed to be an existing directory -- confirm.
store = data_path+'et1000mf140/'
cv_scores = []

# enumerate() supplies the 0-based fold number used in the pickle file name.
for fold, (dev_index, val_index) in enumerate(KF):
    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
    # Keep only the selected feature columns. `.values` yields the same
    # ndarray as the deprecated `.as_matrix()`.
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    et = ETC(n_estimators=1000, random_state=0, max_features=140)
    et.fit(dev_X, dev_y)
    preds = et.predict_proba(val_X)

    # Save the out-of-fold predictions for future use. The `with` block
    # closes the file even if pickling raises.
    pickl_file = store+'et1000mf140-5fold-out-'+str(fold)+'.pickle'
    with open(pickl_file, 'wb') as fileObject:
        pickle.dump(preds, fileObject)

    loss = log_loss(val_y, preds)
    cv_scores.append(loss)
    # Folds are reported 1-based in the log line.
    print('loss for the turn '+str(fold + 1)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

loss for the turn 3 is 0.586290364947
loss for the turn 4 is 0.578498577962


In [None]:
# Refit the extra-trees model on every training row and score the test set.
train_X = train_df[features].values
test_X = test_df[features].values

et = ETC(n_estimators=1000, max_features=140, random_state=0)
et.fit(train_X, train_y)
preds = et.predict_proba(test_X)

# One probability column per class, named in the order of the encoded labels
# (high=0, medium=1, low=2), plus the listing id of each test row.
# The output goes to the directory held in `store`, which an earlier cell sets.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_json(store+'et1000mf140-bulk-out.json')

In [None]:
# Scratch cell: prints a fixed marker string (no effect on the analysis).
print('test')

In [None]:
# 5-fold cross-validation of a random-forest classifier (1000 trees, 70
# candidate features per split). For every fold the validation-set class
# probabilities are pickled under `store`, and the per-fold and mean
# multi-class log loss are printed.
# NOTE(review): `store` is assumed to be an existing directory -- confirm.
store = data_path+'rf1000mf70/'
cv_scores = []

# enumerate() supplies the 0-based fold number used in the pickle file name.
for fold, (dev_index, val_index) in enumerate(KF):
    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
    # Keep only the selected feature columns. `.values` yields the same
    # ndarray as the deprecated `.as_matrix()`.
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    rf = RFC(n_estimators=1000, random_state=0, max_features=70)
    rf.fit(dev_X, dev_y)
    preds = rf.predict_proba(val_X)

    # Save the out-of-fold predictions for future use. The `with` block
    # closes the file even if pickling raises.
    pickl_file = store+'rf1000mf70-5fold-out-'+str(fold)+'.pickle'
    with open(pickl_file, 'wb') as fileObject:
        pickle.dump(preds, fileObject)

    loss = log_loss(val_y, preds)
    cv_scores.append(loss)
    # Folds are reported 1-based in the log line.
    print('loss for the turn '+str(fold + 1)+' is '+str(loss))

print('The mean of the cv_scores is:')
print(np.mean(cv_scores))

loss for the turn 1 is 0.568396878458


In [None]:
# Refit the random-forest model on every training row, score the test set and
# write the class probabilities, keyed by listing id, to JSON under `store`.
# `.values` yields the same ndarray as the deprecated `.as_matrix()`.
train_X, test_X = train_df[features].values, test_df[features].values

rf = RFC(n_estimators=1000, random_state=0, max_features=70)
rf.fit(train_X, train_y)
preds = rf.predict_proba(test_X)

# One probability column per class, named in the order of the encoded labels
# (high=0, medium=1, low=2).
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
# Attach the listing ids BEFORE serialising; writing the file first would
# produce a JSON without `listing_id`, leaving its rows unidentifiable.
out_df["listing_id"] = test_df.listing_id.values
out_df.to_json(store+'rf1000mf70-bulk-out.json')