In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [2]:
from mochi import runLGBM

## Loading the dataset

In [3]:
# Directory holding the preprocessed competition files. The trailing slash is
# required because file names are appended by plain string concatenation.
data_path = '/data/kaggleData/2sigma/'

# Preprocessed train / test frames.
train_df = pd.read_json(data_path + 'processed_train.json')
test_df = pd.read_json(data_path + 'processed_test.json')

# Lookup of feature groups used by the experiments below.
# NOTE(review): unpickling runs arbitrary code - only load a trusted file here.
feature_dict = pd.read_pickle(data_path + 'feature_set_dict.pkl')

In [4]:
# Show the names of the feature groups available in feature_dict.
list(feature_dict)

['manager_performance',
 'new_categoricals',
 'price_and_room_related',
 'cluseter_id',
 'basic_numerical',
 'created_time_derived',
 'basic_categorical',
 'spatial_sts',
 'unstructured_derived_numerical',
 'featured_derived',
 'manager_and_house_sts']

In [5]:
# Integer class code for each interest level.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}

# Encode the target column; a label missing from the map raises KeyError.
train_y = np.array(
    train_df['interest_level'].apply(lambda level: target_num_map[level])
)

# Shuffled 5-fold stratified splitter with a fixed seed, shared by every
# experiment below.
KF = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)

# Accumulators filled by the cross-validation loops (three distinct lists).
cv_scores, cv_result, models = [], [], []

## Raw features (including the mapped `features` field and counts derived from the unstructured fields)

In [6]:
# Feature groups (keys of feature_dict) that make up the baseline experiment.
basic_feature_set = [
    'basic_numerical',
    'created_time_derived',
    'basic_categorical',
    'unstructured_derived_numerical',
    'featured_derived',
]

# Flatten the selected groups into one list of column names, in group order.
basic_features = []
for key in basic_feature_set:
    basic_features += feature_dict[key]

In [7]:
# 5-fold stratified cross-validation of runLGBM on the baseline feature set
# (basic_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[basic_features].values
    val_X = val_set[basic_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=basic_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.555713	test's multi_logloss: 0.587134
[200]	train's multi_logloss: 0.516253	test's multi_logloss: 0.56759
[300]	train's multi_logloss: 0.489964	test's multi_logloss: 0.559788
[400]	train's multi_logloss: 0.469303	test's multi_logloss: 0.556091
[500]	train's multi_logloss: 0.451746	test's multi_logloss: 0.554299
[600]	train's multi_logloss: 0.436276	test's multi_logloss: 0.55356
[700]	train's multi_logloss: 0.421358	test's multi_logloss: 0.55321
Early stopping, best iteration is:
[686]	train's multi_logloss: 0.423435	test's multi_logloss: 0.55299
loss for the turn 1 is 0.548671599414
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.558507	test's multi_logloss: 0.576962
[200]	train's multi_logloss: 0.517453	test's multi_logloss: 0.554888
[300]	train's multi_logloss: 0.491879	test's multi_logloss: 0.546408
[400]	train's multi_logloss: 0.471588	test's multi_logloss: 0.54

## Adding price-related constructed features

In [11]:
# Baseline groups plus the price/room group; built as a new list so that
# basic_feature_set itself is left unchanged.
adding_price_set = basic_feature_set + ['price_and_room_related']

# Flatten the selected groups into one list of column names, in group order.
adding_price_features = []
for key in adding_price_set:
    adding_price_features += feature_dict[key]

In [14]:
# 5-fold stratified cross-validation of runLGBM after adding the price/room
# features (adding_price_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[adding_price_features].values
    val_X = val_set[adding_price_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=adding_price_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.545819	test's multi_logloss: 0.579317
[200]	train's multi_logloss: 0.508009	test's multi_logloss: 0.563076
[300]	train's multi_logloss: 0.482651	test's multi_logloss: 0.556591
[400]	train's multi_logloss: 0.463186	test's multi_logloss: 0.553315
[500]	train's multi_logloss: 0.445305	test's multi_logloss: 0.551408
Early stopping, best iteration is:
[521]	train's multi_logloss: 0.441695	test's multi_logloss: 0.551262
loss for the turn 1 is 0.547390404406
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.547808	test's multi_logloss: 0.569074
[200]	train's multi_logloss: 0.510387	test's multi_logloss: 0.550622
[300]	train's multi_logloss: 0.485281	test's multi_logloss: 0.542517
[400]	train's multi_logloss: 0.465629	test's multi_logloss: 0.538584
[500]	train's multi_logloss: 0.447593	test's multi_logloss: 0.535792
[600]	train's multi_logloss: 0.432283	test's multi_logloss: 

## Adding house type features

In [15]:
# Previous groups plus the new categorical group; built as a new list so that
# adding_price_set itself is left unchanged.
adding_house_set = adding_price_set + ['new_categoricals']

# Flatten the selected groups into one list of column names, in group order.
adding_house_features = []
for key in adding_house_set:
    adding_house_features += feature_dict[key]

In [16]:
# 5-fold stratified cross-validation of runLGBM after adding the house-type
# categorical features (adding_house_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[adding_house_features].values
    val_X = val_set[adding_house_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=adding_house_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.544778	test's multi_logloss: 0.578503
[200]	train's multi_logloss: 0.506399	test's multi_logloss: 0.561325
[300]	train's multi_logloss: 0.481593	test's multi_logloss: 0.55615
[400]	train's multi_logloss: 0.461233	test's multi_logloss: 0.553128
[500]	train's multi_logloss: 0.444054	test's multi_logloss: 0.551916
[600]	train's multi_logloss: 0.428112	test's multi_logloss: 0.551351
[700]	train's multi_logloss: 0.414177	test's multi_logloss: 0.551142
Early stopping, best iteration is:
[690]	train's multi_logloss: 0.41547	test's multi_logloss: 0.551041
loss for the turn 1 is 0.546892167444
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.547694	test's multi_logloss: 0.569577
[200]	train's multi_logloss: 0.508428	test's multi_logloss: 0.549079
[300]	train's multi_logloss: 0.483756	test's multi_logloss: 0.541801
[400]	train's multi_logloss: 0.464103	test's multi_logloss: 0.

## Adding manager-performance-related features

In [17]:
# Previous groups plus the manager-performance group; built as a new list so
# that adding_house_set itself is left unchanged.
adding_manager_set = adding_house_set + ['manager_performance']

# Flatten the selected groups into one list of column names, in group order.
adding_manager_features = []
for key in adding_manager_set:
    adding_manager_features += feature_dict[key]

In [19]:
# 5-fold stratified cross-validation of runLGBM after adding the
# manager-performance features (adding_manager_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[adding_manager_features].values
    val_X = val_set[adding_manager_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=adding_manager_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.517643	test's multi_logloss: 0.554626
[200]	train's multi_logloss: 0.479484	test's multi_logloss: 0.539618
[300]	train's multi_logloss: 0.454555	test's multi_logloss: 0.534346
[400]	train's multi_logloss: 0.43339	test's multi_logloss: 0.532769
[500]	train's multi_logloss: 0.41515	test's multi_logloss: 0.531557
[600]	train's multi_logloss: 0.398862	test's multi_logloss: 0.531195
Early stopping, best iteration is:
[568]	train's multi_logloss: 0.403846	test's multi_logloss: 0.530926
loss for the turn 1 is 0.527073671449
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.519886	test's multi_logloss: 0.54516
[200]	train's multi_logloss: 0.481293	test's multi_logloss: 0.528257
[300]	train's multi_logloss: 0.456097	test's multi_logloss: 0.522805
[400]	train's multi_logloss: 0.435068	test's multi_logloss: 0.520191
[500]	train's multi_logloss: 0.4162	test's multi_logloss: 0.518

## Adding spatial clustering features

In [20]:
# Previous groups plus the spatial cluster id group; built as a new list so
# that adding_manager_set itself is left unchanged.
# The key is spelled 'cluseter_id' because that is how it is stored in
# feature_dict (see the keys listed earlier in the notebook).
adding_spatial_set = adding_manager_set + ['cluseter_id']

# Flatten the selected groups into one list of column names, in group order.
adding_spatial_features = []
for key in adding_spatial_set:
    adding_spatial_features += feature_dict[key]

In [21]:
# 5-fold stratified cross-validation of runLGBM after adding the spatial
# clustering features (adding_spatial_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[adding_spatial_features].values
    val_X = val_set[adding_spatial_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=adding_spatial_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.516621	test's multi_logloss: 0.554834
[200]	train's multi_logloss: 0.478595	test's multi_logloss: 0.539672
[300]	train's multi_logloss: 0.453448	test's multi_logloss: 0.535209
[400]	train's multi_logloss: 0.43214	test's multi_logloss: 0.532748
[500]	train's multi_logloss: 0.41381	test's multi_logloss: 0.532801
Early stopping, best iteration is:
[480]	train's multi_logloss: 0.417503	test's multi_logloss: 0.532477
loss for the turn 1 is 0.527889795876
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.518996	test's multi_logloss: 0.54513
[200]	train's multi_logloss: 0.480224	test's multi_logloss: 0.528381
[300]	train's multi_logloss: 0.454318	test's multi_logloss: 0.52286
[400]	train's multi_logloss: 0.433385	test's multi_logloss: 0.52022
[500]	train's multi_logloss: 0.414455	test's multi_logloss: 0.518984
[600]	train's multi_logloss: 0.397886	test's multi_logloss: 0.518

## Adding some statistical features

In [22]:
# Previous groups plus the two "*_sts" groups; built as a new list so that
# adding_spatial_set itself is left unchanged.
adding_statistical_set = adding_spatial_set + ['spatial_sts', 'manager_and_house_sts']

# Flatten the selected groups into one list of column names, in group order.
adding_statistical_features = []
for key in adding_statistical_set:
    adding_statistical_features += feature_dict[key]


In [23]:
# 5-fold stratified cross-validation of runLGBM after adding the statistical
# features (adding_statistical_features).
#
# All three accumulators are rebuilt from scratch so that re-running this cell,
# or running another experiment, never mixes models from different runs.
cv_scores = []   # validation log-loss of each fold
cv_result = []   # per-fold dict handed to runLGBM as watch_dict
models = []      # per-fold model returned by runLGBM

for i, (dev_index, val_index) in enumerate(KF.split(train_df, train_y), 1):
    # Presumably filled by runLGBM with the evaluation history - confirm in mochi.
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dev_X = dev_set[adding_statistical_features].values
    val_X = val_set[adding_statistical_features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    preds, model = runLGBM(dev_X, dev_y, val_X, val_y,
                           feature_names=adding_statistical_features,
                           early_stop=64, num_rounds=4500, eta=0.1, max_depth=4,
                           watch_dict=result_dict, verbose=100)

    # Multi-class log-loss of this fold's validation predictions.
    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    cv_result.append(result_dict)
    models.append(model)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('loss for the turn '+str(i)+' is '+str(loss))

print('the mean loss for the validation is '+str(np.mean(cv_scores)))

Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.507995	test's multi_logloss: 0.547484
[200]	train's multi_logloss: 0.466921	test's multi_logloss: 0.531704
[300]	train's multi_logloss: 0.439782	test's multi_logloss: 0.526495
[400]	train's multi_logloss: 0.417351	test's multi_logloss: 0.525621
[500]	train's multi_logloss: 0.397054	test's multi_logloss: 0.525091
Early stopping, best iteration is:
[497]	train's multi_logloss: 0.397693	test's multi_logloss: 0.525087
loss for the turn 1 is 0.523948000718
Train until valid scores didn't improve in 64 rounds.
[100]	train's multi_logloss: 0.510864	test's multi_logloss: 0.539602
[200]	train's multi_logloss: 0.469369	test's multi_logloss: 0.522225
[300]	train's multi_logloss: 0.441	test's multi_logloss: 0.51619
[400]	train's multi_logloss: 0.418442	test's multi_logloss: 0.513982
[500]	train's multi_logloss: 0.398159	test's multi_logloss: 0.513015
Early stopping, best iteration is:
[524]	train's multi_logloss: 