In [1]:
import numpy as np
import pandas as pd
import time

#helpful libraries
from sklearn.metrics import f1_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold

#base models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [2]:
cv = 5

In [3]:
data = pd.read_csv('data/train_values.csv')
label = pd.read_csv('data/train_labels.csv')
test_values = pd.read_csv('data/test_values.csv')

to_drop = ['geo_level_2_id', 'geo_level_3_id']

to_enc = ['geo_level_1_id', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',\
          'plan_configuration', 'legal_ownership_status', 'land_surface_condition', 'position']

num_col = ['age', 'area_percentage', 'height_percentage', 'count_families', 'count_floors_pre_eq']

In [4]:
full_data = data.append(test_values).set_index(keys = 'building_id')
full_data.drop(columns=to_drop, inplace=True)
full_data = pd.get_dummies(full_data, prefix=to_enc, columns=to_enc)

In [5]:
train_building_id = data['building_id']
test_building_id = test_values['building_id']

data = full_data.loc[train_building_id]
test_values = full_data.loc[test_building_id]

del(full_data) # save memory

In [6]:
mlp = MLPClassifier(solver='sgd', activation='tanh', hidden_layer_sizes=(96, 40, 20, 10, 5), max_iter=300)
rf = RandomForestClassifier(n_estimators=44, max_depth=11, max_features=0.8, \
                               max_leaf_nodes=42, min_samples_split=10, bootstrap=False)
logreg = LogisticRegression(C=3, max_iter=300)
neigh = KNeighborsClassifier(n_neighbors=30)

meta = MLPClassifier(solver='sgd', activation='logistic', hidden_layer_sizes=(10,5), max_iter=300)

In [7]:
building_id = label['building_id'].copy()
label = label.set_index('building_id')
df = data.join(label, on='building_id')

In [24]:
kf = KFold(n_splits=cv)
i = 1
train_meta = pd.DataFrame(columns=['mlp', 'rf', 'logreg', 'neigh'], index=building_id)

for train_idx, test_idx in kf.split(df):
    
    start = time.time()
    
    print(i, ":")
    train = df.iloc[train_idx].copy()
    test = df.iloc[test_idx].copy()
    
    scaler = RobustScaler()
    train[num_col] = scaler.fit_transform(train[num_col])
    test[num_col] = scaler.transform(test[num_col])
    
    mlp.fit(train.iloc[:, :-1], train.iloc[:,-1])
    print("MLP Done")
    rf.fit(train.iloc[:, :-1], train.iloc[:,-1])
    print("RF Done")
    logreg.fit(train.iloc[:, :-1], train.iloc[:,-1])
    print("LR Done")
    neigh.fit(train.iloc[:, :-1], train.iloc[:,-1])
    print("KNN Done")
    
    mlp_pred = mlp.predict(test.iloc[:,:-1]).reshape(-1,1)
    rf_pred = rf.predict(test.iloc[:,:-1]).reshape(-1,1)
    logreg_pred = logreg.predict(test.iloc[:,:-1]).reshape(-1,1)
    neigh_pred = neigh.predict(test.iloc[:,:-1]).reshape(-1,1)
    
    print('MLP F1: ', f1_score(label.iloc[test_idx], mlp_pred, average='micro'))
    print('RF F1: ', f1_score(label.iloc[test_idx], rf_pred, average='micro'))
    print('LR F1: ', f1_score(label.iloc[test_idx], logreg_pred, average='micro'))
    print('KNN F1: ', f1_score(label.iloc[test_idx], neigh_pred, average='micro'))
    
    print(i, ' done, took ', (time.time()-start)/60, ' min')
    print('**********************')
    i += 1
    
    train_meta.iloc[test_idx] = np.concatenate((mlp_pred, rf_pred, logreg_pred, neigh_pred), axis=1)

1 :




MLP Done
RF Done




LR Done
KNN Done
MLP F1:  0.6923696782486906
RF F1:  0.6732027397785921
LR F1:  0.6688091172464073
KNN F1:  0.6847527867845974
1  done, took  26.391796652475993  min
**********************
2 :




MLP Done
RF Done




LR Done
KNN Done
MLP F1:  0.6913852647735994
RF F1:  0.6710475825019186
LR F1:  0.6669992325402916
KNN F1:  0.6819071373752879
2  done, took  28.442806621392567  min
**********************
3 :




MLP Done
RF Done




LR Done
KNN Done
MLP F1:  0.6957789716039908
RF F1:  0.6731389102072142
LR F1:  0.6682463545663853
KNN F1:  0.6854182655410591
3  done, took  26.845238240559897  min
**********************
4 :




MLP Done
RF Done




LR Done
KNN Done
MLP F1:  0.6936684574059861
RF F1:  0.6730046047582502
LR F1:  0.6702033768227168
KNN F1:  0.6880851880276285
4  done, took  32.99758932987849  min
**********************
5 :




MLP Done
RF Done




LR Done
KNN Done
MLP F1:  0.6910974673829624
RF F1:  0.6734842670759785
LR F1:  0.6708940905602456
KNN F1:  0.6855141980046048
5  done, took  31.915700682004292  min
**********************


In [28]:
train_meta.to_csv('train_meta.csv')
train_meta

Unnamed: 0_level_0,mlp,rf,logreg,neigh
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
802906,2,2,2,3
28830,3,3,3,3
94947,3,3,3,3
590882,2,2,2,2
201944,3,2,3,3
...,...,...,...,...
688636,2,2,2,2
669485,3,3,3,3
602512,3,3,3,3
151409,2,2,2,2


In [34]:
df_meta = train_meta.join(label, on='building_id')
meta.fit(df_meta.iloc[:,0:4], df_meta.iloc[:,-1])

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
#Now retrain the base models and predict with the provided submit values

submit = test_values.copy()
X = df.copy()

scaler = RobustScaler()
X[num_col] = scaler.fit_transform(X[num_col])
submit[num_col] = scaler.transform(submit[num_col])

mlp.fit(X.iloc[:, :-1], X.iloc[:,-1])
print("MLP Done")
rf.fit(X.iloc[:, :-1], X.iloc[:,-1])
print("RF Done")
logreg.fit(X.iloc[:, :-1], X.iloc[:,-1])
print("LR Done")
neigh.fit(X.iloc[:, :-1], X.iloc[:,-1])
print("KNN Done")

In [49]:
mlp_pred = mlp.predict(submit).reshape(-1,1)
rf_pred = rf.predict(submit).reshape(-1,1)
logreg_pred = logreg.predict(submit).reshape(-1,1)
neigh_pred = neigh.predict(submit).reshape(-1,1)

In [50]:
to_submit = pd.DataFrame(columns=['mlp', 'rf', 'logreg', 'neigh'], index=submit.index)
to_submit.iloc[:] = np.concatenate((mlp_pred, rf_pred, logreg_pred, neigh_pred), axis=1)

In [53]:
final_pred = meta.predict(to_submit)

In [54]:
to_excel = pd.DataFrame(columns=['damage_grade'], index=submit.index)
to_excel['damage_grade'] = final_pred

In [56]:
to_excel.to_csv('submission.csv')

<h2>Try one-hot encode the train_meta data</h2>

In [64]:
train_meta_hot = pd.get_dummies(df_meta, columns=['mlp', 'rf', 'logreg', 'neigh'], prefix=['mlp', 'rf', 'logreg', 'neigh'])
meta.fit(train_meta_hot.iloc[:,1:], train_meta_hot.iloc[:,0])

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [66]:
submit_hot = pd.get_dummies(to_submit, columns=['mlp', 'rf', 'logreg', 'neigh'], prefix=['mlp', 'rf', 'logreg', 'neigh'])
final_pred = meta.predict(submit_hot)

In [68]:
to_excel = pd.DataFrame(columns=['damage_grade'], index=submit.index)
to_excel['damage_grade'] = final_pred
to_excel.to_csv('submission.csv')

<p> Not much difference: 0.6913, 0.6911. One-hot-encoding does not work. Overall performance increases slightly. Shows that stacking works, but more need to be done.</p>