In [26]:
#get all necessary train and test data from the data preprocessing file
%run ./Preprocess_keep_features_except_geo_2_3.ipynb
%run ./Preprocess.ipynb

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import common

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## Training a MLPClassifier

### Baseline model using only geo_level_1_id and removed features, UNSAMPLED

In [11]:
clf = MLPClassifier(solver='sgd', max_iter=100, hidden_layer_sizes=(90,10), activation='tanh', early_stopping=True)
clf.fit(train_x, train_y)
y_pred = clf.predict(test_x)
print("Model accuracy on train data: {:.2f}%".format(clf.score(train_x, train_y)*100))
print("Model accuracy on test data: {:.2f}%".format(clf.score(test_x, test_y)*100))
print("\n\n\t\tclassification_report on test data\n")
print(classification_report(test_y, y_pred))
print("Confusion matrix\n")
print(confusion_matrix(test_y, y_pred))
print()
print("F1-Micro: ", f1_score(test_y, y_pred, average='micro'))

Model accuracy on train data: 66.84%
Model accuracy on test data: 66.73%


		classification_report on test data

              precision    recall  f1-score   support

           1       0.59      0.34      0.43      5041
           2       0.67      0.81      0.74     29640
           3       0.67      0.52      0.58     17440

    accuracy                           0.67     52121
   macro avg       0.64      0.56      0.58     52121
weighted avg       0.66      0.67      0.66     52121

Confusion matrix

[[ 1713  3241    87]
 [ 1164 24022  4454]
 [   51  8342  9047]]

F1-Micro:  0.6673317856526161


### Baseline model using only geo_level_1_id and removed features, OVERSAMPLED

In [6]:
clf = MLPClassifier(solver='sgd', max_iter=100, hidden_layer_sizes=(90,10), activation='tanh', early_stopping=True)
clf.fit(train_x_over, train_y_over)
y_pred_over = clf.predict(test_x_over)
print("Model accuracy on train data: {:.2f}%".format(clf.score(train_x_over, train_y_over)*100))
print("Model accuracy on test data: {:.2f}%".format(clf.score(test_x_over, test_y_over)*100))

y_pred = clf.predict(test_x_over)
print("\n\n\t\tclassification_report on test data\n")
print(classification_report(test_y_over, y_pred))
print("Confusion matrix\n")
print(confusion_matrix(test_y_over, y_pred))
print()
print("F1-Micro: ", f1_score(test_y_over, y_pred, average='micro'))
print("Done")

Model accuracy on train data: 66.50%
Model accuracy on test data: 59.32%


		classification_report on test data

              precision    recall  f1-score   support

           1       0.35      0.79      0.49      5005
           2       0.74      0.50      0.59     29641
           3       0.59      0.70      0.64     17475

    accuracy                           0.59     52121
   macro avg       0.56      0.66      0.57     52121
weighted avg       0.65      0.59      0.60     52121

Confusion matrix

[[ 3966   898   141]
 [ 6469 14748  8424]
 [  888  4384 12203]]

F1-Micro:  0.5931774140941272
Done


### Baseline model using only geo_level_1_id and all features, UNSAMPLED

In [9]:
clf = MLPClassifier(solver='sgd', max_iter=100, hidden_layer_sizes=(90,10), activation='tanh', early_stopping=True)
clf.fit(train_x_keep, train_y_keep)
y_pred_over = clf.predict(test_x_keep)
print("Model accuracy on train data: {:.2f}%".format(clf.score(train_x_keep, train_y_keep)*100))
print("Model accuracy on test data: {:.2f}%".format(clf.score(test_x_keep, test_y_keep)*100))

y_pred = clf.predict(test_x_keep)
print("\n\n\t\tclassification_report on test data\n")
print(classification_report(test_y_keep, y_pred))
print("Confusion matrix\n")
print(confusion_matrix(test_y_keep, y_pred))
print()
print("F1-Micro: ", f1_score(test_y_keep, y_pred, average='micro'))
print("Done")

Model accuracy on train data: 68.13%
Model accuracy on test data: 67.64%


		classification_report on test data

              precision    recall  f1-score   support

           1       0.62      0.35      0.45      5031
           2       0.68      0.83      0.75     29730
           3       0.68      0.51      0.58     17359

    accuracy                           0.68     52120
   macro avg       0.66      0.56      0.59     52120
weighted avg       0.67      0.68      0.66     52120

Confusion matrix

[[ 1752  3208    71]
 [ 1012 24638  4080]
 [   67  8430  8862]]

F1-Micro:  0.6763622409823484
Done


### Conclusion:
<p> The baseline model using only geo_level_1_id and all features without unsampled data is the best model among the three. We will optimise this model by selecting the hyperparameters.</p>

## Optimising the MLPClassifier
### Using GridSearch to find the optimal hyperparameters

<p>Find the best hidden layer sizes, number of hidden layers, and alpha.</p>
<p>As it takes a long time to train the models for the GridSearch, the cross-validation is done 2-fold only. </p>

### Training with all the features and only one-hot encoded geo_level_1_id

In [8]:
layer = [(96,40), (96,20), (48,25), (96,40,40), (96,40,20), (48,20,10), \
        (96,40,20,10), (96,40,20,10,5), (40,20,20), (20,20,20)]

hyperparameters = dict(alpha= 10**(np.arange(-6,2,1.0)), hidden_layer_sizes=layer)
mlp = MLPClassifier(solver='sgd', max_iter=100, activation='tanh', early_stopping=True)
clf = GridSearchCV(mlp, hyperparameters, cv=2, scoring = 'f1_micro')
best_model = clf.fit(train_x_keep,train_y_keep)
print('Best Alpha:', best_model.best_estimator_.get_params()['alpha'])
print('Best Layer:', best_model.best_estimator_.get_params()['hidden_layer_sizes'])
print("Model accuracy on train data: {:.2f}%".format(best_model.score(train_x_keep, train_y_keep)*100))
print("Model accuracy on test data: {:.2f}%".format(best_model.score(test_x_keep, test_y_keep)*100))

y_pred = best_model.predict(test_x_keep)
print("\n\n\t\tclassification_report on test data\n")
print(classification_report(test_y_keep, y_pred))
print("Confusion matrix\n")
print(confusion_matrix(test_y_keep, y_pred))
print()
print("F1-Micro: ", f1_score(test_y_keep, y_pred, average='micro'))
print("Done")

Best Alpha: 0.01
Best Layer: (96, 40, 20, 10, 5)
Model accuracy on train data: 68.11%
Model accuracy on test data: 68.09%


		classification_report on test data

              precision    recall  f1-score   support

           1       0.60      0.38      0.47      4982
           2       0.70      0.79      0.74     29839
           3       0.66      0.58      0.62     17300

    accuracy                           0.68     52121
   macro avg       0.65      0.58      0.61     52121
weighted avg       0.68      0.68      0.67     52121

Confusion matrix

[[ 1917  2982    83]
 [ 1220 23525  5094]
 [   79  7173 10048]]

F1-Micro:  0.6809155618656587
Done


In [10]:
clf = MLPClassifier(solver='sgd', alpha= 0.01, hidden_layer_sizes= (96, 40, 20, 10, 5), max_iter=5000, activation='tanh')
clf.fit(train_x_keep, train_y_keep)

MLPClassifier(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(96, 40, 20, 10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=5000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [28]:
test_pred = clf.predict(test_values_keep)

In [35]:
#stores trained optimised model
pickle.dump(clf, open('model/mlp.pickle', 'wb'))

In [29]:
df = pd.DataFrame({'building_id': test_building_id, 'damage_grade': np.int64(test_pred)})

In [30]:
df.to_csv('data/submission.csv', index=False)

## Validation Results:

### Baseline, features removed, unsampled: 0.667
### Baseline, features removed, oversampled: 0.593

### Baseline, features not removed, unsampled: 0.676

### Optimised, features not removed, unsampled: 0.681


## Test Results: 0.689