## 1 Import relevant libraries

In [35]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Custom code
from src.load_data import load_data
from src.custom_pipe import transform_pipeline, training_pipeline
from src.finetuning import grid_search_best_estimator
from src.predictions import generate_predictions

## 2 Load data

In [36]:
train_values, train_labels, test_values = load_data()

In [37]:
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


## 3 Preliminary analysis

In [38]:
train_values.describe()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
count,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,...,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0
mean,525675.5,13.900353,701.074685,6257.876148,2.129723,26.535029,8.018051,5.434365,0.088645,0.761935,...,0.064378,0.033626,0.008101,0.00094,0.000361,0.001071,0.000188,0.000146,8.8e-05,0.005119
std,304545.0,8.033617,412.710734,3646.369645,0.727665,73.565937,4.392231,1.918418,0.284231,0.4259,...,0.245426,0.180265,0.089638,0.030647,0.018989,0.032703,0.013711,0.012075,0.009394,0.071364
min,4.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,261190.0,7.0,350.0,3073.0,2.0,10.0,5.0,4.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,525757.0,12.0,702.0,6270.0,2.0,15.0,7.0,5.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,789762.0,21.0,1050.0,9412.0,2.0,30.0,9.0,6.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1052934.0,30.0,1427.0,12567.0,9.0,995.0,100.0,32.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
train_labels.groupby('damage_grade').sum()
# We see that there is a class imbalance in the labels. Depending on our desired final estimator, we may have to address this.

Unnamed: 0_level_0,building_id
damage_grade,Unnamed: 1_level_1
1,13243898715
2,77810839245
3,45936818526


## 4 Feature engineering

In [41]:
# Split features into those we want to encode, those we want to drop, and those we want to keep as they are

columns_to_encode = [
    'geo_level_1_id',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status'
    ]

columns_to_drop = [
    'building_id'
    ]

columns_to_keep_as_is = [
    'geo_level_2_id',
    'geo_level_3_id',
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'count_families',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other'
    ]

num_to_encode = list(columns_to_encode[0])
cat_to_encode = columns_to_encode[1:]

In [42]:
X = pd.concat([train_values[columns_to_encode],train_values[columns_to_keep_as_is]],axis=1)
y = train_labels['damage_grade']

## 5 Pipeline

We start with three common classifiers for a problem like this: ```Logistic Regression```, ```Decision Tree Classifier``` and ```Random Forest Classifier```.

In [43]:
estimators = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('RandomForestClassifier', RandomForestClassifier(n_jobs=-1))
]

The ```preprocessor``` transforms the previously defined columns into one-hot-encoded features.
The ```pipelines``` variables contains a dictionary with trained models. Scores are printed at the end of training.

In [24]:
preprocessor = transform_pipeline(numerical_columns_to_encode=num_to_encode, categorical_columns_to_encode=cat_to_encode)
pipelines = training_pipeline(estimators, preprocessor, X, y, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression: Accuracy: 0.57 (+/- 0.01)
DecisionTreeClassifier: Accuracy: 0.66 (+/- 0.00)
RandomForestClassifier: Accuracy: 0.72 (+/- 0.00)


## 6 Finetuning with grid search

In [25]:
param_grid = {
    'classifier__min_samples_split': [5, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__n_estimators': [10, 100, 500],
    }


In [26]:
best_estimator = grid_search_best_estimator(
    pipelines['RandomForestClassifier'],
    param_grid=param_grid,
    X=X,
    y=y,
    cv=2,
    scoring='f1_micro',
    verbose = 3
    )

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV 1/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=10;, score=0.705 total time=   0.9s
[CV 2/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=10;, score=0.707 total time=   1.0s
[CV 1/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=100;, score=0.721 total time=   5.2s
[CV 2/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=100;, score=0.722 total time=   5.4s
[CV 1/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200;, score=0.723 total time=  10.0s
[CV 2/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200;, score=0.724 total time=  10.1s
[CV 1/2] END classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimator

## 7 Generate predictions

In [27]:
generate_predictions(best_estimator, test_values, 'submission/new_predictions_alt')