## 1. Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.encode_features import encode_categorical_features
from src.load_data import load_data
from src.cross_validate import compare_estimators
from src.catboost_with_split import cat_boost_classifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier

In [2]:
pd.__version__
pd.set_option('display.max_rows',100)

## 2. Load data

In [3]:
train_values, train_labels, test_values = load_data()

In [4]:
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


## 3. Features

### 3.1 Impute missing values
Not necessary

### 3.2 Create new features
Not necessary

### 3.3 Encoding

In [5]:
non_numerical_columns = [
        "geo_level_1_id",
        "land_surface_condition",
        "foundation_type",
        "roof_type",
        "ground_floor_type",
        "other_floor_type",
        "position",
        "plan_configuration",
        "legal_ownership_status"]

In [6]:
train_values_encoded = encode_categorical_features(df=train_values, non_numerical_columns=non_numerical_columns)

In [7]:
train_values_encoded.head()

Unnamed: 0,geo_level_1_id,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,6,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
1,8,False,True,False,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
2,21,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
3,22,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
4,11,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0


In [9]:
geo_1_encoded = pd.get_dummies(train_values_encoded['geo_level_1_id'], prefix='geo_1_')

train_values_encoded = pd.concat(
    [train_values_encoded.drop(
        [
            'geo_level_1_id',
            #'geo_level_2_id',
            #'geo_level_3_id'
        ],
        axis=1
    ),
     geo_1_encoded],
    axis=1
)

In [10]:
train_values_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 99 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   land_surface_condition_n                260601 non-null  bool 
 1   land_surface_condition_o                260601 non-null  bool 
 2   land_surface_condition_t                260601 non-null  bool 
 3   foundation_type_h                       260601 non-null  bool 
 4   foundation_type_i                       260601 non-null  bool 
 5   foundation_type_r                       260601 non-null  bool 
 6   foundation_type_u                       260601 non-null  bool 
 7   foundation_type_w                       260601 non-null  bool 
 8   roof_type_n                             260601 non-null  bool 
 9   roof_type_q                             260601 non-null  bool 
 10  roof_type_x                             260601 non-null  bool 
 11  

In [11]:
#pd.DataFrame(pd.concat([train_values_encoded,train_labels],axis=1).corr()['damage_grade'].sort_values(ascending=False)).head(100)

### 3.4 Remove outliers
Tried it, then went back when it didn't make a difference

### 3.5 Class imbalance

In [12]:
# implemented stratifiedkfold

## 4. Cross validate across multiple models

In [13]:
y = train_labels['damage_grade']
X = train_values_encoded.drop('building_id',axis=1).copy()

In [14]:
print(y.shape)
print(X.shape)

(260601,)
(260601, 98)


In [16]:
estimators = [
    #('Logistic Regression', LogisticRegression()),
    #('Decision Tree', DecisionTreeClassifier()),
    #('Random Forest', RandomForestClassifier()),
    #('Gradient Boosting Classifier', GradientBoostingClassifier()),
    #('AdaBoost', AdaBoostClassifier()),
    ('CatBoost', CatBoostClassifier())
]

compare_estimators(estimators, X, y)

Learning rate set to 0.103895
Learning rate set to 0.103895
Learning rate set to 0.103895
Learning rate set to 0.103895
Learning rate set to 0.103895
0:	learn: 1.0433839	total: 137ms	remaining: 2m 17s
0:	learn: 1.0434267	total: 153ms	remaining: 2m 32s
0:	learn: 1.0433224	total: 165ms	remaining: 2m 45s
0:	learn: 1.0431325	total: 220ms	remaining: 3m 39s
1:	learn: 0.9993034	total: 256ms	remaining: 2m 7s
0:	learn: 1.0432983	total: 244ms	remaining: 4m 3s
1:	learn: 1.0004506	total: 270ms	remaining: 2m 14s
1:	learn: 0.9991666	total: 310ms	remaining: 2m 34s
1:	learn: 0.9992576	total: 315ms	remaining: 2m 37s
2:	learn: 0.9618134	total: 365ms	remaining: 2m 1s
2:	learn: 0.9616196	total: 378ms	remaining: 2m 5s
1:	learn: 0.9991169	total: 371ms	remaining: 3m 4s
2:	learn: 0.9611977	total: 412ms	remaining: 2m 16s
3:	learn: 0.9296270	total: 453ms	remaining: 1m 52s
2:	learn: 0.9612685	total: 455ms	remaining: 2m 31s
3:	learn: 0.9299881	total: 518ms	remaining: 2m 8s
3:	learn: 0.9298901	total: 537ms	remaini

## 5. Train the (best) model

### 5.1. Improve the model with grid search

In [25]:
# Define parameters grid for Random Forest
param_grid = {
    #'criterion':["gini", "entropy", "log_loss"],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [10, 100, 200, 500],
    #'max_depth': [None, 10, 20] Bad results
}

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_jobs=20)

# Initialize Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='f1_micro', verbose=3)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best mean cross-validated score found
print("Best Mean Cross-validated Score:", grid_search.best_score_)

with open('logs/logs.txt','w') as f:
    f.write(f"Best Parameters: {grid_search.best_params_}, \nBest Mean Cross-validated Score: {grid_search.best_score_}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.708 total time=   0.9s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.709 total time=   0.9s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.710 total time=   0.9s
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.725 total time=   6.8s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.727 total time=   6.8s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.727 total time=   7.9s
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.725 total time=  13.4s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.728 total time=  13.3s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.728 total time=  15.4s
[CV 1

In [54]:
forest = RandomForestClassifier(
    min_samples_leaf=1, min_samples_split=10, n_estimators=500, n_jobs=20
)
forest.fit(X,y)

In [17]:
catBoost = CatBoostClassifier()
catBoost.fit(X,y)

Learning rate set to 0.105084
0:	learn: 1.0427044	total: 75.2ms	remaining: 1m 15s
1:	learn: 0.9981770	total: 102ms	remaining: 50.7s
2:	learn: 0.9595102	total: 126ms	remaining: 42s
3:	learn: 0.9278611	total: 151ms	remaining: 37.6s
4:	learn: 0.9012000	total: 174ms	remaining: 34.7s
5:	learn: 0.8788587	total: 200ms	remaining: 33.1s
6:	learn: 0.8600483	total: 226ms	remaining: 32s
7:	learn: 0.8437876	total: 247ms	remaining: 30.7s
8:	learn: 0.8295454	total: 273ms	remaining: 30.1s
9:	learn: 0.8172678	total: 295ms	remaining: 29.2s
10:	learn: 0.8066130	total: 317ms	remaining: 28.5s
11:	learn: 0.7976551	total: 345ms	remaining: 28.4s
12:	learn: 0.7895200	total: 366ms	remaining: 27.8s
13:	learn: 0.7812731	total: 389ms	remaining: 27.4s
14:	learn: 0.7744402	total: 412ms	remaining: 27.1s
15:	learn: 0.7678726	total: 437ms	remaining: 26.9s
16:	learn: 0.7621100	total: 463ms	remaining: 26.8s
17:	learn: 0.7577341	total: 486ms	remaining: 26.5s
18:	learn: 0.7540398	total: 513ms	remaining: 26.5s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7b9077c26f60>

## 6. Prepare test data
Because the test data is stil in the original form

In [19]:
test_values_encoded = encode_categorical_features(test_values, non_numerical_columns=non_numerical_columns)

In [20]:
geo_1_test_encoded = pd.get_dummies(test_values_encoded['geo_level_1_id'], prefix='geo_1_')

test_values_encoded = pd.concat(
    [test_values_encoded.drop(
        [
            'geo_level_1_id',
            #'geo_level_2_id',
            #'geo_level_3_id'
        ],
        axis=1
    ),
     geo_1_test_encoded],
    axis=1
)

## 7. Make Predictions

In [21]:
test_labels_prediction = catBoost.predict(test_values_encoded.drop('building_id',axis=1))

In [24]:
submission = pd.DataFrame()
submission['building_id'] = test_values.building_id
submission['damage_grade'] = test_labels_prediction

filename = 'submission_4_catBoost_n500'
submission.to_csv(f'submission/{filename}.csv',index=False)

In [68]:
pd.concat([test_values,pd.DataFrame(test_labels_prediction,columns=["damage_grade"])],axis=1).groupby('damage_grade').sum()

Unnamed: 0_level_0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
damage_grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2828932794,98219,3417629,34042579,10010,63940,59766,29290,tnttntttnttttttttttotnttntttttttttttttttnntttt...,rrrruiiiiiwiiurwhwwrwrruiwrruuuriiuwiriiwuwrhr...,...,125,645,317,50,12,16,3,5,0,49
2,30943780758,775134,40985954,363796562,124466,1637550,468921,317122,ttttntttttttttttnntttttttttttotttttottttttottt...,rrrrrwrrrrrrurrrrruwrrrrrrrrrrrrwrrrrurirrrrri...,...,4314,2060,376,35,20,49,6,5,3,327
3,11974397028,333087,16762619,146107302,50826,604870,167465,125811,tttnttnttnttttttotntnttttnnttttttttttttttttntt...,rrrrrrrrrrrrhrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrhrr...,...,1129,237,23,3,0,22,0,0,1,50


## 8 Experimentation time

In [10]:
train_values_encoded.columns

Index(['land_surface_condition_n', 'land_surface_condition_o',
       'land_surface_condition_t', 'foundation_type_h', 'foundation_type_i',
       'foundation_type_r', 'foundation_type_u', 'foundation_type_w',
       'roof_type_n', 'roof_type_q', 'roof_type_x', 'ground_floor_type_f',
       'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x',
       'ground_floor_type_z', 'other_floor_type_j', 'other_floor_type_q',
       'other_floor_type_s', 'other_floor_type_x', 'position_j', 'position_o',
       'position_s', 'position_t', 'plan_configuration_a',
       'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
       'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o',
       'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',
       'legal_ownership_status_a', 'legal_ownership_status_r',
       'legal_ownership_status_v', 'legal_ownership_status_w', 'building_id',
       'geo_level_1_id', 'geo_level_2_id', 'geo_l

In [15]:
train_values_without_secondary_use = train_values_encoded[[
    'land_surface_condition_n',
    'land_surface_condition_o',
    'land_surface_condition_t',
    'foundation_type_h',
    'foundation_type_i',
    'foundation_type_r',
    'foundation_type_u',
    'foundation_type_w',
    'roof_type_n',
    'roof_type_q',
    'roof_type_x',
    'ground_floor_type_f',
    'ground_floor_type_m',
    'ground_floor_type_v',
    'ground_floor_type_x',
    'ground_floor_type_z',
    'other_floor_type_j',
    'other_floor_type_q',
    'other_floor_type_s',
    'other_floor_type_x',
    'position_j',
    'position_o',
    'position_s',
    'position_t',
    'plan_configuration_a',
    'plan_configuration_c',
    'plan_configuration_d',
    'plan_configuration_f',
    'plan_configuration_m',
    'plan_configuration_n',
    'plan_configuration_o',
    'plan_configuration_q',
    'plan_configuration_s',
    'plan_configuration_u',
    'legal_ownership_status_a',
    'legal_ownership_status_r',
    'legal_ownership_status_v',
    'legal_ownership_status_w',
    'geo_level_1_id',
    'geo_level_2_id',
    #'geo_level_3_id',
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'count_families'
    ]]

In [16]:
estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    #('Gradient Boosting Classifier', GradientBoostingClassifier()),
    #('AdaBoost', AdaBoostClassifier()),
    #('CatBoost', CatBoostClassifier())
]

compare_estimators(estimators, train_values_without_secondary_use, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression: Accuracy: 0.57 (+/- 0.00)
Decision Tree: Accuracy: 0.65 (+/- 0.01)
