## 1. Import packages

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.catboost_with_split import cat_boost_classifier
from src.cross_validate import compare_estimators
from src.encode_features import encode_categorical_features
from src.load_data import load_data

In [2]:
# Allow up to 100 rows to be shown when a DataFrame/Series is displayed.
pd.set_option('display.max_rows', 100)

# Keep the version as the cell's final expression so the notebook actually
# displays it; as a non-final bare expression its value was discarded.
pd.__version__

## 2. Load data

In [3]:
# Load the three datasets returned by the project's load_data helper:
# training features, training labels and test features (in that order).
train_values, train_labels, test_values = load_data()

In [4]:
# Preview the first rows of the raw (not yet encoded) training features.
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


## 3. Features

### 3.1 Impute missing values
Not necessary

### 3.2 Create new features
Not necessary

### 3.3 Encoding

In [5]:
# Columns handed to encode_categorical_features as categorical.
# geo_level_1_id is integer-coded; the remaining entries hold
# single-letter category codes in the raw data.
non_numerical_columns = [
    "geo_level_1_id",
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

In [6]:
# Encode the categorical training columns with the project helper.
# NOTE(review): in the preview that follows, the letter-coded columns appear as
# boolean indicator columns while geo_level_1_id is still a plain integer —
# confirm how the helper treats integer-coded columns.
train_values_encoded = encode_categorical_features(df=train_values, non_numerical_columns=non_numerical_columns)

In [7]:
# Preview the first rows of the encoded training features.
train_values_encoded.head()

Unnamed: 0,geo_level_1_id,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,6,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
1,8,False,True,False,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
2,21,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
3,22,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0
4,11,False,False,True,False,False,True,False,False,True,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# One-hot encode the level-1 geographic id into indicator columns.
# The prefix must stay identical to the one used for the test data so that
# both frames end up with the same column names.
geo_1_encoded = pd.get_dummies(train_values_encoded['geo_level_1_id'], prefix='geo_1_')

# Swap the raw id column for its indicator columns; geo_level_2_id and
# geo_level_3_id are deliberately left in place as numeric features.
# Note: this cell rebinds train_values_encoded, so running it a second time
# raises KeyError because 'geo_level_1_id' has already been dropped.
train_values_encoded = pd.concat(
    [train_values_encoded.drop(columns=['geo_level_1_id']), geo_1_encoded],
    axis=1,
)

In [9]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
train_values_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 99 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   land_surface_condition_n                260601 non-null  bool 
 1   land_surface_condition_o                260601 non-null  bool 
 2   land_surface_condition_t                260601 non-null  bool 
 3   foundation_type_h                       260601 non-null  bool 
 4   foundation_type_i                       260601 non-null  bool 
 5   foundation_type_r                       260601 non-null  bool 
 6   foundation_type_u                       260601 non-null  bool 
 7   foundation_type_w                       260601 non-null  bool 
 8   roof_type_n                             260601 non-null  bool 
 9   roof_type_q                             260601 non-null  bool 
 10  roof_type_x                             260601 non-null  bool 
 11  

In [10]:
# Disabled diagnostic: correlation of every encoded column with damage_grade,
# sorted in descending order and limited to the first 100 rows.
#pd.DataFrame(pd.concat([train_values_encoded,train_labels],axis=1).corr()['damage_grade'].sort_values(ascending=False)).head(100)

### 3.4 Remove outliers
Tried removing outliers, but reverted the change because it made no difference.

### 3.5 Class imbalance

In [11]:
# Class imbalance is addressed by using stratified k-fold splits (StratifiedKFold) during cross-validation.

## 4. Cross validate across multiple models

In [12]:
# Feature matrix: every encoded column except the building_id identifier.
X = train_values_encoded.drop(columns=['building_id']).copy()

# Target vector: the damage_grade column of the training labels.
y = train_labels['damage_grade']

In [13]:
# Sanity check: target and feature matrix must have the same number of rows.
print(y.shape, X.shape, sep='\n')

(260601,)
(260601, 98)


In [None]:
# Candidate models to compare, as (display name, estimator) pairs.
# The tree-based estimators are seeded so that repeated runs of this cell
# produce the same models; without a seed each run of the comparison differs.
estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    #('Gradient Boosting Classifier', GradientBoostingClassifier()),
    #('AdaBoost', AdaBoostClassifier()),
    #('CatBoost', CatBoostClassifier())
]

# Evaluate every candidate on the full training set with the project helper;
# its return value is the cell's displayed output.
compare_estimators(estimators, X, y)

## 5. Train the (best) model

### 5.1. Improve the model with grid search

In [None]:
# Hyper-parameter grid for the Random Forest (3 * 3 * 4 = 36 combinations).
param_grid = {
    #'criterion':["gini", "entropy", "log_loss"],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [10, 100, 200, 500],
    #'max_depth': [None, 10, 20] Bad results
}

# Make sure the log directory exists *before* the long-running search, so the
# results are not lost to a FileNotFoundError when they are written afterwards.
log_path = Path('logs/logs.txt')
log_path.parent.mkdir(parents=True, exist_ok=True)

# Base estimator; seeded so that repeated searches score candidates identically.
rf_classifier = RandomForestClassifier(n_jobs=20, random_state=42)

# Grid search with 3-fold cross-validation, scored by micro-averaged F1.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='f1_micro', verbose=3)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best mean cross-validated score found
print("Best Mean Cross-validated Score:", grid_search.best_score_)

# Persist the outcome; the file is overwritten on every run.
with open(log_path, 'w') as f:
    f.write(f"Best Parameters: {grid_search.best_params_}, \nBest Mean Cross-validated Score: {grid_search.best_score_}")

In [40]:
# Final model trained on the full training set.
# NOTE(review): the hyper-parameters are hard-coded; presumably they are the
# best combination reported by the grid search — confirm against logs/logs.txt.
# random_state is fixed so that re-fitting reproduces the same model and
# therefore the same submission.
forest = RandomForestClassifier(
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=500,
    n_jobs=20,
    random_state=42,
)
forest.fit(X, y)

## 6. Prepare test data
The test data is still in its original (unencoded) form, so it needs the same encoding steps that were applied to the training data.

In [41]:
# Encode the test features with the same helper and column list that were
# used for the training features.
test_values_encoded = encode_categorical_features(
    df=test_values,
    non_numerical_columns=non_numerical_columns,
)

In [42]:
# One-hot encode the level-1 geographic id of the test set, using the same
# prefix as for the training set so that the column names line up.
geo_1_test_encoded = pd.get_dummies(test_values_encoded['geo_level_1_id'], prefix='geo_1_')

# Swap the raw id column for its indicator columns; geo_level_2_id and
# geo_level_3_id are deliberately left in place as numeric features.
# Note: this cell rebinds test_values_encoded, so running it a second time
# raises KeyError because 'geo_level_1_id' has already been dropped.
test_values_encoded = pd.concat(
    [test_values_encoded.drop(columns=['geo_level_1_id']), geo_1_test_encoded],
    axis=1,
)

## 7. Make Predictions

In [43]:
# Build the test feature matrix with exactly the columns (and column order)
# the forest was fitted on. The test dummies are generated independently of
# the training dummies, so a category that is absent from the test set would
# otherwise leave a column missing; such columns are added as all-zero
# indicators, and any column unknown to the model is dropped.
X_test = (
    test_values_encoded
    .drop(columns=['building_id'])
    .reindex(columns=X.columns, fill_value=0)
)

test_labels_prediction = forest.predict(X_test)

In [44]:
# Assemble the submission: one row per test building with its predicted
# damage grade. The index is taken from test_values.
submission = pd.DataFrame({
    'building_id': test_values['building_id'],
    'damage_grade': test_labels_prediction,
})

filename = 'submission_5_forest_n500'

# Create the output directory if needed; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
submission_path = Path('submission') / f'{filename}.csv'
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)