## 1. Import packages

In [45]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.compare_estimators import compare_estimators
from src.encode_features import encode_features
from src.load_data import load_data

In [47]:
# Show the installed pandas version so the environment of this run is on record.
pd.__version__

'2.1.4'

## 2. Load data

In [33]:
# load_data() returns three objects, unpacked here under the names used in the
# rest of the notebook: training features, training labels, and test features.
train_values, train_labels, test_values = load_data()

In [34]:
# Preview the first rows of the training features as a quick sanity check.
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


## 3. Features

### 3.1 Impute missing values
Not necessary

### 3.2 Create new features
Not necessary

### 3.3 Encoding

In [35]:
# Run the project's encoding helper on the training features; the result is the
# feature table the models below are trained on.
train_values_encoded = encode_features(df=train_values)

## 4. Cross-validate multiple models

In [36]:
# Feature matrix: an independent copy of the encoded training data, so changes
# made to X later do not write back into train_values_encoded.
# NOTE(review): train_values contains a building_id column; confirm that
# encode_features drops it, otherwise an identifier is being used as a feature.
X = train_values_encoded.copy()

# Target vector: the 'damage_grade' column of the label table.
y = train_labels['damage_grade']

In [37]:
# Report the dimensions of the target and the feature matrix.
print(y.shape)
print(X.shape)

# Do not rely on eyeballing the two shapes: every feature row needs exactly one
# label, so fail loudly here rather than deep inside a model fit.
assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"

(260601,)
(260601, 69)


In [8]:
# Baseline candidates, all with default hyper-parameters. The tree-based models
# are randomised, so they get a fixed seed; without it the scores below change
# from run to run and the comparison cannot be reproduced.
estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42))
]

# Score every candidate on the same training data with the project helper.
compare_estimators(estimators, X, y)

Logistic Regression: Accuracy: 0.57 (+/- 0.00)
Decision Tree: Accuracy: 0.65 (+/- 0.01)
Random Forest: Accuracy: 0.72 (+/- 0.00)


## 5. Train the (best) model

### 5.1. Improve the model with grid search

In [38]:
# Hyper-parameter grid for the random forest (2 x 2 x 2 = 8 combinations)
param_grid = {
    'max_depth': [None, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4]
}

# Seed the forest so that repeated runs of the search produce the same scores
# and therefore the same winning combination
rf_classifier = RandomForestClassifier(random_state=42)

# Grid search with 3-fold cross-validation, ranked by micro-averaged F1
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='f1_micro')

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best mean cross-validated score found
print("Best Mean Cross-validated Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Mean Cross-validated Score: 0.7190494280528471


In [40]:
# Final model: the hyper-parameters the grid search reported as best
# (max_depth=None, min_samples_leaf=1, min_samples_split=10), trained on the
# full training set. The fixed seed makes the fitted forest, and with it the
# submission file, reproducible across runs.
forest = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=42
)
forest.fit(X, y)

## 6. Prepare test data
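The test data is still in its original, unencoded form, so it has to go through the same encoding step as the training data before the model can use it.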

In [41]:
# Apply the same encoding helper to the test features, called with the df=
# keyword exactly like the training call.
# NOTE(review): train and test are encoded independently; confirm that both
# encodings yield the same columns in the same order before predicting.
test_values_encoded = encode_features(df=test_values)

## 7. Make Predictions

In [42]:
# Predict a label for every row of the encoded test features with the fitted forest.
test_labels_prediction = forest.predict(test_values_encoded)

## 8. Bring predictions into the right format

In [43]:
# Build the submission table in one step: the building ids from the raw test
# data next to the predicted damage grade for each row.
submission = pd.DataFrame({
    'building_id': test_values.building_id,
    'damage_grade': test_labels_prediction,
})

# to_csv does not create missing folders, so make sure the output directory
# exists first (a fresh checkout may not contain it).
submission_path = Path('submission/our_new_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)