## 1. Import packages

In [21]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.cross_validate import compare_estimators
from src.encode_features import encode_features
from src.load_data import load_data

In [22]:
# Display the installed pandas version so the environment used for this run is recorded in the output.
pd.__version__

'2.1.4'

## 2. Load data

In [23]:
# Load the three tables through the project helper `load_data`: training features,
# training labels (the `damage_grade` target is read from it later) and the test
# features that are predicted on at the end of the notebook.
train_values, train_labels, test_values = load_data()

In [24]:
# Preview the first rows of the raw training features (identifiers, numeric and
# single-letter categorical columns, plus binary `has_*` flags).
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


## 3. Features

### 3.1 Impute missing values
Not necessary

### 3.2 Create new features
Not necessary

### 3.3 Encoding

In [25]:
# Encode the raw training features with the project helper `encode_features`.
# The encoding scheme itself lives in src/encode_features.py and is not visible here.
train_values_encoded = encode_features(df=train_values)

### 3.4 Remove outliers
Tried removing outliers, but reverted the change because it made no difference.

### 3.5 Class imbalance

In [26]:
# Class imbalance: implemented StratifiedKFold for cross-validation; no code is needed in this cell.

## 4. Cross validate across multiple models

In [27]:
# Feature matrix: an independent copy of the encoded training features, so later
# changes to X cannot leak back into `train_values_encoded`.
X = train_values_encoded.copy()

# Target vector: the damage grade column of the training labels.
y = train_labels['damage_grade']

In [28]:
# Sanity check: target and feature matrix must have the same number of rows.
# Printed one per line, target shape first.
print(y.shape, X.shape, sep='\n')

(260601,)
(260601, 69)


In [None]:
# Candidate models to compare, as (display name, estimator) pairs.
# The tree-based / ensemble estimators are stochastic, so they get a fixed
# random_state: re-running the notebook then reproduces the same comparison
# instead of slightly different accuracies on every run.
# LogisticRegression is left at its defaults.
estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
]

# Cross-validate every candidate on the full training set using the project
# helper; it reports one "Accuracy: mean (+/- spread)" line per model.
compare_estimators(estimators, X, y)

Logistic Regression: Accuracy: 0.57 (+/- 0.00)
Decision Tree: Accuracy: 0.65 (+/- 0.01)
Random Forest: Accuracy: 0.72 (+/- 0.00)


## 5. Train the (best) model

### 5.1. Improve the model with grid search

In [39]:
# Hyper-parameter grid for the Random Forest: 3 * 3 * 4 = 36 candidates.
param_grid = {
    #'criterion':["gini", "entropy", "log_loss"],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [10, 100, 200, 500],
    #'max_depth': [None, 10, 20] Bad results
}

# Create the log directory *before* the long-running search starts, so the
# final write cannot fail with FileNotFoundError after all fits have finished.
log_path = Path('logs/logs.txt')
log_path.parent.mkdir(parents=True, exist_ok=True)

# Base estimator. The fixed random_state makes the candidates' scores
# reproducible between runs; n_jobs parallelises tree building.
rf_classifier = RandomForestClassifier(n_jobs=20, random_state=42)

# Grid search with 3-fold cross-validation (36 candidates * 3 folds = 108 fits),
# scored with micro-averaged F1. verbose=3 prints one line per finished fit.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='f1_micro', verbose=3)

# Run the search on the full training data.
grid_search.fit(X, y)

# Report the winning parameter combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Mean Cross-validated Score:", grid_search.best_score_)

# Persist the result so it survives a kernel restart (overwrites any previous log).
with open(log_path, 'w') as f:
    f.write(f"Best Parameters: {grid_search.best_params_}, \nBest Mean Cross-validated Score: {grid_search.best_score_}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.694 total time=   0.7s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.698 total time=   0.7s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=0.695 total time=   0.8s
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.717 total time=   5.4s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.719 total time=   5.4s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.719 total time=   6.4s
[CV 1/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.717 total time=  10.7s
[CV 2/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.720 total time=  10.6s
[CV 3/3] END min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.719 total time=  12.5s
[CV 1

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [11]:
# Final model: a Random Forest fitted on the complete training set.
# Hyper-parameters are presumably the best combination found by the grid
# search above -- TODO confirm against logs/logs.txt.
# random_state is fixed so the fitted model (and therefore the submission
# generated from it) can be reproduced exactly on a re-run.
forest = RandomForestClassifier(
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=500,
    n_jobs=20,
    random_state=42,
)
forest.fit(X, y)

## 6. Prepare test data
The test data is still in its original (unencoded) form, so it has to be encoded the same way as the training data before predicting.

In [12]:
# Apply the same project encoding helper to the raw test features.
# NOTE(review): train and test are encoded in two independent calls -- confirm that
# `encode_features` yields the same columns in the same order for both, otherwise
# the fitted model will reject or misread the test matrix.
test_values_encoded = encode_features(test_values)

## 7. Make Predictions

In [13]:
# Predict a damage grade for every row of the encoded test features with the fitted forest.
test_labels_prediction = forest.predict(test_values_encoded)

In [None]:
# Assemble the submission table: one row per test building with its predicted
# damage grade. Rows line up because the predictions were produced from
# `test_values` in its original row order.
submission = pd.DataFrame({
    'building_id': test_values['building_id'],
    'damage_grade': test_labels_prediction,
})

# Write the file as submission/<filename>.csv. The original path had no
# extension, so the output was not recognisable as a CSV file; the directory is
# created on demand so a fresh checkout does not fail at the very last step.
filename = 'submission_4_rfc_n500'
submission_path = Path('submission') / f'{filename}.csv'
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)