<a href="https://colab.research.google.com/github/Nell87/drivendata_richter/blob/main/script/03_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **INCLUDES**


In [None]:
!pip uninstall matplotlib
!pip install matplotlib==3.1.3
!pip install pretty-confusion-matrix

In [2]:
####    0. INCLUDES  _______________________________________ #### 
#Loading Libraries:# 
import pandas as pd
import os
import numpy as np
import time   #  provides many ways of representing time in code

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

from sklearn.metrics import confusion_matrix
from pretty_confusion_matrix import pp_matrix_from_data
from sklearn.metrics import f1_score   # the score used in the competition

####   1. READING TRAIN AND TEST DATA _______________________________________ #### 
# Every CSV is read with `building_id` as the DataFrame index.
train_values = pd.read_csv(
    "https://raw.githubusercontent.com/Nell87/drivendata_richter/main/data/train_values.csv",
    index_col='building_id',
)
data = train_values  # second name bound to the very same DataFrame object

train_labels = pd.read_csv(
    "https://raw.githubusercontent.com/Nell87/drivendata_richter/main/data/train_labels.csv",
    index_col='building_id',
)

# Inner join on building_id: features and label side by side
train_merge = train_values.merge(train_labels, how='inner', on='building_id')

test = pd.read_csv(
    "https://raw.githubusercontent.com/Nell87/drivendata_richter/main/data/test_values.csv",
    index_col='building_id',
)

In [3]:
# Add labels
# Numeric damage grade -> readable category name
damage_map = {
    1: "Low",
    2: "Medium",
    3: "High",
}
damage_grade_named = train_merge["damage_grade"].map(damage_map)
train_merge["damage_grade"] = damage_grade_named

# **MODELING: BASELINE RANDOM FOREST**
Let's train the baseline model: Random Forest. I'll obtain the best parameters through GridSearch using my own computer to speed up the process.

#### **Preprocessing: Dummify and split**

In [4]:
# Dummify
# Dummy-encode the features; drop_first=True omits the first level of each encoded column
train_values_prep = pd.get_dummies(data=train_values, drop_first=True)

In [5]:
# Split in train/test
# 80 % of the rows are used for fitting, 20 % are held out; the seed fixes the partition
x_train, x_test, y_train, y_test = train_test_split(
    train_values_prep,
    train_labels,
    test_size=0.2,
    random_state=42,
)

#### **Modeling**

In [6]:
# Hyper-parameter search that produced the settings used below. It stays
# commented out because of its run time (noted by the author: 1141 s, ~19 min).
# Two corrections to the recorded grid:
#   * 'auto' is deprecated in scikit-learn; for a classifier it meant 'sqrt'
#   * the string 'none' is not a valid value, the option is the None object
#
# rf_clf = RandomForestClassifier(random_state=314)
# param_grid = {
#     'n_estimators': [100, 200],
#     'max_features': [None, 'sqrt', 'log2'],
#     'max_depth': [1, 10],
#     'min_samples_leaf': [10, 20],
# }
# start_time = time.time()
# rf_clf_GS = GridSearchCV(rf_clf, param_grid, cv=5)
# rf_clf_GS.fit(x_train, np.ravel(y_train))
# dt_time_fit = time.time() - start_time
# rf_clf_GS.best_params_

# Train using the best parameters  # 12 seg
# max_features='sqrt' is what the deprecated 'auto' resolved to for classifiers.
rf_clf_1 = RandomForestClassifier(random_state=314, n_estimators=100,
                                  max_features='sqrt', max_depth=10,
                                  min_samples_leaf=20)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector target.
start_time = time.time()
rf_clf_1 = rf_clf_1.fit(x_train, np.ravel(y_train))
rf_clf_1_time_fit = time.time() - start_time

  rf_clf_1 = rf_clf_1.fit(x_train, y_train)


In [None]:
#Predictions to check # 0.631
pred_rf_clf_1 = rf_clf_1.predict(x_test)

# A cell only displays its last expression, so intermediate metrics are
# printed explicitly; otherwise they are computed and thrown away.
print("micro F1:", f1_score(y_test, pred_rf_clf_1, average='micro'))

# confusion matrix
print(confusion_matrix(y_test, pred_rf_clf_1))
pp_matrix_from_data(y_test, pred_rf_clf_1)


In [10]:
# Predictions to send # 0.6312 on competition (1469 / 5974)
# Encode into a new frame so the raw `test` data stays available to later cells,
# then align the columns with the features the baseline model was trained on
# (a dummy level missing from the test set becomes an all-zero column).
test_baseline = pd.get_dummies(test, drop_first=True)
test_baseline = test_baseline.reindex(columns=train_values_prep.columns, fill_value=0)
pred_rf_clf_1_final = rf_clf_1.predict(test_baseline)

# The submission template supplies the expected index and column name
my_submission = pd.read_csv("https://raw.githubusercontent.com/Nell87/drivendata_richter/main/data/submission_format.csv",
                            index_col='building_id')

my_submission = pd.DataFrame(data=pred_rf_clf_1_final,
                             columns=my_submission.columns,
                             index=my_submission.index)

my_submission.head()
# my_submission.to_csv('../data/submission_rf_clf_1.csv')

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


# **MODELING: SAME RANDOM FOREST + FEATURE ENGINEERING + PREPROCESSING**

### **PREPROCESSING**
- Feature engineering
- Removing outliers
- Dummify
- Oversampling
- Split in train/test

#### **Preprocessing: Feature engineering**

In [None]:
# Function to replace a categorical feature that has many values with its conditional probabilities with respect to the predicted feature
def categoricalvalues_condprob(data, index, pred_feature, new_column_name):
  """Append P(pred_feature = class | index = value) columns to `data`.

  Parameters:
    data: DataFrame holding both the `index` and `pred_feature` columns.
    index: name of the categorical column to encode (also the merge key).
    pred_feature: name of the column whose classes are conditioned on `index`.
    new_column_name: name given to the probability column of the intermediate
      long-format table; it does not appear in the returned frame.

  Returns:
    A new DataFrame: `data` left-merged with one column per class, named
    "<index>_<class>". The merge produces a fresh default integer index.
    (index value, class) pairs that never occur together come out as NaN.
  """
  total_rows = len(data)

  # Marginal P(index) and joint P(index, pred_feature); their ratio is the conditional probability
  marginal = data.groupby(index).size().div(total_rows)
  joint = data.groupby([index, pred_feature]).size().div(total_rows)
  conditional = joint.div(marginal, axis=0, level=index).reset_index()
  conditional.columns = [index, pred_feature, new_column_name]

  # Reshape from long to wide: one row per index value, one column per class
  wide = conditional.pivot(index=[index], columns=pred_feature, values=new_column_name)
  wide = wide.reset_index()

  # The class columns are the trailing ones; prefix each with the encoded column's name
  n_classes = len(np.unique(data[pred_feature]))
  class_columns = list(wide.columns)[-n_classes:] if n_classes else []
  wide = wide.rename(columns={col: index + "_" + str(col) for col in class_columns})

  # Attach the probabilities to every row of the original data
  return data.merge(wide, on=index, how='left')

# Apply the function
# Encode the three geo levels one after the other; each call appends one
# probability column per damage grade.
train_merge_prep = train_merge
for level in (1, 2, 3):
  train_merge_prep = categoricalvalues_condprob(train_merge_prep,
                                                'geo_level_%d_id' % level,
                                                'damage_grade',
                                                'prob_cond_geo_level_%d' % level)

# Get rid of the original categorical features
train_merge_prep = train_merge_prep.drop(
    ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)

# Replace the missing values with 0
# (a NaN here means that geo value was never observed with that damage grade)
cols = ["geo_level_1_id_Low", "geo_level_1_id_Medium", "geo_level_1_id_High",
        "geo_level_2_id_Low", "geo_level_2_id_Medium", "geo_level_2_id_High",
        "geo_level_3_id_Low", "geo_level_3_id_Medium", "geo_level_3_id_High"]

train_merge_prep = train_merge_prep.fillna(dict.fromkeys(cols, 0))

train_merge_prep

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,...,damage_grade,geo_level_1_id_High,geo_level_1_id_Low,geo_level_1_id_Medium,geo_level_2_id_High,geo_level_2_id_Low,geo_level_2_id_Medium,geo_level_3_id_High,geo_level_3_id_Low,geo_level_3_id_Medium
0,2,30,6,5,t,r,n,f,q,t,...,High,0.248185,0.086461,0.665354,0.744444,0.003704,0.251852,0.837838,0.000000,0.162162
1,2,10,8,7,o,r,n,x,q,s,...,Medium,0.519549,0.034277,0.446174,0.497487,0.010050,0.492462,0.125000,0.062500,0.812500
2,2,10,5,5,t,r,n,f,x,t,...,High,0.584996,0.021627,0.393378,0.601136,0.082386,0.316477,0.610294,0.029412,0.360294
3,2,10,6,5,t,r,n,f,x,s,...,Medium,0.130678,0.129718,0.739603,0.126829,0.019512,0.853659,0.129032,0.032258,0.838710
4,3,30,8,9,t,r,n,f,x,s,...,High,0.384672,0.046959,0.568370,0.378613,0.029865,0.591522,0.377049,0.008197,0.614754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,1,55,6,3,n,r,n,f,j,s,...,Medium,0.137269,0.083215,0.779516,0.103448,0.172414,0.724138,0.071429,0.000000,0.928571
260597,2,0,6,5,t,r,n,f,q,s,...,High,0.807546,0.013066,0.179388,0.934866,0.003831,0.061303,0.979592,0.000000,0.020408
260598,3,55,6,7,t,r,q,f,q,s,...,High,0.807546,0.013066,0.179388,0.918919,0.024024,0.057057,0.863636,0.045455,0.090909
260599,2,10,14,6,t,r,x,v,s,j,...,Medium,0.085872,0.354986,0.559142,0.039624,0.507429,0.452947,0.012712,0.220339,0.766949


#### **Preprocessing: Removing outliers**
- Get rid of buildings with more than 3 floors. 
- IQR based removal on age, area_percentage and height_percentage

In [None]:
# Removing outliers
def rem_outliers_IQR(data, feature, whisker=1.5):
  """Return the rows of `data` whose `feature` lies strictly inside the IQR fences.

  Parameters:
    data: DataFrame to filter; it is not modified.
    feature: name of the numeric column the fences are computed on.
    whisker: multiple of the inter-quartile range added beyond each quartile
      (default 1.5).

  Returns:
    A filtered DataFrame keeping rows where
    Q1 - whisker * IQR < data[feature] < Q3 + whisker * IQR.
    Both bounds are exclusive, so rows with a missing value in `feature`
    are dropped as well.
  """
  # Finding the IQR
  percentile25 = data[feature].quantile(0.25)
  percentile75 = data[feature].quantile(0.75)
  iqr = percentile75 - percentile25

  # Finding upper and lower limit
  upper_limit = percentile75 + whisker * iqr
  lower_limit = percentile25 - whisker * iqr

  # Outliers removal. The filtered frame must be returned: the original only
  # rebound its local variable, so every caller received None.
  values = data[feature]
  return data[(values > lower_limit) & (values < upper_limit)]
  
# Keep buildings with at most 3 floors
train_merge_prep = train_merge_prep[train_merge_prep['count_floors_pre_eq'] <= 3]

# NOTE(review): this condition used to be `> 3`, which keeps ONLY the rows with
# more than 3 families — the opposite of removing outliers. Changed to `<= 3`
# to mirror the floor filter above; confirm the intended threshold.
train_merge_prep = train_merge_prep[train_merge_prep['count_families'] <= 3]

# IQR based removal; each call filters the result of the previous one
train_merge_prep = rem_outliers_IQR(train_merge_prep, 'age')
train_merge_prep = rem_outliers_IQR(train_merge_prep, 'area_percentage')
train_merge_prep = rem_outliers_IQR(train_merge_prep, 'height_percentage')

#### **Dummify**

In [None]:
# Dummify
# The target is removed before encoding so it does not become a feature.
# `columns=` replaces the positional axis argument (`drop("damage_grade", 1)`),
# which pandas deprecated and later stopped accepting.
train_merge_prep = pd.get_dummies(train_merge_prep.drop(columns="damage_grade"), drop_first = True)

  train_merge_dum = pd.get_dummies(train_merge.drop("damage_grade",1), drop_first = True)


#### **Oversampling**

In [None]:
# Oversampling
# Rows were filtered out of train_merge_prep, so the labels have to be reduced
# to the surviving rows first; passing the full train_labels gives X and y
# different lengths.
# train_merge_prep is labelled by row position within train_merge (the merge
# inside categoricalvalues_condprob resets the index), so go
# position -> building_id -> numeric label.
kept_building_ids = train_merge.index[train_merge_prep.index.to_numpy()]
labels_prep = train_labels.loc[kept_building_ids, "damage_grade"]
assert len(labels_prep) == len(train_merge_prep)

oversampler=SMOTE(random_state=0)
os_features,os_labels=oversampler.fit_resample(train_merge_prep,labels_prep)

#### **Split dataset**

In [None]:
# Split in train/test
# NOTE(review): the split happens after oversampling, so synthetic samples
# derived from held-out rows can land in the training part; the validation
# score is likely optimistic — consider splitting before resampling.
x_train, x_test, y_train, y_test = train_test_split(
    os_features,
    os_labels,
    test_size=0.2,
    random_state=42,
)

### **MODELING**

In [None]:
# Create the RF object (default hyper-parameters, fixed seed)
rf_clf_2 = RandomForestClassifier(random_state=314)

# Train
# np.ravel hands scikit-learn a 1-D target whether y_train is a Series or a
# single-column DataFrame; the latter triggers a DataConversionWarning.
start_time = time.time()
rf_clf_2 = rf_clf_2.fit(x_train, np.ravel(y_train))
rf_clf_2_time_fit = time.time() - start_time

In [None]:
#Predictions to check # 0.794
pred_rf_clf_2 = rf_clf_2.predict(x_test)

# Printed explicitly: only the last expression of a cell is displayed
print("micro F1:", f1_score(y_test, pred_rf_clf_2, average='micro'))

# The block that followed here was a copy of the baseline submission code: it
# predicted with rf_clf_1 on the plain dummy-encoded test set, yet stored the
# result as pred_rf_clf_2_final. rf_clf_2 needs the engineered geo features,
# which the next cell builds for the test set, so the copy was removed.

# confusion matrix
confusion_matrix(y_test,pred_rf_clf_2)
#pp_matrix_from_data(y_test, pred_rf_clf_2)

In [None]:
#Predictions to send
# NOTE(review): the "0.6312 on competition (1469 / 5974)" remark that stood
# here is identical to the baseline cell's and was presumably copy-pasted;
# record this model's own leaderboard score once it is known.

# Prepare the test: the conditional probabilities are learned on the training
# data and looked up for the test rows through the geo id.
# (`test` replaces the undefined name `test_values`.)
test_prb = test
prob_columns = []
for geo_col, prob_name in [("geo_level_1_id", "prob_cond_geo_level_1"),
                           ("geo_level_2_id", "prob_cond_geo_level_2"),
                           ("geo_level_3_id", "prob_cond_geo_level_3")]:
  encoded = categoricalvalues_condprob(train_merge, geo_col, 'damage_grade', prob_name)
  # One row per geo value: id plus its three class probabilities
  prob_table = encoded[[geo_col, geo_col + "_High", geo_col + "_Low", geo_col + "_Medium"]]
  prob_table = prob_table.drop_duplicates()

  # Add new columns to test dataset
  test_prb = test_prb.merge(prob_table, on=geo_col, how='left')
  prob_columns.extend([geo_col + "_High", geo_col + "_Low", geo_col + "_Medium"])

# Replace the missing values with 0 (geo ids never seen in training have no probabilities)
test_prb = test_prb.fillna(dict.fromkeys(prob_columns, 0))

# Get rid of the original categorical features
# (the original read `test_prb_prep` here before it had been assigned)
test_prb_prep = test_prb.drop(["geo_level_1_id", "geo_level_2_id", "geo_level_3_id"], axis=1)

# Dummify
test_prb_prep = pd.get_dummies(test_prb_prep, drop_first = True)

# Same columns, same order as the features rf_clf_2 was fitted on
# (assumes x_train is still the DataFrame produced by the oversampled split)
test_prb_prep = test_prb_prep.reindex(columns=x_train.columns, fill_value=0)

# Predicting
pred_rf_clf_2_final = rf_clf_2.predict(test_prb_prep)

# The submission template supplies the expected index and column name
my_submission2 = pd.read_csv("https://raw.githubusercontent.com/Nell87/drivendata_richter/main/data/submission_format.csv",
                            index_col='building_id')

my_submission2 = pd.DataFrame(data=pred_rf_clf_2_final,
                             columns=my_submission2.columns,
                             index=my_submission2.index)

my_submission2.head()
# my_submission2.to_csv('submission_rf_clf_3.csv')