In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

In [3]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [4]:
# Load the training labels CSV and preview its first rows.
labels = pd.read_csv('../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [5]:
# Load the training feature values. The 10-row preview is transposed so that
# each column of the frame is displayed as a row.
values = pd.read_csv('../csv/train_values.csv')
values.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500
geo_level_1_id,6,8,21,22,11,8,9,20,0,26
geo_level_2_id,487,900,363,418,131,558,475,323,757,886
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1
age,30,10,10,10,30,10,25,0,15,0
area_percentage,6,8,5,6,8,9,3,8,8,13
height_percentage,5,7,5,5,9,5,4,6,6,4
land_surface_condition,t,o,t,t,t,t,n,t,t,t
foundation_type,r,r,r,r,r,r,r,w,r,i


In [6]:
# True if any cell anywhere in the feature frame is null.
values.isnull().values.any()

False

In [7]:
# True if any cell anywhere in the labels frame is null.
labels.isnull().values.any()

False

In [8]:
# Per-column dtypes of the feature frame as loaded from the CSV.
values.dtypes 

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [9]:
# Sanity check: every building_id must appear exactly once, because it is the
# key on which features and labels are merged. `is_unique` states this directly
# and, unlike comparing non-null counts, also reports repeated NaNs as duplicates.
values["building_id"].is_unique

True

In [10]:
# Column summary (non-null counts and dtypes) before any dtype conversion.
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [11]:
# Columns that hold single-letter codes: store them as pandas categoricals
# instead of generic Python objects.
to_be_categorized = [
    "land_surface_condition", "foundation_type", "roof_type",
    "position", "ground_floor_type", "other_floor_type",
    "plan_configuration", "legal_ownership_status",
]
# One astype call with a column -> dtype mapping converts all of them at once.
values = values.astype({column_name: "category" for column_name in to_be_categorized})
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  int64   
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [12]:
# Shrink every signed-integer column to the narrowest signed integer dtype that
# holds all of its values (int8 at the smallest) to cut the frame's memory use.
# pd.to_numeric(..., downcast="integer") inspects the real value range, so a
# column with negative values or with values >= 2**31 is never cast to a dtype
# that would overflow. The previous max-only thresholds sent such columns to
# int8 / int16.
datatypes = dict(values.dtypes)  # dtypes as they were before downcasting
signed_integer_dtypes = {"int8", "int16", "int32", "int64"}
for column, dtype in datatypes.items():
    if str(dtype) not in signed_integer_dtypes:
        continue  # categoricals and any other non-integer columns are left alone
    values[column] = pd.to_numeric(values[column], downcast="integer")
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  int8    
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [13]:
# Column summary of the labels frame before its columns are downcast.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [14]:
# Narrow both label columns in one call: building_id to 32-bit integers,
# damage_grade to 8-bit integers.
labels = labels.astype({"building_id": np.int32, "damage_grade": np.int8})
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


# Nuevo Modelo

In [15]:
# Build the modelling frame in a single chain:
#   1. attach each building's damage_grade by merging on building_id,
#   2. drop building_id, which is only the join key,
#   3. treat geo_level_1_id as a categorical rather than a numeric column.
important_values = (
    values
    .merge(labels, on="building_id")
    .drop(columns=["building_id"])
    .astype({"geo_level_1_id": "category"})
)
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,2


In [41]:

# Separate the predictors from the target, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split repeatable.
features = important_values.drop(columns = 'damage_grade')
target = important_values['damage_grade']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size = 0.2, random_state = 123
)

In [42]:
#OneHotEncoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one column of a frame with its dummy (one-hot) columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame that contains ``feature_to_encode``. It is not modified.
    feature_to_encode : str
        Name of the column to expand with ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the dummy columns are appended after the original
        columns, and then every column named ``feature_to_encode`` is dropped.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]])
    # Append first and drop afterwards, so the source column is removed only
    # once its dummies are already in place.
    return (
        pd.concat([original_dataframe, indicator_columns], axis=1)
        .drop(columns=[feature_to_encode])
    )

# Categorical columns that are expanded into dummy columns before modelling.
features_to_encode = [
    "geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",
    "position", "ground_floor_type", "other_floor_type",
    "plan_configuration", "legal_ownership_status",
]
# Apply the same encoding step, feature by feature, to the train split and
# then to the test split.
for feature in features_to_encode:
    X_train, X_test = (
        encode_and_bind(split, feature) for split in (X_train, X_test)
    )

In [18]:
# # Search for the best combination of the four hyperparameters listed below.
# n_estimators = [100, 125, 150]
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 5]
# criterion = ['gini', 'entropy']

# hyperF = {'n_estimators': n_estimators,
#           'min_samples_split': min_samples_split,  
#           'min_samples_leaf': min_samples_leaf,
#           'criterion': criterion}

# gridF = GridSearchCV(estimator = RandomForestClassifier(random_state = 123),
#                      scoring = 'f1_micro',
#                      param_grid = hyperF,
#                      cv = 3,
#                      verbose = 1, 
#                      n_jobs = -1)

# bestF = gridF.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


 0.72244339 0.72320126 0.72347947        nan        nan        nan
 0.72014581 0.72019378 0.72029451 0.72036646 0.72028012 0.72035687
        nan        nan        nan 0.709646   0.70957405 0.70974193
 0.709646   0.70957405 0.70974193        nan        nan        nan
 0.71282137 0.71319071 0.71368476 0.72137375 0.72179585 0.72209324
        nan        nan        nan 0.71804009 0.71837585 0.71861569
 0.71806887 0.71849097 0.71869243        nan        nan        nan
 0.70832693 0.70845164 0.70842766 0.70832693 0.70845164 0.70842766]


In [19]:
# `bestF` exists only when the grid-search cell (currently commented out) has
# been run in this kernel. Look it up defensively so that Restart & Run All
# does not stop here with a NameError; without it, this cell displays nothing.
grid_search_result = globals().get("bestF")
pd.DataFrame(grid_search_result.cv_results_) if grid_search_result is not None else None

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.264995,0.017573,0.0,0.0,gini,1,1,100,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",,,,,,54
1,0.298359,0.022669,0.0,0.0,gini,1,1,125,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",,,,,,38
2,0.326656,0.02053,0.0,0.0,gini,1,1,150,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",,,,,,39
3,81.372372,1.475789,8.140713,0.04146,gini,1,2,100,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.715515,0.713252,0.714518,0.714428,0.000926,21
4,102.079629,0.588829,10.888718,0.712066,gini,1,2,125,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.716062,0.713568,0.714504,0.714711,0.001029,20
5,121.878676,2.120913,13.302495,0.392656,gini,1,2,150,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.716522,0.714619,0.714863,0.715335,0.000846,19
6,75.579581,1.840651,8.227596,0.304691,gini,1,5,100,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.723602,0.721771,0.721958,0.722443,0.000823,3
7,101.342332,1.804134,10.540398,0.780209,gini,1,5,125,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.723947,0.722044,0.723612,0.723201,0.00083,2
8,117.475579,5.044498,14.201746,0.366559,gini,1,5,150,"{'criterion': 'gini', 'min_samples_leaf': 1, '...",0.724278,0.722749,0.723411,0.723479,0.000626,1
9,0.394215,0.012087,0.0,0.0,gini,2,1,100,"{'criterion': 'gini', 'min_samples_leaf': 2, '...",,,,,,43


In [43]:
# n_estimators, min_samples_split, min_samples_leaf and criterion are the
# values of the grid-search candidate with rank_test_score == 1.
# NOTE(review): max_features = 45 was not part of that search grid; confirm
# where this value came from.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and every score computed from it, is reproducible across kernel
# restarts. Without it each run trains a different forest.
rf_model = RandomForestClassifier(n_estimators = 150,
                                  max_depth = None,
                                  max_features = 45,
                                  min_samples_split = 5,
                                  min_samples_leaf = 1,
                                  criterion = "gini",
                                  random_state = 123,
                                  verbose=True)
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.5min finished


RandomForestClassifier(max_features=45, min_samples_split=5, n_estimators=150,
                       verbose=True)

In [44]:
# Score of the fitted model on the training split itself. The model was fit on
# this same data, so this is an optimistic figure; compare it with the
# test-split metric computed below.
rf_model.score(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    9.4s finished


0.951846699923254

In [45]:
# Micro-averaged F1 score on the held-out test split (X_test / y_test),
# not on the training data.
y_preds = rf_model.predict(X_test)
f1_score(y_test, y_preds, average='micro')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    2.5s finished


0.7348093858521517

In [23]:
# rf_model.feature_importances_

In [24]:
# test_values = pd.read_csv('../csv/test_values.csv', index_col = "building_id")
# test_values

In [25]:
# test_values_subset = test_values
# test_values_subset["geo_level_1_id"] = test_values_subset["geo_level_1_id"].astype("category")
# test_values_subset

In [26]:
# def encode_and_bind(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res) 

# features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
#                      "position", "ground_floor_type", "other_floor_type",\
#                      "plan_configuration", "legal_ownership_status"]
# for feature in features_to_encode:
#     test_values_subset = encode_and_bind(test_values_subset, feature)
# test_values_subset

In [27]:
# Generate the predictions for the test set.
# preds = rf_model.predict(test_values_subset)

In [28]:
# submission_format = pd.read_csv('../csv/submission_format.csv', index_col = "building_id")

In [29]:
# my_submission = pd.DataFrame(data=preds,
#                              columns=submission_format.columns,
#                              index=submission_format.index)

In [30]:
# my_submission.head()

In [31]:
# my_submission.to_csv('../csv/predictions/jf-model-4-submission-all-params.csv')

In [32]:
# !head ../csv/predictions/jf-model-4-submission-all-params.csv