In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [2]:
# Load the three CSV files from the working directory.
# NOTE: despite its name, `test_labels` holds the rows of test_values.csv
# (features for the test buildings); no label file is read for the test set here.
train_data, train_labels, test_labels = (
    pd.read_csv(csv_name)
    for csv_name in ("train_values.csv", "train_labels.csv", "test_values.csv")
)

In [3]:
# Preprocessing sanity check: tally NaN entries column by column.
# Every count in the displayed result is 0, so nothing needs to be imputed.
nan_counts = train_data.isna().sum()
nan_counts

building_id                               0
geo_level_1_id                            0
geo_level_2_id                            0
geo_level_3_id                            0
count_floors_pre_eq                       0
age                                       0
area_percentage                           0
height_percentage                         0
land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo       

In [4]:
# Attach the target to the features: inner-join on building_id so that each
# row of building_damage carries its damage_grade next to the train_data columns.
building_damage = pd.merge(train_data, train_labels, on='building_id', how='inner')

In [5]:
#drop unnecessary attributes
#building_damage = building_damage.drop(columns="has_secondary_use")
#building_damage = building_damage.drop(columns="has_secondary_use_agriculture")

In [6]:
#create subset for instances less than 250 years old
#building_damage = building_damage[building_damage['age'] <= 250]

In [7]:
# Class balance: number of buildings recorded for each damage grade.
grade_counts = building_damage['damage_grade'].value_counts()
grade_counts

2    148259
3     87218
1     25124
Name: damage_grade, dtype: int64

In [8]:
# Names of the categorical columns that are one-hot encoded further down.
# The order matters: it determines the order of the generated dummy columns.
cat_feats = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'legal_ownership_status',
    'plan_configuration',
]

In [9]:
# Store the target as a pandas categorical (discrete classes) instead of a plain integer column.
building_damage = building_damage.assign(
    damage_grade=lambda frame: frame['damage_grade'].astype('category')
)

In [10]:
# One-hot encode the categorical features. drop_first=True drops one level per
# feature so the dummy columns of a feature are not perfectly collinear.
train_final = pd.get_dummies(building_damage, columns=cat_feats, drop_first=True)
test_final = pd.get_dummies(test_labels, columns=cat_feats, drop_first=True)

# The two frames are encoded independently, so a category level that is absent
# from one file would give the frames different dummy columns (and a model fit
# on one layout cannot score the other). Force the test frame onto the training
# feature layout: same columns, same order, all-zero columns for levels that
# never occur in the test file, and no columns the training frame lacks.
feature_columns = train_final.columns.drop('damage_grade')
test_final = test_final.reindex(columns=feature_columns, fill_value=0)

In [11]:
# Schema overview of the encoded test frame: one line per column with its
# non-null count and dtype (the captured output reports 61 columns, 86868 rows).
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 61 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   building_id                             86868 non-null  int64
 1   geo_level_1_id                          86868 non-null  int64
 2   geo_level_2_id                          86868 non-null  int64
 3   geo_level_3_id                          86868 non-null  int64
 4   count_floors_pre_eq                     86868 non-null  int64
 5   age                                     86868 non-null  int64
 6   area_percentage                         86868 non-null  int64
 7   height_percentage                       86868 non-null  int64
 8   has_superstructure_adobe_mud            86868 non-null  int64
 9   has_superstructure_mud_mortar_stone     86868 non-null  int64
 10  has_superstructure_stone_flag           86868 non-null  int64
 11  has_superstruct

In [12]:
#split set between train and test
from sklearn.model_selection import train_test_split

In [13]:
# Target vector: the damage_grade column of the encoded training frame.
# (y_train is rebound later, when train_test_split produces the actual split.)
y_train = train_final['damage_grade']
# Feature frame: every remaining column once damage_grade has been removed.
train = train_final.drop(columns='damage_grade')

In [14]:
# y: the damage_grade target taken from the encoded training frame.
y = train_final.damage_grade
# X: all other columns of the encoded training frame.
# NOTE(review): building_id (the merge key) is still among these columns; it
# looks like a row identifier rather than a predictive feature -- confirm
# whether it should be excluded before modelling.
X = train_final.drop(columns=['damage_grade'])

In [15]:
# Hold out 25% of the labelled rows for evaluation; random_state=5 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.25, random_state=5)
X_train, X_test, y_train, y_test = split_parts

In [16]:
#Scale features
from sklearn.preprocessing import StandardScaler
# Learn the per-column mean/std on the training split only, then apply that
# same transformation to both splits so the held-out rows do not influence it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [17]:
#Model Preparation
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [18]:
# Baseline classifier: logistic regression with unweighted classes,
# trained on the scaled training split.
logmodel = LogisticRegression(class_weight=None)
logmodel.fit(X_train, y_train)
# fit() hands back the estimator itself, so `p` is another name for logmodel.
p = logmodel

In [19]:
# Predict a damage grade for every row of the held-out (scaled) split
# using the fitted logistic regression.
predictions = logmodel.predict(X_test)

In [20]:
# Per-class precision / recall / F1 of the predicted damage grades against
# the held-out true grades.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           1       0.56      0.29      0.39      6319
           2       0.60      0.88      0.71     36998
           3       0.55      0.20      0.29     21834

    accuracy                           0.59     65151
   macro avg       0.57      0.45      0.46     65151
weighted avg       0.58      0.59      0.54     65151



In [21]:
# Headline scores on the held-out split. In the output below the
# micro-averaged F1 prints the same value as the accuracy.
acc = accuracy_score(y_test, predictions)
micro_f1 = f1_score(y_test, predictions, average='micro')
print('Accuracy: ', acc)
print('F1_score: ', micro_f1)

Accuracy:  0.591103743610996
F1_score:  0.591103743610996


In [22]:
#Model preparation using Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [23]:
# Decision tree classifier on the scaled training split.
# The tree permutes the candidate features at each split, so without a seed the
# fitted tree (and the metrics printed below) can change between runs.
# random_state=5 matches the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=5)
# y_train is turned into a plain list of labels here; the later cells that
# reuse y_train therefore receive a list as well.
y_train = list(y_train)
dtree.fit(X_train,y_train)

In [24]:
# Predict a damage grade for every row of the held-out (scaled) split
# using the fitted decision tree; this overwrites the earlier `predictions`.
predictions = dtree.predict(X_test)

In [25]:
# Per-class precision / recall / F1 for the decision-tree predictions
# against the held-out true grades.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           1       0.47      0.49      0.48      6319
           2       0.70      0.69      0.70     36998
           3       0.60      0.61      0.60     21834

    accuracy                           0.64     65151
   macro avg       0.59      0.60      0.59     65151
weighted avg       0.65      0.64      0.64     65151



In [26]:
# Headline scores for the decision tree on the held-out split. In the output
# below the micro-averaged F1 prints the same value as the accuracy.
acc = accuracy_score(y_test, predictions)
micro_f1 = f1_score(y_test, predictions, average='micro')
print('Accuracy: ', acc)
print('F1_score: ', micro_f1)

Accuracy:  0.6438581142269497
F1_score:  0.6438581142269497


In [27]:
#Random Forest model
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Base estimator handed to the grid search. A random forest bootstraps rows and
# samples features at random, so it is seeded to make the search reproducible;
# random_state=5 matches the seed used for the train/test split.
RF = RandomForestClassifier(random_state=5)

In [29]:
# Set up the parameters for GridSearchCV: try forests of 50, 55, ..., 100 trees
# and score each candidate with 5-fold cross-validation.
trees = list(range(50, 101, 5))
param_grid = dict(n_estimators=trees)
grid_search = GridSearchCV(estimator=RF, param_grid=param_grid, cv=5)

In [30]:
# Run the cross-validated search over n_estimators on the scaled training split.
# This fits one forest per (candidate, fold) pair -- 11 candidates x 5 folds --
# so it is the slowest cell of the notebook.
grid_search.fit(X_train,y_train)

In [31]:
# Hyper-parameter combination selected by the grid search
# (the captured output shows n_estimators=100).
params = grid_search.best_params_
print(params)

{'n_estimators': 100}


In [32]:
# Final forest, built from whatever the grid search selected rather than a
# hard-coded copy of its result, so this cell stays correct if the grid or the
# data changes. Seeded (same seed as the train/test split) so the reported
# metrics are reproducible across runs.
rfc = RandomForestClassifier(**grid_search.best_params_, random_state=5)

In [33]:
# Train the final random forest on the scaled training split.
rfc.fit(X_train,y_train)

In [34]:
# Predict a damage grade for every row of the held-out (scaled) split
# using the fitted random forest; this overwrites the earlier `predictions`.
predictions = rfc.predict(X_test)

In [35]:
# Per-class precision / recall / F1 for the random-forest predictions
# against the held-out true grades.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           1       0.66      0.46      0.54      6319
           2       0.72      0.84      0.77     36998
           3       0.73      0.58      0.65     21834

    accuracy                           0.72     65151
   macro avg       0.70      0.63      0.65     65151
weighted avg       0.72      0.72      0.71     65151



In [36]:
# Headline scores for the random forest on the held-out split. In the output
# below the micro-averaged F1 prints the same value as the accuracy.
acc = accuracy_score(y_test, predictions)
micro_f1 = f1_score(y_test, predictions, average='micro')
print('Accuracy: ', acc)
print('F1_score: ', micro_f1)

Accuracy:  0.7171647403723658
F1_score:  0.7171647403723658
