In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [3]:
train_data = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
test_labels = pd.read_csv("test_values.csv")

In [4]:
#preprocess data
train_data.isna().sum() #no missing values

building_id                               0
geo_level_1_id                            0
geo_level_2_id                            0
geo_level_3_id                            0
count_floors_pre_eq                       0
age                                       0
area_percentage                           0
height_percentage                         0
land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo       

In [5]:
#create new variable to hold all train data merged with the labels aka add damage_grade to train_data
building_damage = train_data.merge(train_labels,how='inner',on='building_id')

In [6]:
#drop unnecessary attributes
building_damage = building_damage.drop(columns="has_secondary_use")              #  included among other has_secondary_use
building_damage = building_damage.drop(columns="has_secondary_use_institution")  #  244 1's
building_damage = building_damage.drop(columns="has_secondary_use_school")       #   94 1's
building_damage = building_damage.drop(columns="has_secondary_use_industry")     #  279 1's
building_damage = building_damage.drop(columns="has_secondary_use_health_post")  #   49 1's
building_damage = building_damage.drop(columns="has_secondary_use_gov_office")   #   38 1's
building_damage = building_damage.drop(columns="has_secondary_use_use_police")   #   23 1's
building_damage = building_damage.drop(columns="has_secondary_use_other")        #  other has no clear relationship with damage_grade
building_damage = building_damage.drop(columns="has_superstructure_other")

In [7]:
#create subset for instances less than 250 years old
building_damage = building_damage[building_damage['age'] <= 250]

In [8]:
age_category = np.zeros(shape=(len(building_damage), 1))
area_category = np.zeros(shape=(len(building_damage), 1))
height_category = np.zeros(shape=(len(building_damage), 1))
j = 0
for i in building_damage.index:

    # age
    if building_damage.age[i] < 10:
        age_category[j] = 0  # 0 - 9
    elif building_damage.age[i] < 20:
        age_category[j] = 1  # 10 - 19
    elif building_damage.age[i] < 50:
        age_category[j] = 2  # 20 - 49
    elif building_damage.age[i] < 75:
        age_category[j] = 3  # 50 - 74
    elif building_damage.age[i] < 125:
        age_category[j] = 4  # 75 - 124
    else:
        age_category[j] = 5  # 125+

    # area_percentage
    if building_damage.area_percentage[i] < 15:
        area_category[j] = 0  # 0 - 14
    elif building_damage.area_percentage[i] < 32:
        area_category[j] = 1  # 15 - 32
    elif building_damage.area_percentage[i] < 42:
        area_category[j] = 2  # 33 - 41
    elif building_damage.area_percentage[i] < 52:
        area_category[j] = 3  # 42 - 51
    elif building_damage.area_percentage[i] < 62:
        area_category[j] = 4  # 52 - 61
    else:
        area_category[j] = 5  # 62 - 100

    # area percentage
    if building_damage.height_percentage[i] < 25:
        height_category[j] = 0  # 0 - 24
    else:
        height_category[j] = 1  # 25+

    j = j + 1

building_damage.loc[:, "age_category"] = age_category
building_damage.loc[:, "area_category"] = area_category
building_damage.loc[:, "height_category"] = height_category

building_damage = building_damage.drop(columns="age")
building_damage = building_damage.drop(columns="area_percentage")
building_damage = building_damage.drop(columns="height_percentage")

In [9]:
#how many instances with each damage grade
building_damage['damage_grade'].value_counts()

2    147437
3     86829
1     24945
Name: damage_grade, dtype: int64

In [10]:
#categorical features
cat_feats = ['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position','legal_ownership_status',
       'plan_configuration']

In [11]:
building_damage['damage_grade'] = building_damage['damage_grade'].astype('category')

In [12]:
#get rid of dummies instances
train_final = pd.get_dummies(building_damage, columns=cat_feats,drop_first=True)
test_final = pd.get_dummies(test_labels,columns=cat_feats,drop_first=True)

In [13]:
#list of all attributes in test set
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 61 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   building_id                             86868 non-null  int64
 1   geo_level_1_id                          86868 non-null  int64
 2   geo_level_2_id                          86868 non-null  int64
 3   geo_level_3_id                          86868 non-null  int64
 4   count_floors_pre_eq                     86868 non-null  int64
 5   age                                     86868 non-null  int64
 6   area_percentage                         86868 non-null  int64
 7   height_percentage                       86868 non-null  int64
 8   has_superstructure_adobe_mud            86868 non-null  int64
 9   has_superstructure_mud_mortar_stone     86868 non-null  int64
 10  has_superstructure_stone_flag           86868 non-null  int64
 11  has_superstruct

In [14]:
#split set between train and test
from sklearn.model_selection import train_test_split

In [15]:
#set y_train equal to the damage grade in training set
y_train=train_final.damage_grade
#set train equal to the rest of the attributes besides damage_grade in training set
train=train_final.drop('damage_grade',axis=1)

In [16]:
#set X equal to the rest of the attributes besides damage_grade in the training set
X = train_final.drop('damage_grade',axis=1)
#set y equal to the damage grade in training set
y = train_final['damage_grade']

In [17]:
#split training set into a training and a test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=5)

In [18]:
#Scale features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
#Model Preparation
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
#create model with Logistic Regression
logmodel = LogisticRegression(class_weight=None)
p=logmodel.fit(X_train,y_train)

In [21]:
#make predictions on model
predictions = logmodel.predict(X_test)

In [22]:
#classification report comparing test labels to prediction labels of damage grade
report = classification_report(y_test,predictions)
print(report)

              precision    recall  f1-score   support

           1       0.59      0.31      0.41      6252
           2       0.60      0.87      0.71     36960
           3       0.54      0.20      0.29     21591

    accuracy                           0.59     64803
   macro avg       0.58      0.46      0.47     64803
weighted avg       0.58      0.59      0.54     64803



In [23]:
print('Accuracy: ', accuracy_score(y_test,predictions))
print('F1_score: ', f1_score(y_test,predictions, average='micro'))

Accuracy:  0.5946946900606453
F1_score:  0.5946946900606453


In [24]:
#Model preparation using Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [25]:
#create decision tree and model
dtree = DecisionTreeClassifier()
y_train = list(y_train)
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [26]:
#make predictions of model
predictions = dtree.predict(X_test)

In [27]:
#get detailed report of results
report = classification_report(y_test,predictions)
print(report)

              precision    recall  f1-score   support

           1       0.47      0.48      0.47      6252
           2       0.70      0.69      0.70     36960
           3       0.60      0.61      0.60     21591

    accuracy                           0.64     64803
   macro avg       0.59      0.59      0.59     64803
weighted avg       0.64      0.64      0.64     64803



In [28]:
print('Accuracy: ', accuracy_score(y_test,predictions))
print('F1_score: ', f1_score(y_test,predictions, average='micro'))

Accuracy:  0.6427480209249572
F1_score:  0.6427480209249572


In [29]:
#RaindomForrest Model
from sklearn.ensemble import RandomForestClassifier

In [30]:
RF = RandomForestClassifier()

In [31]:
#setting up paramters for GridSearchCV
trees = [50, 55, 60,  65, 70, 75, 80, 85, 90, 95, 100]
param_grid = {'n_estimators': trees}
grid_search = GridSearchCV(RF, param_grid, cv=5)

In [32]:
#fit to model
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [50, 55, 60, 65, 70, 75, 80, 85, 90,
                                          95, 100]})

In [33]:
params = grid_search.best_params_
print(params)

{'n_estimators': 85}


In [34]:
rfc = RandomForestClassifier(n_estimators=100)

In [35]:
rfc.fit(X_train,y_train)

RandomForestClassifier()

In [36]:
predictions = rfc.predict(X_test)

In [37]:
#detailed report of results
report = classification_report(y_test,predictions)
print(report)

              precision    recall  f1-score   support

           1       0.64      0.46      0.54      6252
           2       0.72      0.81      0.76     36960
           3       0.70      0.59      0.64     21591

    accuracy                           0.71     64803
   macro avg       0.68      0.62      0.65     64803
weighted avg       0.70      0.71      0.70     64803



In [38]:
print('Accuracy: ', accuracy_score(y_test,predictions))
print('F1_score: ', f1_score(y_test,predictions, average='micro'))

Accuracy:  0.706171010601361
F1_score:  0.706171010601361
