In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
%matplotlib inline

In [2]:
# Read the three input CSVs from the current working directory.
# NOTE(review): `test_labels` is loaded from "test_values.csv", so despite its
# name it holds the test-set feature values rather than labels.
train_data, train_labels, test_labels = map(
    pd.read_csv,
    ("train_values.csv", "train_labels.csv", "test_values.csv"),
)

In [3]:
# Attach the damage_grade label to each training row: inner join of the
# training values with the training labels on the shared building_id key.
building_damage = pd.merge(train_data, train_labels, on='building_id', how='inner')

In [4]:
# Drop attributes considered unnecessary for the model.
# All columns are removed in one drop() call, so the frame is copied once and
# a missing column name raises KeyError before anything has been dropped.
building_damage = building_damage.drop(columns=[
    "has_secondary_use",
    "has_secondary_use_agriculture",
    # the five columns below were added to the drop list later - JT
    # (the doubled "use_" is kept exactly as written; drop() raises KeyError
    # if a name does not match a column)
    "has_secondary_use_use_police",
    "has_secondary_use_rental",
    "has_secondary_use_school",
    "has_secondary_use_industry",
    "has_secondary_use_health_post",
])

In [5]:
# Keep only the rows whose age is at most 250 years (the bound is inclusive).
building_damage = building_damage.loc[building_damage['age'].le(250)]

In [6]:
# Names of the categorical columns (passed to pd.get_dummies for encoding).
cat_feats = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'legal_ownership_status',
    'plan_configuration',
]

In [7]:
# One-hot encode the categorical columns of the training and test frames.
# drop_first=True omits the first level of every encoded feature.
# NOTE(review): the two frames are encoded independently, and in the cells
# shown the columns dropped from building_damage are not dropped from
# test_labels, so train_final and test_final may not share the same columns -
# confirm before predicting on test_final.
train_final, test_final = (
    pd.get_dummies(frame, columns=cat_feats, drop_first=True)
    for frame in (building_damage, test_labels)
)

In [8]:
# Target: the damage_grade column of the encoded training frame.
# NOTE: y_train is reassigned later by the train_test_split call.
y_train = train_final['damage_grade']
# Features: every column of the encoded training frame except damage_grade.
train = train_final.drop(columns='damage_grade')

In [9]:
# y: the damage_grade target taken from the encoded training frame.
y = train_final.damage_grade
# X: all remaining columns of the encoded training frame.
# NOTE(review): building_id (the merge key) is never dropped in the cells
# shown, so it appears to remain in X as a feature - confirm this is intended.
X = train_final.drop(columns=['damage_grade'])

In [12]:
# Hold out 25% of the labelled rows for evaluation; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.25,
)

In [13]:
# Standardize the features. The scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [17]:
# Fit a logistic-regression classifier on the scaled training split
# (class_weight=None: classes are not re-weighted).
logmodel = LogisticRegression(class_weight=None)
logmodel.fit(X_train, y_train)
# fit() returns the estimator itself, so `p` is another name for logmodel.
p = logmodel

In [18]:
# Predict a damage grade for every row of the held-out split (X_test)
predictions = logmodel.predict(X_test)

In [19]:
# Per-class precision / recall / F1 of the predicted damage grades against
# the held-out labels, printed as plain text.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           1       0.59      0.31      0.40      6252
           2       0.60      0.87      0.71     36960
           3       0.55      0.20      0.30     21591

    accuracy                           0.59     64803
   macro avg       0.58      0.46      0.47     64803
weighted avg       0.58      0.59      0.54     64803



In [20]:
# Overall accuracy and micro-averaged F1 on the held-out split.
# With exactly one label per sample, micro-averaged F1 equals accuracy,
# which is why the two printed values are identical.
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=predictions))
print('F1_score: ', f1_score(y_true=y_test, y_pred=predictions, average='micro'))

Accuracy:  0.5948644352884898
F1_score:  0.5948644352884898
