In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
%matplotlib inline

In [2]:
#import CSVs
train_data = pd.read_csv("train_values.csv",index_col='building_id')
train_labels = pd.read_csv("train_labels.csv", index_col='building_id')
test_labels = pd.read_csv("test_values.csv", index_col='building_id')

In [3]:
#create new variable to hold all train data merged with the labels aka add damage_grade to train_data
building_damage = train_data.merge(train_labels,how='inner',on='building_id')

In [4]:
#drop unnecessary attributes
building_damage = building_damage.drop(columns="has_secondary_use")
building_damage = building_damage.drop(columns="has_secondary_use_agriculture")
#this is new - JT
building_damage = building_damage.drop(columns="has_secondary_use_use_police")
building_damage = building_damage.drop(columns="has_secondary_use_rental")
building_damage = building_damage.drop(columns="has_secondary_use_school")
building_damage = building_damage.drop(columns="has_secondary_use_industry")
building_damage = building_damage.drop(columns="has_secondary_use_health_post")

In [5]:
#create subset for instances less than 250 years old
building_damage = building_damage[building_damage['age'] <= 250]

In [6]:
#categorical features
cat_feats = ['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position','legal_ownership_status',
       'plan_configuration']

In [7]:
#get rid of dummies instances
train_final = pd.get_dummies(building_damage, columns=cat_feats,drop_first=True)
test_final = pd.get_dummies(test_labels,columns=cat_feats,drop_first=True)

In [8]:
#set y_train equal to the damage grade in training set
y_train=train_final.damage_grade
#set train equal to the rest of the attributes besides damage_grade in training set
train=train_final.drop('damage_grade',axis=1)

In [9]:
#set X equal to the rest of the attributes besides damage_grade in the training set
X = train_final.drop('damage_grade',axis=1)
#set y equal to the damage grade in training set
y = train_final['damage_grade']

In [10]:
#split training set into a training and a test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=5)

In [11]:
#Scale features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
#RaindomForrest Model
RF = RandomForestClassifier(n_estimators=100) #n_estimators part is new

In [13]:
#setting up paramters for GridSearchCV
trees = [50, 55, 60,  65, 70, 75, 80, 85, 90, 95, 100]
param_grid = {'n_estimators': trees}
grid_search = GridSearchCV(RF, param_grid, cv=5)

In [14]:
#fit to grid search model
grid_search.fit(X_train,y_train)

In [15]:
#get best parameters from GridCV
params = grid_search.best_params_
print(params)

{'n_estimators': 100}


In [16]:
#build RandomForestClassifier model
rfc = RandomForestClassifier(n_estimators=100)

In [17]:
#fit to RandomForestClassifier model
rfc.fit(X_train,y_train)

In [18]:
#make predictions of model
predictions = rfc.predict(X_test)

In [19]:
report = classification_report(y_test,predictions)
print(report)

              precision    recall  f1-score   support

           1       0.65      0.47      0.55      6252
           2       0.72      0.83      0.77     36960
           3       0.71      0.60      0.65     21591

    accuracy                           0.72     64803
   macro avg       0.70      0.63      0.66     64803
weighted avg       0.71      0.72      0.71     64803



In [20]:
print('Accuracy: ', accuracy_score(y_test,predictions))
print('F1_score: ', f1_score(y_test,predictions, average='micro'))

Accuracy:  0.7153218215206086
F1_score:  0.7153218215206087


In [21]:
#clean test set
test_values_subset = test_labels.drop(columns='has_secondary_use')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_agriculture')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_use_police')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_rental')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_school')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_industry')
test_values_subset = test_values_subset.drop(columns='has_secondary_use_health_post')

#create subset for instances less than 250 years old
#test_values_subset = test_values_subset[test_values_subset['age'] <= 250] this reduces the amount of instances to which the submission format is concerned which causes errors

test_final = pd.get_dummies(test_values_subset,columns=cat_feats,drop_first=True)

In [22]:
#make predictions

test_predict = rfc.predict(test_final.values)
print(test_predict)

[3 3 3 ... 2 3 2]


In [23]:
#create submission
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
print(submission_format)

             damage_grade
building_id              
300051                  1
99355                   1
890251                  1
745817                  1
421793                  1
...                   ...
310028                  1
663567                  1
1049160                 1
442785                  1
501372                  1

[86868 rows x 1 columns]


In [24]:
my_submission = pd.DataFrame(data=test_predict,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [25]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,3
890251,3
745817,2
421793,2


In [26]:
my_submission.to_csv('submission.csv')