In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Model preparation using Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
%matplotlib inline

In [44]:
train_data = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
test_labels = pd.read_csv("test_values.csv")

In [45]:
#create new variable to hold all train data merged with the labels aka add damage_grade to train_data
building_damage = train_data.merge(train_labels,how='inner',on='building_id')

In [46]:
#drop unnecessary attributes
#kept has_secondary_use...agriculture, hotel, rental
building_damage = building_damage.drop(columns="has_secondary_use")              #  included among other has_secondary_use
building_damage = building_damage.drop(columns="has_secondary_use_institution")  #  244 1's
building_damage = building_damage.drop(columns="has_secondary_use_school")       #   94 1's
building_damage = building_damage.drop(columns="has_secondary_use_industry")     #  279 1's
building_damage = building_damage.drop(columns="has_secondary_use_health_post")  #   49 1's
building_damage = building_damage.drop(columns="has_secondary_use_gov_office")   #   38 1's
building_damage = building_damage.drop(columns="has_secondary_use_use_police")   #   23 1's
building_damage = building_damage.drop(columns="has_secondary_use_other")        #  other has no clear relationship with damage_grade
building_damage = building_damage.drop(columns="has_superstructure_other")

In [47]:
#create subset for instances less than 250 years old
building_damage = building_damage[building_damage['age'] <= 250]

In [48]:
age_category = np.zeros(shape=(len(building_damage), 1))
area_category = np.zeros(shape=(len(building_damage), 1))
height_category = np.zeros(shape=(len(building_damage), 1))
j = 0
for i in building_damage.index:

    # age
    if building_damage.age[i] < 10:
        age_category[j] = 0  # 0 - 9
    elif building_damage.age[i] < 20:
        age_category[j] = 1  # 10 - 19
    elif building_damage.age[i] < 50:
        age_category[j] = 2  # 20 - 49
    elif building_damage.age[i] < 75:
        age_category[j] = 3  # 50 - 74
    elif building_damage.age[i] < 125:
        age_category[j] = 4  # 75 - 124
    else:
        age_category[j] = 5  # 125+

    # area_percentage
    if building_damage.area_percentage[i] < 15:
        area_category[j] = 0  # 0 - 14
    elif building_damage.area_percentage[i] < 32:
        area_category[j] = 1  # 15 - 32
    elif building_damage.area_percentage[i] < 42:
        area_category[j] = 2  # 33 - 41
    elif building_damage.area_percentage[i] < 52:
        area_category[j] = 3  # 42 - 51
    elif building_damage.area_percentage[i] < 62:
        area_category[j] = 4  # 52 - 61
    else:
        area_category[j] = 5  # 62 - 100

    # area percentage
    if building_damage.height_percentage[i] < 25:
        height_category[j] = 0  # 0 - 24
    else:
        height_category[j] = 1  # 25+

    j = j + 1

building_damage.loc[:, "age_category"] = age_category
building_damage.loc[:, "area_category"] = area_category
building_damage.loc[:, "height_category"] = height_category

building_damage = building_damage.drop(columns="age")
building_damage = building_damage.drop(columns="area_percentage")
building_damage = building_damage.drop(columns="height_percentage")

In [49]:
#categorical features
cat_feats = ['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position','legal_ownership_status',
       'plan_configuration']

In [50]:
#get rid of dummies instances
train_final = pd.get_dummies(building_damage, columns=cat_feats,drop_first=True)
test_final = pd.get_dummies(test_labels,columns=cat_feats,drop_first=True)

In [51]:
#set y_train equal to the damage grade in training set
y_train=train_final.damage_grade
#set train equal to the rest of the attributes besides damage_grade in training set
train=train_final.drop('damage_grade',axis=1)

In [52]:
#set X equal to the rest of the attributes besides damage_grade in the training set
X = train_final.drop('damage_grade',axis=1)
#set y equal to the damage grade in training set
y = train_final['damage_grade']

In [53]:
#split training set into a training and a test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=5)

In [54]:
#Scale features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [55]:
#create decision tree and model
dtree = DecisionTreeClassifier()
y_train = list(y_train)
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [56]:
#make predictions of model
predictions = dtree.predict(X_test)

In [57]:
#get detailed report of results
report = classification_report(y_test,predictions)
print(report)

              precision    recall  f1-score   support

           1       0.48      0.49      0.48      5079
           2       0.71      0.69      0.70     29569
           3       0.60      0.61      0.60     17195

    accuracy                           0.65     51843
   macro avg       0.60      0.60      0.60     51843
weighted avg       0.65      0.65      0.65     51843



In [58]:
print('Accuracy: ', accuracy_score(y_test,predictions))
print('F1_score: ', f1_score(y_test,predictions, average='micro'))

Accuracy:  0.6466446771984646
F1_score:  0.6466446771984646


In [59]:
#clean test set
test_values_subset = test_labels.drop(columns="has_secondary_use")              #  included among other has_secondary_use
test_values_subset = test_values_subset.drop(columns="has_secondary_use_institution")  #  244 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_school")       #   94 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_industry")     #  279 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_health_post")  #   49 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_gov_office")   #   38 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_use_police")   #   23 1's
test_values_subset = test_values_subset.drop(columns="has_secondary_use_other")        #  other has no clear relationship with damage_grade
test_values_subset = test_values_subset.drop(columns="has_superstructure_other")

In [60]:
age_category_test = np.zeros(shape=(len(test_values_subset), 1))
area_category_test = np.zeros(shape=(len(test_values_subset), 1))
height_category_test = np.zeros(shape=(len(test_values_subset), 1))
j = 0
for i in test_values_subset.index:

    # age
    if test_values_subset.age[i] < 10:
        age_category_test[j] = 0  # 0 - 9
    elif test_values_subset.age[i] < 20:
        age_category_test[j] = 1  # 10 - 19
    elif test_values_subset.age[i] < 50:
        age_category_test[j] = 2  # 20 - 49
    elif test_values_subset.age[i] < 75:
        age_category_test[j] = 3  # 50 - 74
    elif test_values_subset.age[i] < 125:
        age_category_test[j] = 4  # 75 - 124
    else:
        age_category_test[j] = 5  # 125+

    # area_percentage
    if test_values_subset.area_percentage[i] < 15:
        area_category_test[j] = 0  # 0 - 14
    elif test_values_subset.area_percentage[i] < 32:
        area_category_test[j] = 1  # 15 - 32
    elif test_values_subset.area_percentage[i] < 42:
        area_category_test[j] = 2  # 33 - 41
    elif test_values_subset.area_percentage[i] < 52:
        area_category_test[j] = 3  # 42 - 51
    elif test_values_subset.area_percentage[i] < 62:
        area_category_test[j] = 4  # 52 - 61
    else:
        area_category[j] = 5  # 62 - 100

    # area percentage
    if test_values_subset.height_percentage[i] < 25:
        height_category[j] = 0  # 0 - 24
    else:
        height_category[j] = 1  # 25+

    j = j + 1

test_values_subset.loc[:, "age_category"] = age_category_test
test_values_subset.loc[:, "area_category"] = area_category_test
test_values_subset.loc[:, "height_category"] = height_category_test

test_values_subset = test_values_subset.drop(columns="age")
test_values_subset = test_values_subset.drop(columns="area_percentage")
test_values_subset = test_values_subset.drop(columns="height_percentage")

test_final = pd.get_dummies(test_values_subset,columns=cat_feats,drop_first=True)

In [61]:
#make predictions

test_predict = dtree.predict(test_final.values)
print(test_predict)

[3 3 3 ... 3 3 3]


In [62]:
#create submission
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
print(submission_format)

             damage_grade
building_id              
300051                  1
99355                   1
890251                  1
745817                  1
421793                  1
...                   ...
310028                  1
663567                  1
1049160                 1
442785                  1
501372                  1

[86868 rows x 1 columns]


In [63]:
my_submission = pd.DataFrame(data=test_predict,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [64]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,3
890251,3
745817,3
421793,3


In [65]:
my_submission.to_csv('submission.csv')