In [331]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from category_encoders import JamesSteinEncoder as catencoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow_addons as tfa
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from catboost import CatBoostClassifier
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [332]:
train = pd.read_csv('../Data/train_values.csv', index_col='building_id')
labels = pd.read_csv('../Data/train_labels.csv', index_col='building_id')

In [333]:
cat_cols = list(train.select_dtypes('object').columns)

bin_cols = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_engineered',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_other',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other'
]

all_cat_cols = cat_cols + bin_cols

num_cols = list(train.select_dtypes('number').columns)
num_cols = [x for x in num_cols if x not in bin_cols]

In [334]:
ct = ColumnTransformer([
    ('cat', OrdinalEncoder(), cat_cols),
    #('num', StandardScaler(), num_cols)
], remainder='passthrough')

In [335]:
train_enc = ct.fit_transform(train)

In [336]:
label_enc = OneHotEncoder(sparse=False)
labels_encoded = label_enc.fit_transform(labels)

In [337]:
kbest = SelectKBest(chi2, k=30).fit_transform(train_enc, np.array(labels).ravel())

In [338]:
kbest_scaler = StandardScaler()
train_enc = kbest_scaler.fit_transform(kbest)

# Train Test Split

In [339]:
X_train, X_test, y_train, y_test = train_test_split(train_enc, labels_encoded, test_size=0.3, random_state=42, shuffle=True)

# Random Forest Classifier

In [340]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier()

In [341]:
y_pred = model.predict(X_test)

In [342]:
accuracy_score(y_test, y_pred)

0.6955142553817424

# Neural Network

In [381]:
model = keras.Sequential([
    keras.layers.Dense(38, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(3, activation='softmax')
])

In [382]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tfa.metrics.F1Score(num_classes=3, average='micro'), 'accuracy'])

In [383]:
history = model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_data=(X_test, y_test),
    #callbacks=[h1n1_mc, EarlyStopping],
    shuffle=True,
    verbose=1
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150

KeyboardInterrupt: 

In [384]:
model.evaluate(X_test, y_test, verbose=1, return_dict=True)



{'loss': 0.6994526982307434,
 'f1_score': 0.6727849245071411,
 'accuracy': 0.6727849245071411}

array([[1.0350261e-02, 4.4137168e-01, 5.4827809e-01],
       [7.4194841e-02, 7.8618121e-01, 1.3962391e-01],
       [3.4702048e-02, 6.8229145e-01, 2.8300652e-01],
       ...,
       [6.0159025e-05, 2.3890583e-02, 9.7604918e-01],
       [5.0247245e-04, 4.1901100e-01, 5.8048654e-01],
       [4.8938678e-03, 3.8713560e-01, 6.0797060e-01]], dtype=float32)

# Test and Submission

In [111]:
test = pd.read_csv('../Data/test_values.csv', index_col='building_id')

In [113]:
test_enc = ct.transform(test)

In [114]:
y_predicted = model.predict(test_enc)

In [115]:
predicted_classes = np.argmax(y_predicted, axis=-1)

In [116]:
submission = pd.DataFrame(data=predicted_classes, index=test.index, columns=['damage_grade']).reset_index()
submission['damage_grade'] = submission['damage_grade'].map({
    0: 1,
    1: 2,
    2: 3
})

In [117]:
submission

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3
...,...,...
86863,310028,2
86864,663567,2
86865,1049160,2
86866,442785,2


In [118]:
today = str(datetime.today().date())
submission.to_csv(f'../Submissions/Submission {today}.csv', index=False)