# Richter Predictor

Library imports

In [22]:
from pathlib import Path

import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# I. Wrangle Data

In [38]:
# Load the training features and labels, both indexed by building_id.
X = pd.read_csv('data/train_values.csv', index_col='building_id')
y = pd.read_csv('data/train_labels.csv', index_col='building_id')['damage_grade']

# train_test_split pairs X and y by row position, not by index label, so align
# the labels to the feature rows explicitly instead of trusting CSV row order.
y = y.reindex(X.index)
assert y.notna().all(), 'train_labels.csv has no label for some building_id in train_values.csv'

# Unlabelled features, used only when building the submission.
X_test = pd.read_csv('data/test_values.csv', index_col='building_id')

In [11]:
# Preview the first five rows of the training features.
X.head(n=5)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


# II. Split Data

In [13]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# III. Establish Baseline

In [17]:
# Relative frequency of each label in the training set; the largest share is
# the accuracy obtained by always predicting the most common label.
class_shares = y_train.value_counts(normalize=True)
print('Baseline Accuracy:', class_shares.max())

Baseline Accuracy: 0.5697045280122793


In [20]:
# Derive the majority class from the training labels instead of hard-coding it,
# so the baseline stays correct if the data or the split changes.
majority_class = y_train.value_counts().idxmax()
baseline_pred = [majority_class] * len(y_train)
print('Baseline F1:', f1_score(y_train, baseline_pred, average='micro'))

Baseline F1: 0.5697045280122793


# IV. Build Model

In [24]:
# Pipeline: encode the features with OrdinalEncoder, then classify with a
# random forest. n_jobs=-1 lets the forest use all available cores and the
# fixed seed keeps the fitted model reproducible.
encoder = OrdinalEncoder()
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
model = make_pipeline(encoder, forest)

# Trailing semicolon suppresses the fitted-pipeline repr in the cell output.
model.fit(X_train, y_train);

# V. Check Metrics

In [28]:
# Score the fitted model on both splits; a large gap between training and
# validation accuracy indicates overfitting.
train_accuracy = model.score(X_train, y_train)
val_accuracy = model.score(X_val, y_val)
val_f1 = f1_score(y_val, model.predict(X_val), average='micro')

print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', val_accuracy)
print('Validation F1 Score:', val_f1)

Training Accuracy: 0.9867709132770529
Validation Accuracy: 0.7200168837896433
Validation F1 Score: 0.7200168837896433


# VI. Make Submission

In [42]:
# Predict on the unlabelled test set and write a CSV whose name carries a
# minute-resolution timestamp, so separate runs produce separate files.
submission_dir = Path('submissions')
# DataFrame.to_csv does not create missing directories; ensure the target
# exists so the cell also works on a fresh checkout.
submission_dir.mkdir(parents=True, exist_ok=True)

timestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M_')
y_pred = model.predict(X_test)
# Keep building_id as the index so it is written as the first CSV column.
submission = pd.DataFrame({'damage_grade': y_pred}, index=X_test.index)
submission.to_csv(submission_dir / f'{timestamp}submission.csv')