In [40]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Suppress every warning for the rest of the session (blanket "ignore" filter).
warnings.filterwarnings(action="ignore")

# No cap on the number of columns pandas renders, so wide frames display in full.
pd.options.display.max_columns = None

## 1. Loading Data

In [41]:
# Read the two training CSVs from the working directory: df_x from
# 'train_values.csv' and df_y from 'train_labels.csv'. Both are joined on
# `building_id` in the next step.
df_x, df_y = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv')
)

In [42]:
# Join values and labels on their shared `building_id` key (pandas' default
# inner join), producing one frame that holds the features and damage_grade.
df = pd.merge(df_x, df_y, on='building_id')

In [43]:
# Drop the join key: building_id is not used as a model feature.
# `df` itself is left untouched; the result is a new frame.
new_df = df.drop(columns='building_id')

In [44]:
# Convert the three geo-level id columns to strings so that they are picked up
# as object-dtype columns by the later select_dtypes call.
for geo_col in ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'):
    new_df[geo_col] = new_df[geo_col].apply(str)

In [45]:
# Preview the first five rows of the merged frame (features plus damage_grade).
new_df.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3


In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# Feature matrix: every column of new_df except the target.
X = new_df.drop(columns="damage_grade")
# Target vector: the damage_grade column.
y = new_df["damage_grade"]

# Hold out 30% of the rows for validation. The split is stratified on the
# target so both parts keep the same class proportions, and it is seeded
# (random_state=1) so it is repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=1,
)

In [48]:
# Names of all object-dtype columns in new_df; these are the columns handed
# to the ordinal encoder further down.
obj_cols = new_df.select_dtypes(include='object').columns.tolist()
obj_cols

['geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [49]:
import category_encoders as ce

In [50]:
# Fit the ordinal encoder on the training split only, then apply that same
# fitted mapping to both the training and the validation features.
enc = ce.OrdinalEncoder(cols=obj_cols).fit(train_x, train_y)

ce_train_x = enc.transform(train_x)
ce_valid_x = enc.transform(valid_x)

In [60]:
# Display the encoded training features (pandas truncates the middle rows).
ce_train_x

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
205517,1,1,1,2,5,16,5,1,1,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0
123626,2,2,2,2,30,9,5,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
168397,3,3,3,2,0,13,5,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0
210731,4,4,4,2,5,9,4,1,1,1,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
242940,3,5,5,2,15,10,3,2,1,1,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238779,7,8,68,3,20,10,8,1,1,2,1,1,2,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
246607,7,54,1285,4,5,14,12,1,2,3,2,2,2,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
190133,18,834,7106,2,35,8,5,2,1,1,1,4,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
101926,3,655,1438,2,10,7,5,2,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0


In [63]:
# Summary statistics of the encoded training features.
ce_train_x.describe()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
count,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0,182420.0
mean,10.404682,399.065689,3513.320393,2.129849,26.581022,8.014911,5.434278,1.20023,1.34329,1.361216,1.30535,1.84926,1.293707,1.072191,0.088329,0.761293,0.034207,0.018244,0.06832,0.0756,0.25449,0.085177,0.043329,0.015689,0.014768,1.085243,0.985007,0.112312,0.064187,0.034114,0.008245,0.000915,0.000395,0.001162,0.000164,0.000164,4.9e-05,0.004994
std,7.136172,304.60347,2629.761036,0.727722,74.02891,4.398382,1.920391,0.473139,0.857964,0.596475,0.670423,1.194826,0.602575,0.440598,0.283774,0.426295,0.18176,0.133832,0.252296,0.264358,0.435575,0.279146,0.203596,0.12427,0.120624,0.465324,0.419525,0.315751,0.245087,0.181521,0.090426,0.030243,0.019863,0.034071,0.012823,0.012823,0.007024,0.070492
min,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,134.0,1284.0,2.0,10.0,5.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,341.0,2997.0,2.0,15.0,7.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.0,615.0,5312.0,2.0,30.0,9.0,6.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,31.0,1401.0,11191.0,9.0,995.0,96.0,32.0,3.0,5.0,3.0,5.0,4.0,4.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
# Summary statistics of the encoded validation features.
# NOTE(review): geo_level_2_id and geo_level_3_id have a minimum of -1 here but
# not in the training summary — presumably the encoder's code for categories
# that were not present in train_x; confirm against the encoder settings.
ce_valid_x.describe()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
count,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0,78181.0
mean,10.396349,402.958212,3576.541065,2.12943,26.427713,8.025377,5.434569,1.199972,1.342334,1.35877,1.303002,1.858725,1.292667,1.073534,0.089382,0.763434,0.034625,0.018214,0.067766,0.074494,0.256149,0.084624,0.040867,0.016257,0.01549,1.084535,0.981479,0.110871,0.064824,0.032489,0.007764,0.000998,0.000281,0.000857,0.000243,0.000102,0.000179,0.005411
std,7.142751,306.37002,2688.183577,0.727536,72.474534,4.377864,1.91382,0.473331,0.858495,0.593538,0.669925,1.201116,0.60221,0.457761,0.285297,0.424977,0.182829,0.133726,0.251345,0.262574,0.436508,0.278323,0.197983,0.126464,0.123491,0.463482,0.415718,0.313974,0.246217,0.177295,0.087772,0.031571,0.016773,0.029262,0.015587,0.010115,0.013381,0.073357
min,1.0,-1.0,-1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,138.0,1283.0,2.0,10.0,5.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,346.0,3057.0,2.0,15.0,7.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.0,620.0,5432.0,2.0,30.0,9.0,6.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,31.0,1399.0,11189.0,8.0,995.0,100.0,32.0,3.0,5.0,3.0,5.0,4.0,4.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 5. Modeling

In [51]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

In [52]:
# Train a LightGBM multiclass classifier on the encoded training split and
# score it on the held-out validation split.
# The indices in categorical_feature are column positions in ce_train_x: the
# three geo-level ids plus the eight other ordinal-encoded object columns.
LGBM_model = LGBMClassifier(
    objective='softmax', num_class=3, n_jobs=8, is_unbalance=True, n_estimators=188, 
    learning_rate=0.097, num_leaves = 1320, min_data_in_leaf = 120, max_depth=120,
    lambda_l1 = 30, min_gain_to_split = 0.346, bagging_fraction = 0.9, bagging_freq = 1,
    feature_fraction = 0.6, categorical_feature=[0,1,2,7,8,9,10,11,12,13,25])

LGBM_model.fit(ce_train_x, train_y)

# Predict class labels for the validation features.
pred = LGBM_model.predict(ce_valid_x)
pred_labels = np.rint(pred)

# Score against valid_y, the labels that belong to ce_valid_x. The previous
# code referenced `y_test`, which is never defined in this notebook and only
# worked through leftover kernel state.
# Micro-averaged F1 aggregates over all samples rather than per class.
f1 = f1_score(valid_y, pred_labels, average ='micro')

# The printed heading says "Test Set", but the score is computed on the
# validation split created by train_test_split.
print("Test Set Scores:")
print(f"LightGBM score: {f1}")

Test Set Scores:
LightGBM score: 0.7478927105051099


In [54]:
# Encode the full labelled feature set with the already-fitted encoder.
# NOTE(review): `enc` was fitted on train_x only, so categories that occur only
# in the validation rows are not given their own code here — confirm whether
# the encoder should be refitted on all of X before the final fit.
ce_X = enc.transform(X)

# Same hyperparameters as the validation model, collected in one mapping.
final_params = dict(
    objective='softmax',
    num_class=3,
    n_jobs=8,
    is_unbalance=True,
    n_estimators=188,
    learning_rate=0.097,
    num_leaves=1320,
    min_data_in_leaf=120,
    max_depth=120,
    lambda_l1=30,
    min_gain_to_split=0.346,
    bagging_fraction=0.9,
    bagging_freq=1,
    feature_fraction=0.6,
    categorical_feature=[0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 25],
)
final_model = LGBMClassifier(**final_params)

# Train on all labelled rows; as the last expression, the fitted model's repr
# is displayed as the cell output.
final_model.fit(ce_X, y)

LGBMClassifier(bagging_fraction=0.9, bagging_freq=1,
               categorical_feature=[0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 25],
               feature_fraction=0.6, is_unbalance=True, lambda_l1=30,
               learning_rate=0.097, max_depth=120, min_data_in_leaf=120,
               min_gain_to_split=0.346, n_estimators=188, n_jobs=8, num_class=3,
               num_leaves=1320, objective='softmax')

## 7. Predicting

In [72]:
# Read the feature rows to predict on from 'test_values.csv' in the working
# directory; df2 keeps building_id, which is reused when the results are built.
df2 = pd.read_csv('test_values.csv')

In [73]:
# Apply the same preparation that was used for the training frame: drop the
# building_id key and convert the three geo-level ids to strings.
new_df2 = df2.drop(columns='building_id')
for geo_col in ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'):
    new_df2[geo_col] = new_df2[geo_col].apply(str)

In [74]:
# Preview the first five prepared test rows (no damage_grade column here).
new_df2.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [75]:
# Encode the test features with the encoder that was fitted on train_x, so
# categories map to the same integer codes the models were trained with.
X_test = enc.transform(new_df2)

In [76]:
# Preview the encoded test rows.
# NOTE(review): geo_level_2_id and geo_level_3_id are rendered as floats in
# this output, unlike in the encoded training frame — worth checking why.
X_test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,8,679.0,2364.0,3,20,7,6,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
1,7,578.0,6676.0,2,25,13,5,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0
2,4,999.0,10461.0,2,5,4,5,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3,3,10.0,10.0,1,0,19,3,1,1,3,2,3,2,1,0,0,0,0,0,1,0,0,0,0,0,1,2,1,0,0,1,0,0,0,0,0,0,0
4,8,676.0,7745.0,3,15,8,7,1,1,2,1,1,2,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0


In [77]:
# Predict a damage_grade label for every encoded test row using the model
# trained on all labelled data.
prediction = final_model.predict(X_test)

In [78]:
# Pair each prediction with its building_id: index the predictions by
# building_id first, then move that index back out into an ordinary column
# so the frame has the two columns building_id and damage_grade.
df_pred = (
    pd.DataFrame({'damage_grade': prediction}, index=df2['building_id'])
    .reset_index()
)

In [79]:
# Display the results frame (building_id, damage_grade); pandas truncates it.
df_pred

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3
...,...,...
86863,310028,2
86864,663567,3
86865,1049160,2
86866,442785,2


In [81]:
# Write the predictions to 'result1355.csv' in the working directory.
# index=False leaves out the row index, so the file holds only the
# building_id and damage_grade columns.
df_pred.to_csv('result1355.csv', index = False)