In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
#from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#from sklearn.feature_selection import RFE
#from sklearn.svm import SVC
#from sklearn.multiclass import OneVsRestClassifier
#from sklearn.ensemble import BaggingClassifier

In [3]:
import xgboost as xgb
from xgboost import plot_importance

In [4]:
# Read the feature table and the label table from the working directory.
# Both files are comma-separated.
train_data = pd.read_csv("train_values.csv", sep=",")
train_label = pd.read_csv("train_labels.csv", sep=",")

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Pull the target column out of the full label table and out of each split so
# the estimators receive a 1-D Series instead of a DataFrame.
y_fulltrain_labels, y_train_labels, y_valid_labels = (
    label_frame['damage_grade']
    for label_frame in (train_label, y_train, y_valid)
)

In [7]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pd.get_dummies``.

    ``pd.get_dummies`` only creates indicator columns for the categories
    present in the frame it is given, so encoding two frames independently
    can produce different column sets. To keep the layout stable, ``fit``
    records the encoded column index of the fitting frame and ``transform``
    aligns its output to that index.

    Calling ``transform`` on an instance that was never fitted is still
    supported and simply returns the plain ``pd.get_dummies`` result.

    Attributes
    ----------
    columns_ : pandas.Index
        Encoded column layout learned by ``fit`` (only set after ``fit``).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the encoded column layout of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose encoded columns define the reference layout.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        Encoder
            The fitted instance.
        """
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X):
        """Return a one-hot encoded copy of ``X``.

        When the encoder has been fitted, indicator columns missing from
        ``X`` are added and filled with 0, and columns that were not seen
        during ``fit`` are dropped, so the output always has the fitted
        layout. The input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to encode.

        Returns
        -------
        pandas.DataFrame
            The encoded frame.
        """
        # get_dummies builds a new frame, so no defensive copy is required.
        encoded = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

In [8]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    items : list of str
        Column labels to remove from every frame passed to ``transform``.
    """

    def __init__(self, items):
        self.items = items

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the configured columns.

        ``DataFrame.drop`` is not in-place and already returns a new frame,
        so the input is left untouched without an extra full copy.

        Raises
        ------
        KeyError
            If any label in ``items`` is not a column of ``X``.
        """
        return X.drop(columns=self.items)

In [28]:
# Column labels to remove before modelling. Names with a trailing category
# letter (e.g. "roof_type_n") look like one-hot indicator columns, so this
# list presumably has to be applied to an already encoded frame.
drop_features = [
    # identifier / numeric
    "building_id",
    "count_floors_pre_eq",
    # secondary-use flags
    "has_secondary_use_rental",
    "has_secondary_use_institution",
    "has_secondary_use_school",
    "has_secondary_use_industry",
    "has_secondary_use_gov_office",
    "has_secondary_use_use_police",
    # land surface / foundation / roof
    "land_surface_condition_n",
    "land_surface_condition_t",
    "foundation_type_u",
    "roof_type_n",
    "roof_type_q",
    # floor types
    "ground_floor_type_f",
    "ground_floor_type_m",
    "other_floor_type_s",
    "other_floor_type_x",
    # position
    "position_o",
    "position_s",
    "position_t",
    # plan configuration
    "plan_configuration_c",
    "plan_configuration_f",
    "plan_configuration_m",
    "plan_configuration_n",
    "plan_configuration_o",
    "plan_configuration_s",
]
# Disabled drop candidates (currently kept as features):
#   'age', 'area_percentage', 'height_percentage'



In [29]:
# Pre-processing building blocks.
column_drop = ColumnDrop(items=drop_features)
encoder = Encoder()
sca = StandardScaler()
#pca = PCA(n_components= 35)

# Gradient-boosted tree classifier for the multi-class target.
classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    n_estimators=200,
    learning_rate=0.3,
    max_depth=10,
    min_child_weight=3,
    subsample=0.8,
    scale_pos_weight=1,
    n_jobs=-1,
    #colsample_bytree = 0.3
)

In [30]:
# One-hot encode both splits. pd.get_dummies only creates indicator columns
# for categories that occur in the frame it is given, so a category missing
# from one split would yield mismatched columns. The validation frame is
# therefore re-aligned to the training layout: indicators absent from the
# validation split are added as all-zero columns and any indicator the
# training split never produced is dropped. When both layouts already match
# this is a no-op.
X_train = encoder.transform(X_train)
X_valid = encoder.transform(X_valid).reindex(columns=X_train.columns, fill_value=0)

In [31]:
# Step variants that are currently disabled:
#   ("classifier", classifier)
#   ("Encoder", encoder)
#   ("rfe", selector)
pipe = Pipeline(
    [
        ("ColumnDrop", column_drop),
        ("Scaler", sca),
        ("classifier", classifier),
    ]
)

In [32]:
# Fit every pipeline step on the training split; as the last expression of
# the cell, the fitted pipeline is also displayed.
pipe.fit(X=X_train, y=y_train_labels)

Pipeline(memory=None,
         steps=[('ColumnDrop',
                 ColumnDrop(items=['building_id', 'count_floors_pre_eq',
                                   'has_secondary_use_rental',
                                   'has_secondary_use_institution',
                                   'has_secondary_use_school',
                                   'has_secondary_use_industry',
                                   'has_secondary_use_gov_office',
                                   'has_secondary_use_use_police',
                                   'land_surface_condition_n',
                                   'land_surface_condition_t',
                                   'foundation_type_u', 'ro...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.3,
                               max_delta_step=0, max_depth=10,
             

In [33]:
# Predict on the held-out split and score it with the micro-averaged F1.
y_pred = pipe.predict(X_valid)
score = f1_score(
    y_true=y_valid_labels,
    y_pred=y_pred,
    average='micro',
)

In [34]:
# Print the value currently held in `score`.
print(score)

0.7420233686997565


In [35]:
# Rebuild the column layout the classifier was trained on. X_train has to be
# one-hot encoded already for the column drop to succeed (the drop list
# contains indicator columns), so running the encoder again would change
# nothing and is left out.
tmp = column_drop.transform(X_train)

# Look the classifier up by step name rather than by position so the lookup
# keeps working if pipeline steps are added or reordered.
importances = pipe.named_steps["classifier"].feature_importances_

# zip() silently truncates to the shorter input; fail loudly instead of
# printing misaligned name/importance pairs.
assert len(tmp.columns) == len(importances), (
    "feature names and importances differ in length"
)

for name, importance in zip(tmp.columns, importances):
    print(name, " - ", importance)

geo_level_1_id  -  0.06396525
geo_level_2_id  -  0.021107413
geo_level_3_id  -  0.013024981
age  -  0.010517519
area_percentage  -  0.009214648
height_percentage  -  0.010465817
has_superstructure_adobe_mud  -  0.015065025
has_superstructure_mud_mortar_stone  -  0.045093052
has_superstructure_stone_flag  -  0.027615191
has_superstructure_cement_mortar_stone  -  0.017488739
has_superstructure_mud_mortar_brick  -  0.02146729
has_superstructure_cement_mortar_brick  -  0.032480203
has_superstructure_timber  -  0.013861862
has_superstructure_bamboo  -  0.012748486
has_superstructure_rc_non_engineered  -  0.013679515
has_superstructure_rc_engineered  -  0.015307283
has_superstructure_other  -  0.017964084
count_families  -  0.013078364
has_secondary_use  -  0.0142412875
has_secondary_use_agriculture  -  0.013905179
has_secondary_use_hotel  -  0.010339242
has_secondary_use_health_post  -  0.0076694996
has_secondary_use_other  -  0.014826225
land_surface_condition_o  -  0.014875086
foundation_