In [114]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [115]:
from sklearn.metrics import f1_score, make_scorer

In [116]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

In [117]:
import xgboost as xgb
from xgboost import plot_importance

In [118]:
# Read the feature table and the label table from the working directory.
# NOTE(review): later cells index the labels by the feature frame's row index,
# so the two files are presumably row-aligned — confirm.
train_data, train_label = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("train_values.csv", "train_labels.csv")
)

In [119]:
# Work on a 30% random subsample of the rows. The fixed seed makes the
# subsample (and every score derived from it) reproducible across runs.
train_data = train_data.sample(frac=.30, random_state=42)
indices = train_data.index
# `indices` holds index *labels*, so the matching label rows must be selected
# with .loc. Using .iloc would treat the labels as positions, which fails with
# an out-of-bounds error as soon as this cell is re-run on the already
# subsampled frames.
train_label = train_label.loc[indices]

In [120]:
# Hold out 20% of the sampled rows for validation; the seed fixes the split.
(X_train, X_valid,
 y_train, y_valid) = train_test_split(
    train_data,
    train_label,
    random_state=42,
    test_size=0.2,
)

In [121]:
# Pull the target column out of the full, training and validation label frames.
y_fulltrain_labels, y_train_labels, y_valid_labels = (
    label_frame['damage_grade']
    for label_frame in (train_label, y_train, y_valid)
)

In [122]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the non-numeric columns of a DataFrame via ``pd.get_dummies``.

    ``fit`` records the dummy columns produced by the training frame and
    ``transform`` re-aligns every later frame to that layout. Without this, a
    category that is absent from (or new in) a CV fold or validation set would
    change the number or order of features handed to the downstream steps.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the encoded column layout from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to derive the dummy columns from.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        Encoder
            The fitted transformer itself.
        """
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X):
        """Return a one-hot encoded frame built from ``X``.

        A fitted encoder returns exactly the columns seen in ``fit``: dummies
        missing from ``X`` are added and filled with 0, dummies unseen during
        ``fit`` are dropped. An unfitted encoder falls back to plain
        ``pd.get_dummies`` so direct calls without ``fit`` keep working.
        """
        # get_dummies builds a new frame, so the input is never mutated.
        encoded = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

In [123]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the columns listed in ``items``."""

    def __init__(self, items):
        # Column label(s) to remove in ``transform``.
        self.items = items

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns."""
        return X.copy().drop(columns=self.items)

In [124]:
def f1_micro(y_train_true, y_train_pred):
    """Return the micro-averaged F1 score of the predictions against the true labels."""
    return f1_score(y_train_true, y_train_pred, average='micro')

In [125]:
# Wrap f1_micro as a scikit-learn scorer object.
my_scorer = make_scorer(f1_micro)

In [126]:
# Columns handed to ColumnDrop for removal before encoding.
drop_features = ['building_id']

In [128]:
column_drop = ColumnDrop(drop_features)
encoder = Encoder()

# Gradient-boosted multi-class model. The same configuration is used as the
# ranking estimator inside RFE and as the final step of the pipeline.
# `scale_pos_weight` is intentionally omitted: it only balances the positive
# class of binary objectives and was sitting at its default value of 1.
classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    learning_rate=0.3,
    n_estimators=200,
    max_depth=10,
    n_jobs=-1,
    subsample=0.8,        # row subsampling makes training stochastic ...
    min_child_weight=3,
    random_state=42,      # ... so pin the seed for repeatable rankings/scores
)

# Recursive feature elimination: removes 2 features per iteration; the target
# feature count (n_features_to_select) is supplied by the grid search.
rfe = RFE(estimator=classifier, verbose=1, step=2)

In [129]:
# Candidate feature counts for RFE; the `rfe__` prefix routes the parameter to
# the pipeline step named "rfe".
param_grid = dict(rfe__n_features_to_select=[50, 40, 30])

In [130]:
# Processing chain: drop columns -> one-hot encode -> RFE selection -> classifier.
pipe = Pipeline(steps=[
    ("ColumnDrop", column_drop),
    ("Encoder", encoder),
    ("rfe", rfe),
    ("classifier", classifier),
])

In [131]:
# Rank the candidates with the micro-averaged F1 scorer (`my_scorer`). Without
# an explicit `scoring`, GridSearchCV uses the estimator's own score method and
# the custom scorer would never be applied.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring=my_scorer,
    n_jobs=-1,
    verbose=10,
    cv=3,
)

In [132]:
# Run the 3-fold search over the RFE feature counts on the training split.
# The logged run below took roughly an hour (63.3 min for 9 fits).
grid_search.fit(X_train, y_train_labels)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed: 31.7min remaining: 111.1min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed: 32.0min remaining: 64.1min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed: 40.8min remaining: 50.9min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed: 40.8min remaining: 32.7min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed: 41.0min remaining: 20.5min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 45.7min remaining: 13.0min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 63.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 63.3min finished


Fitting estimator with 68 features.
Fitting estimator with 66 features.
Fitting estimator with 64 features.
Fitting estimator with 62 features.
Fitting estimator with 60 features.
Fitting estimator with 58 features.
Fitting estimator with 56 features.
Fitting estimator with 54 features.
Fitting estimator with 52 features.
Fitting estimator with 50 features.
Fitting estimator with 48 features.
Fitting estimator with 46 features.
Fitting estimator with 44 features.
Fitting estimator with 42 features.


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ColumnDrop',
                                        ColumnDrop(items=['building_id'])),
                                       ('Encoder', Encoder()),
                                       ('rfe',
                                        RFE(estimator=XGBClassifier(base_score=0.5,
                                                                    booster='gbtree',
                                                                    colsample_bylevel=1,
                                                                    colsample_bynode=1,
                                                                    colsample_bytree=1,
                                                                    gamma=0,
                                                                    learning_rate=0.3,
                                                                    max_delta_st

In [150]:
# Per-candidate fit/score timings and cross-validated test scores.
grid_search.cv_results_

{'mean_fit_time': array([1898.33486811, 2445.73597829, 2459.94824847]),
 'std_fit_time': array([ 12.83594612,   5.696236  , 393.84555228]),
 'mean_score_time': array([4.80980992, 2.72139303, 1.84354051]),
 'std_score_time': array([0.21035046, 0.54410446, 0.25228091]),
 'param_rfe__n_features_to_select': masked_array(data=[50, 40, 30],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'rfe__n_features_to_select': 50},
  {'rfe__n_features_to_select': 40},
  {'rfe__n_features_to_select': 30}],
 'split0_test_score': array([0.70932464, 0.71105142, 0.70572717]),
 'split1_test_score': array([0.70961243, 0.71220261, 0.70462394]),
 'split2_test_score': array([0.71407329, 0.71287414, 0.70529547]),
 'mean_test_score': array([0.71100345, 0.71204272, 0.70521553]),
 'std_test_score': array([0.00217388, 0.00075266, 0.00045392]),
 'rank_test_score': array([2, 1, 3])}

In [138]:
# Fitted RFE step of the refit best pipeline, looked up by its step name.
rıfkı = grid_search.best_estimator_.named_steps["rfe"]

In [146]:
# Boolean mask over the encoded features: True where RFE kept the feature.
mrr = rıfkı.support_
# Number of features retained by the best RFE.
np.count_nonzero(mrr)

40

In [147]:
from itertools import compress

# Rebuild the encoded training frame with the *fitted* pre-processing steps of
# the best pipeline rather than the unfitted module-level instances, so the
# column names and order are the ones RFE actually saw during the refit.
fitted_steps = grid_search.best_estimator_.named_steps
df1 = fitted_steps["ColumnDrop"].transform(X_train)
df1 = fitted_steps["Encoder"].transform(df1)
features_list = df1.columns.to_list()

# The mask must have one entry per encoded column, otherwise pairing names
# with mask values below would silently mislabel features.
assert len(features_list) == len(rıfkı.support_), (
    "encoded feature count does not match the RFE support mask"
)

# support_ is True for retained features; invert it to flag the eliminated ones.
arr = np.logical_not(rıfkı.support_)

In [148]:
# Names of the encoded features flagged in `arr` (the inverted RFE support
# mask), i.e. the features RFE eliminated.
ans = [feature for feature, flagged in zip(features_list, arr) if flagged]
ans

['count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'has_secondary_use_rental',
 'has_secondary_use_institution',
 'has_secondary_use_school',
 'has_secondary_use_industry',
 'has_secondary_use_gov_office',
 'has_secondary_use_use_police',
 'land_surface_condition_n',
 'land_surface_condition_t',
 'foundation_type_u',
 'roof_type_n',
 'roof_type_q',
 'ground_floor_type_f',
 'ground_floor_type_m',
 'other_floor_type_s',
 'other_floor_type_x',
 'position_o',
 'position_s',
 'position_t',
 'plan_configuration_c',
 'plan_configuration_f',
 'plan_configuration_m',
 'plan_configuration_n',
 'plan_configuration_o',
 'plan_configuration_s']

In [149]:
# Number of feature names collected in `ans`.
print(len(ans))

28


In [52]:
# Best mean cross-validated score found by the search.
# NOTE(review): this cell's execution count (52) predates the fit cell (132),
# and the value shown differs from the top mean_test_score in cv_results_ —
# the displayed output looks stale; re-run to confirm.
grid_search.best_score_

0.7138334612432847