In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
from sklearn.model_selection import GridSearchCV, train_test_split,  ParameterGrid
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [18]:
import xgboost as xgb

In [19]:
class Encoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X, y=None):
        return self
    def transform(self, X):
        Xt = X.copy()
        encoded = pd.get_dummies(Xt)
        return encoded

In [20]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    def __init__(self, items):
        self.items = items
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xt = X.copy()
        dropped_data = Xt.drop(self.items, axis=1)
        return dropped_data

In [21]:
train_data = pd.read_csv("train_values.csv", delimiter=",")

train_label = pd.read_csv("train_labels.csv", delimiter=",")

test_data = pd.read_csv("test_values.csv", delimiter=",")

In [22]:
#Get half of samples
train_data = train_data.sample(frac=.30)
indices = train_data.index
train_label = train_label.iloc[indices]

In [23]:
train_data.shape

(78180, 39)

In [24]:
train_label.shape

(78180, 2)

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_label, test_size=0.2, random_state=42)

In [26]:
y_train_labels = y_train['damage_grade']
y_valid_labels = y_valid['damage_grade']

In [27]:
param_grid = {"classifier__max_depth": [10, 15, 20],
                "classifier__learning_rate": [00.1, 0.1, 0.3],
                "classifier__n_estimators": [150, 200],
                 "classifier__colsample_bytree" : [0.5, 0.8]}

In [30]:
param_grid = {"classifier__max_depth": [10],
                "classifier__learning_rate": [00.1],
                "classifier__n_estimators": [200],
                 "classifier__colsample_bytree" : [0.8]}

In [28]:
drop_features = [
'building_id', 'has_secondary_use_gov_office', 
'has_secondary_use_use_police', 'has_secondary_use_institution','has_secondary_use_health_post',
'has_secondary_use_school','has_secondary_use_industry'
]

In [29]:
column_drop = ColumnDrop(drop_features)
encoder = Encoder()
classifier = xgb.XGBClassifier(n_jobs=-1, scale_pos_weight=1, objective = 'multi:softmax')
sca = StandardScaler()
pipe = Pipeline([("ColumnDrop", column_drop), ("Encoder", encoder), ("Scaler", sca), ("classifier", classifier)])
grid_search = GridSearchCV(pipe, param_grid, cv =3, n_jobs =-1, verbose = 10, refit=True)

In [30]:
grid_search.fit(X_train , y_train_labels)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 30.8min
[Parallel(n_jobs=-1)]: Done 104 out of 108 | elapsed: 39.7min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 41.2min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ColumnDrop',
                                        ColumnDrop(items=['building_id',
                                                          'has_secondary_use_gov_office',
                                                          'has_secondary_use_use_police',
                                                          'has_secondary_use_institution',
                                                          'has_secondary_use_health_post',
                                                          'has_secondary_use_school',
                                                          'has_secondary_use_industry'])),
                                       ('Encoder', Encoder()),
                                       ('Scaler',
                                        StandardScaler(...
                                                      scale_pos_weight=1,
       

In [33]:
grid_search.best_params_

{'classifier__colsample_bytree': 0.5,
 'classifier__learning_rate': 0.1,
 'classifier__max_depth': 10,
 'classifier__n_estimators': 200}

In [37]:
grid_search.cv_results_

{'mean_fit_time': array([ 78.93422437, 111.58667827, 145.19462768, 194.9435997 ,
        201.82831462, 234.56496239,  78.86312707,  98.33903607,
        116.34799727, 155.51829441, 164.74928641, 215.31811349,
         71.2192963 ,  94.68589997, 111.67947451, 147.12905105,
        152.98804712, 204.52097376, 107.80549908, 138.66202593,
        165.93212509, 221.64586163, 231.02047332, 311.66258152,
        105.02624933, 138.48383959, 173.81304987, 223.05742526,
        231.35358334, 308.21811152, 101.53126987, 134.08026544,
        163.57176773, 207.74797877, 210.15833855, 220.87729009]),
 'std_fit_time': array([ 0.81353448,  0.80294124,  4.34964476,  1.14754693,  0.74211259,
        13.43915885,  3.0563337 ,  2.85259124,  2.83867451,  3.04267029,
         2.44794108,  5.55913429,  0.81413556,  1.70861105,  1.21721519,
         1.99786118,  2.54132092,  2.00356314,  4.24316405,  1.27337292,
         2.0964013 ,  1.91630372,  4.20812895,  3.14366263,  1.23254571,
         4.84839798,  5.

In [34]:
grid_searched = grid_search.predict(X_valid)

score = f1_score(y_valid_labels, grid_searched, average='micro')

In [35]:
print(score)

0.7258889741621897


In [None]:
'''
max_f1_score = 0.0

pipelines = []
params = []
scores = []

for g in list(ParameterGrid(param_grid)):
    #add dicts
    params.append(g)
    
    column_drop = ColumnDrop(drop_features)
    encoder = Encoder()
    classifier = xgb.XGBClassifier(n_jobs=-1, scale_pos_weight=1, objective = 'multi:softmax')
    sca = StandardScaler()
    
    
    pipe = Pipeline([("ColumnDrop", column_drop), ("Encoder", encoder), ("Scaler", sca), ("classifier", classifier)])
    
    pipe.set_params(**g)
    print("Fitting begins.")
    pipe.fit(X_train, y_train_labels)
    print("Fitting completed.")
    #add pipelines
    pipelines.append(pipe)
    
    y_pred = pipe.predict(X_valid)
    score = f1_score(y_valid_labels, y_pred, average='micro')
    print(score)
    print("Finished.")
    #add scores
    scores.append(score)
'''