In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
import xgboost as xgb
from xgboost import plot_importance

In [7]:
# Load the raw feature tables and the training labels from CSV files that are
# expected to sit next to the notebook (paths are relative to the working directory).
train_data = pd.read_csv("train_values.csv", sep=",")
train_label = pd.read_csv("train_labels.csv", sep=",")
test_data = pd.read_csv("test_values.csv", sep=",")

In [8]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pandas.get_dummies``.

    ``get_dummies`` derives its output columns from the categories that happen
    to be present in the frame it is given, so two frames encoded independently
    can end up with different columns. To make the transformer usable on
    train/test pairs (and inside a ``Pipeline``), ``fit`` records the encoded
    column layout and ``transform`` aligns its output to that layout.

    Calling ``transform`` on an instance that was never fitted keeps the
    previous behaviour: the frame is encoded as-is with no alignment.

    Attributes
    ----------
    columns_ : pandas.Index
        Encoded column labels seen during ``fit`` (only set after ``fit``).
    """

    def fit(self, X, y=None):
        """Remember the dummy-encoded column layout of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose encoded columns define the output layout.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        Encoder
            ``self``, to allow chaining.
        """
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X):
        """Return the one-hot encoded version of ``X``.

        ``X`` itself is not modified (``get_dummies`` returns a new frame).
        If the encoder has been fitted, columns missing from ``X`` are added
        and filled with 0, and columns not seen during ``fit`` are dropped, so
        the result always has exactly the fitted columns in the fitted order.
        """
        encoded = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            # Align to the layout learned in fit(); unseen categories are dropped,
            # absent ones become all-zero indicator columns.
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

In [15]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    items : label or list of labels
        Column label(s) passed to ``DataFrame.drop`` on every ``transform``.
    """

    def __init__(self, items):
        self.items = items

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns.

        The input frame is left untouched. ``DataFrame.drop`` is called with
        its default error handling, so a label that is not a column of ``X``
        raises ``KeyError``.
        """
        return X.copy().drop(columns=self.items)

In [17]:
# Columns removed from both the training and the test features before encoding.
# NOTE(review): 'has_secondary_use_use_police' has a doubled "use_" — presumably
# this mirrors the CSV header exactly; confirm before "fixing" it.
drop_features = [
    'building_id',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_institution',
    'has_secondary_use_health_post',
    'has_secondary_use_school',
    'has_secondary_use_industry',
]

In [18]:
encoder = Encoder()
column_drop = ColumnDrop(drop_features)

# Apply the same preprocessing to both frames: drop the unused columns, then
# one-hot encode what is left.
# Caveat: this cell overwrites train_data / test_data, so re-running it without
# reloading the CSVs fails in the drop step (the columns are already gone).
train_data = encoder.transform(column_drop.transform(train_data))
test_data = encoder.transform(column_drop.transform(test_data))

# The two frames are encoded independently, and get_dummies only creates
# columns for categories that actually occur in the frame it sees. Force the
# test features onto the training layout (same columns, same order) so that a
# model trained on train_data can score test_data; categories missing from the
# test set become all-zero columns, categories unseen in training are dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [19]:
# Hold out part of the training data for validation.
# The cell that originally produced X_train / X_valid / y_train / y_valid is no
# longer in the notebook, so a fresh "Restart & Run All" failed with NameError
# here. The split is restored with the already-imported train_test_split.
# NOTE(review): test_size=0.2 is chosen to be consistent with the 52121-row
# validation prediction shown further down, and random_state=123 mirrors the
# seed used for xgb.cv — both are reconstructions, confirm against the
# original experiment. Assumes train_data and train_label rows are in the same
# order (building_id has already been dropped from train_data).
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=123,
)

# Target column for the full training set and for each side of the split.
y_fulltrain_labels = train_label['damage_grade']
y_train_labels = y_train['damage_grade']
y_valid_labels = y_valid['damage_grade']

In [20]:
# Wrap the train / validation splits in XGBoost's native DMatrix container so
# they can be used with the xgb.cv / xgb.train API below.
# NOTE(review): the label preview in this notebook shows values 2 and 3, i.e.
# damage_grade looks 1-based, while the model is configured with num_class=3;
# XGBoost's multi-class objectives document labels in [0, num_class). Confirm
# whether the labels need shifting to 0..2 for the installed XGBoost version.
dmatrix_train = xgb.DMatrix(data=X_train, label=y_train_labels)
dmatrix_valid = xgb.DMatrix(data=X_valid, label=y_valid_labels)

In [29]:
# Sanity check: display the labels stored in the training DMatrix.
dmatrix_train.get_label()

array([2., 2., 3., ..., 2., 3., 2.], dtype=float32)

In [41]:
def calc_f1(y_pred, dtrain):
    """Micro-averaged F1 score in the ``(name, value)`` form of a custom XGBoost metric.

    Parameters
    ----------
    y_pred : array-like
        Predictions to score; they are compared directly with the labels, so
        they are presumably class ids (as produced by ``multi:softmax``).
    dtrain : object with ``get_label()``
        Data container holding the true labels (an ``xgb.DMatrix`` in this
        notebook).

    Returns
    -------
    tuple
        ``('f1_micro', score)`` where ``score`` is
        ``f1_score(labels, y_pred, average='micro')``.
    """
    return 'f1_micro', f1_score(dtrain.get_label(), y_pred, average='micro')

In [55]:
# The scikit-learn wrapper is used here as a holder for the hyper-parameters.
# n_estimators is deliberately not set: the number of boosting rounds is passed
# separately as num_boost_round to xgb.cv / xgb.train further down.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    learning_rate=0.3,
    max_depth=10,
    min_child_weight=3,
    subsample=1,
    scale_pos_weight=1,
    n_jobs=-1,
)

In [56]:
# Pull the wrapper's settings out as booster parameters; they are passed as
# `params` to the native xgb.cv and xgb.train calls below.
xgb_param = model.get_xgb_params()

In [59]:
# 3-fold cross-validation on the training split: up to 200 boosting rounds,
# stopping once the monitored metric has not improved for 10 rounds. calc_f1 is
# evaluated as a custom metric next to the booster's own metric.
# NOTE(review): `maximize` is not passed, and F1 is a higher-is-better metric —
# confirm which metric drives early stopping in the installed XGBoost version
# and that its direction is handled correctly.
cv_results = xgb.cv(
    params=xgb_param,
    dtrain=dmatrix_train,
    num_boost_round=200,
    nfold=3,
    feval=calc_f1,
    early_stopping_rounds=10,
    seed=123,
    as_pandas=True,
)

In [60]:
# Preview the per-round cross-validation metrics (mean and std across folds).
cv_results.head()

Unnamed: 0,train-f1_micro-mean,train-f1_micro-std,train-merror-mean,train-merror-std,test-f1_micro-mean,test-f1_micro-std,test-merror-mean,test-merror-std
0,0.702713,0.000995,0.297287,0.000995,0.689553,0.001551,0.310447,0.001551
1,0.708941,0.000458,0.291059,0.000458,0.694656,0.002233,0.305344,0.002233
2,0.714171,0.000753,0.285829,0.000753,0.698028,0.001093,0.301972,0.001093
3,0.717251,0.001378,0.282749,0.001378,0.700019,0.000997,0.299981,0.000997
4,0.72147,0.004351,0.27853,0.004351,0.702475,0.001303,0.297525,0.001303


In [62]:
# (Leftover interactive help lookup for xgb.train removed — it is IPython-only
# syntax and not part of the analysis; see the XGBoost Python API reference
# for the parameters of xgb.train.)

In [68]:
# Fit the final booster on the training split with the same parameters that
# were cross-validated above. Despite the name, xgb.train is the native API,
# not the scikit-learn classifier wrapper.
# NOTE(review): the round count is fixed at 200 and does not use the number of
# rounds kept by the early-stopped cross-validation (len(cv_results)) —
# confirm this is intended.
xgb_clf = xgb.train(
    params=xgb_param,
    dtrain=dmatrix_train,
    num_boost_round=200,
)

In [76]:
# Predict on the held-out validation split.
preds = xgb_clf.predict(dmatrix_valid)
# One prediction per validation row is expected; display the shape to check.
preds.shape

(52121,)

In [77]:
# Micro-averaged F1 of the final model on the validation split.
calc_f1(preds, dmatrix_valid)

('f1_micro', 0.7393564973810939)