In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from pprint import pprint
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier # rf분류기

In [2]:
def F1(y_pred, dtrain):
    labels = dtrain.get_label()
    
    pre = precision_score(y_true = labels, y_pred = y_pred, average=None)
    rec = recall_score(y_true = labels, y_pred = y_pred, average=None)
    f1_score = 8/(sum(1/pre) + sum(1/rec))

    return 'f1', f1_score

In [3]:
def f1(X_val, y_val, model,mapping):
    """
    Model evaluation function for multiclass classification problem
    1) F-1 score, Precision, Recall
    2) ROC curve, PR curve는 추후에 생각
    """
 
    #### predict the value
    y_pred = model.predict(X_val)

    #print('-'*50)
    #print('2. F1-score')
    
    # inverse pre/ rec
    pre = precision_score(y_true = y_val, y_pred = y_pred, average=None)
    rec = recall_score(y_true = y_val, y_pred = y_pred, average=None)

    # f1 measure
    f1_score = 8/(sum(1/pre) + sum(1/rec))
    
    # view - precision recall
    table = pd.DataFrame([])

    for i,k in enumerate(mapping.keys()):
        table[k] = [pre[i],rec[i]]
    table.index = ['precision','recall']
    # print(table)
    
    # view - f1
    #print('F1_score %.3f'%f1_score)
    #print('='*50)
    return f1_score

In [18]:
X_train = pd.read_csv('X_train_487.csv')
X_test = pd.read_csv('X_test_487.csv')

In [5]:
#### load class
train_label = pd.read_csv('temp_data/train_label_lite.csv')
hasher = pd.read_csv('temp_data/test_id.csv')
label_map = {'retained':0,'2month':1,'month':2,'week':3}
inv_map = {label_map[k]:k for k in label_map.keys()}
y_train = pd.Series([label_map[l] for l in train_label.label])

---

In [6]:
X_train.shape

(100000, 487)

In [7]:
#### RF 모델
model = RandomForestClassifier(criterion='entropy',n_estimators=300,random_state= 7, n_jobs=-1)
X_train_rf = X_train.fillna(0)

In [8]:
#### cross validation
kfold = StratifiedKFold(n_splits = 10 ,random_state = 7).split(X_train_rf, y_train)
scores = []
for k, (train, test) in enumerate(kfold):
    model.fit(X_train_rf.iloc[train,:], y_train[train])
    score = f1(X_train_rf.iloc[test,:], y_train[test], model,label_map)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train[train]), score))
    
print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold: 1, Class dist.: [22500 22500 22500 22500], F1: 0.715
Fold: 2, Class dist.: [22500 22500 22500 22500], F1: 0.717
Fold: 3, Class dist.: [22500 22500 22500 22500], F1: 0.724
Fold: 4, Class dist.: [22500 22500 22500 22500], F1: 0.726
Fold: 5, Class dist.: [22500 22500 22500 22500], F1: 0.716
Fold: 6, Class dist.: [22500 22500 22500 22500], F1: 0.717
Fold: 7, Class dist.: [22500 22500 22500 22500], F1: 0.716
Fold: 8, Class dist.: [22500 22500 22500 22500], F1: 0.713
Fold: 9, Class dist.: [22500 22500 22500 22500], F1: 0.705
Fold: 10, Class dist.: [22500 22500 22500 22500], F1: 0.710

CV F1: 0.716 +/- 0.006


In [9]:
#### feature selection by RF
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],axis=0)
indices = np.argsort(importances) # ascending

#### feature ranking
feature_ranking = [(indices[f],importances[indices[f]]) for f in range(X_train.shape[1])]

In [15]:
#### state of art feature.... 300개 정도... importance ratio 조정!!!
NUM_OF_FEATURES = len([(i,f) for i, f in feature_ranking if f > 0.0005])

In [16]:
NUM_OF_FEATURES

352

In [19]:
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': X_train.columns}).sort_values(by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values

In [20]:
#### FEATURE SELECTION
X_train = X_train[col]
# X_test = X_test[col]
X_train.shape

(100000, 352)

---

In [23]:
#### xgb
grid_result = []
param = {}
#### XGB parameters
## General Parameters
param['n_gpus'] = -1
param['tree_method'] = 'gpu_hist'
param['silent'] = 0

## Booster Parameters
param['n_estimators'] = 3000 #요기...
param['learning_rate'] = 0.01
param['min_child_weight'] = 2
param['max_depth'] = 10
param['gamma'] = 0
param['reg_alpha'] = 0.1
param['reg_lambda'] = 0.03
param['subsample'] = 0.9
param['colsample_bytree'] = 0.75
param['scale_pos_weight'] = 1

## Learning task parameters
param['num_class'] = 4
param['objective'] = 'multi:softmax'
param['seed'] = 7

model = xgb.XGBClassifier(**param)

In [9]:
 #### step 1 : tuning n_estimators with cross validation
print("===============================================")
print("Find the n_estimators")
xgtrain = xgb.DMatrix(X_train.values, label= y_train.values.reshape(-1,1))
cvresult = xgb.cv(param, xgtrain, num_boost_round = param['n_estimators'], nfold = 5, metrics = "mlogloss", early_stopping_rounds = 100)
print("Optimal n_estimators : %d"%cvresult.shape[0])

Find the n_estimators
Optimal n_estimators : 160


In [24]:
#### cross validation(for NA)
kfold = StratifiedKFold(n_splits = 5 ,random_state = 7,shuffle=True).split(X_train, y_train)
scores = []
predict_set = []
for k, (train, test) in enumerate(kfold):
    model.fit(X_train.iloc[train,:], y_train[train], eval_metric = F1)
    score = model.score(X_train.iloc[test,:], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train[train]), score))
    ### predict
    y_pred = model.predict(X_train.iloc[test,:])
    predict_set += [(x,inv_map[y_pred[i]]) for i,x in enumerate(test)]
print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold: 1, Class dist.: [20000 20000 20000 20000], F1: 0.743
Fold: 2, Class dist.: [20000 20000 20000 20000], F1: 0.741
Fold: 3, Class dist.: [20000 20000 20000 20000], F1: 0.747
Fold: 4, Class dist.: [20000 20000 20000 20000], F1: 0.747
Fold: 5, Class dist.: [20000 20000 20000 20000], F1: 0.737

CV F1: 0.743 +/- 0.004


In [24]:
#### 기존 state of art 학습!!!
model.fit(X_train,y_train,eval_metric = f1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.75, gamma=0, learning_rate=0.01,
       max_delta_step=0, max_depth=10, min_child_weight=2, missing=None,
       n_estimators=3000, n_gpus=-1, n_jobs=1, nthread=None, num_class=4,
       objective='multi:softprob', random_state=0, reg_alpha=0.1,
       reg_lambda=0.03, scale_pos_weight=1, seed=7, silent=0,
       subsample=0.9, tree_method='gpu_hist')

In [25]:
my_pred = model.predict(X_test)

In [26]:
#### result
results = [inv_map[x] for x in my_pred]
result = pd.DataFrame({'acc_id':['te' + str(x) for x in X_test.index], 'label': results})
result['acc_id'] = [hasher.iloc[i,0] for i in range(len(result.acc_id))]

In [27]:
result.to_csv('new_xgb_2018_09_04_4.csv',index = False)