In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from pprint import pprint
import matplotlib.pyplot as plt
from hyperopt.pyll.stochastic import sample
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.ensemble import RandomForestClassifier # rf분류기

In [2]:
def F1(y_pred, dtrain):
    """Custom evaluation metric: harmonic mean of all per-class precisions and recalls.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels; forwarded unchanged to ``precision_score`` and
        ``recall_score`` as ``y_pred``.
    dtrain : object exposing ``get_label()``
        Evaluation data whose labels are used as ``y_true``.

    Returns
    -------
    tuple
        ``('f1', score)`` where ``score`` is
        ``(len(pre) + len(rec)) / (sum(1 / pre) + sum(1 / rec))``.
        If any per-class precision or recall is 0, the corresponding
        reciprocal is ``inf`` and the score evaluates to 0.
    """
    labels = dtrain.get_label()

    # average=None yields one value per class instead of a single aggregate.
    pre = precision_score(y_true=labels, y_pred=y_pred, average=None)
    rec = recall_score(y_true=labels, y_pred=y_pred, average=None)

    # Number of terms in the harmonic mean. With four classes this is the
    # previously hard-coded 8; deriving it from the arrays keeps the score
    # correctly scaled when fewer (or more) classes are reported.
    n_terms = len(pre) + len(rec)

    # A zero precision/recall intentionally maps to 1/0 = inf (score -> 0);
    # silence only the division warning around that computation.
    with np.errstate(divide='ignore'):
        f1_score = n_terms / (np.sum(1 / pre) + np.sum(1 / rec))

    return 'f1', f1_score

In [3]:
#### Load the base training feature matrix from CSV
X_train = pd.read_csv('temp_data/X_train_base.csv')
# Disabled: second feature file (kept for reference, not loaded)
# X_train2 = pd.read_csv('temp_data/train_activity_cnt_mean_encoded.csv').drop('new_id',axis=1)

In [4]:
#### Restrict the training matrix to the columns listed in sub_features.csv
# Read the `features` column of the feature list ...
cols = pd.read_csv('sub_features.csv').features
# ... and keep only those columns (all rows) of the training matrix.
X_train = X_train.loc[:, cols]

In [5]:
#### Load the class labels and encode them as integers
train_label = pd.read_csv('temp_data/train_label_lite.csv')
# hasher = pd.read_csv('test_id.csv')

# Label name -> integer class id.
label_map = {
    'retained': 0,
    '2month': 1,
    'month': 2,
    'week': 3,
}

# Encode every row's label; an unknown label name raises KeyError.
y_train = pd.Series([label_map[name] for name in train_label['label']])

# Integer class id -> label name (reverse lookup).
inv_map = {code: name for name, code in label_map.items()}

In [7]:
# Display the (rows, columns) shape of the training matrix after column selection.
X_train.shape

(100000, 459)

---

In [8]:
#### Random forest model
# The forest is fitted on a copy of the features with missing values set to 0.
X_train_rf = X_train.fillna(value=0)

model = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    n_jobs=-1,
    random_state=7,
)
model.fit(X_train_rf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [9]:
#### Feature selection driven by the random forest importances
importances = model.feature_importances_

# Spread of each feature's importance across the individual trees.
per_tree_importances = [estimator.feature_importances_ for estimator in model.estimators_]
std = np.std(per_tree_importances, axis=0)

# Feature positions ordered from least to most important (ascending).
indices = np.argsort(importances)

#### Feature ranking: (feature position, importance) pairs in ascending order
feature_ranking = [
    (indices[rank], importances[indices[rank]])
    for rank in range(X_train.shape[1])
]

In [10]:
#### State-of-the-art features: aim for roughly 300; adjust the importance cut-off to tune
# Count the ranked features whose importance is strictly above 0.0005.
NUM_OF_FEATURES = sum(1 for _, importance in feature_ranking if importance > 0.0005)

In [11]:
# Display how many features passed the importance cut-off.
NUM_OF_FEATURES 

345

In [12]:
# Names of the NUM_OF_FEATURES most important features, most important first.
col = (
    pd.DataFrame({
        'importance': model.feature_importances_,
        'feature': X_train.columns,
    })
    .sort_values(by='importance', ascending=False)
    .head(NUM_OF_FEATURES)
    ['feature']
    .values
)

In [14]:
#### Feature selection: keep only the columns named in `col`
X_train = X_train.loc[:, col]
# Display the resulting (rows, columns) shape.
X_train.shape

(100000, 345)

In [15]:
#### XGBoost configuration
grid_result = []

#### XGB parameters, grouped by role (insertion order kept as before)
param = {
    ## General parameters
    'n_gpus': -1,
    'tree_method': 'gpu_hist',
    'silent': 0,

    ## Booster parameters
    'n_estimators': 1000,  # this is the one to tune
    'learning_rate': 0.1,
    'min_child_weight': 2,
    'max_depth': 10,
    'gamma': 0,
    'reg_alpha': 0.01,
    'reg_lambda': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.75,
    'scale_pos_weight': 1,

    ## Learning task parameters
    'num_class': 4,
    'objective': 'multi:softmax',
    'seed': 7,
}

# Classifier built from the same parameter dictionary.
model = xgb.XGBClassifier(**param)

In [16]:
#### Step 1: tune n_estimators with cross-validation
print("===============================================")
print("Find the n_estimators")

# Wrap the selected features and the labels (as a column vector) for XGBoost.
xgtrain = xgb.DMatrix(
    data=X_train.values,
    label=y_train.values.reshape(-1, 1),
)

# 5-fold stratified, shuffled cross-validation scored with the custom F1 metric;
# n_estimators is the cap on boosting rounds and early stopping uses a
# 50-round patience.
cvresult = xgb.cv(
    params=param,
    dtrain=xgtrain,
    num_boost_round=param['n_estimators'],
    nfold=5,
    stratified=True,
    shuffle=True,
    feval=F1,
    early_stopping_rounds=50,
)

# The number of rows left in the CV result is reported as the optimal round count.
print("Optimal n_estimators : %d" % len(cvresult))

Find the n_estimators
Optimal n_estimators : 261


In [17]:
### Original baseline: best mean test F1 across the cross-validation rounds
cvresult['test-f1-mean'].max()

0.72198339999999994

In [10]:
### Only the important features picked with xgb: best mean test F1 across the CV rounds
# NOTE(review): this cell's execution counter (In [10]) is out of order relative to
# the preceding cell (In [17]), so its output presumably comes from a different
# run of the notebook - confirm before comparing the two scores.
cvresult['test-f1-mean'].max()

0.72261339999999996