In [35]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from pprint import pprint
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier # rf분류기

In [36]:
def f1(X_val, y_val, model, mapping):
    """
    Score a fitted multiclass classifier on a validation set.

    The score is the harmonic mean of every per-class precision and every
    per-class recall value taken together, i.e.

        2 * n_classes / (sum(1 / precision_c) + sum(1 / recall_c))

    For a 4-class problem this is exactly the previous hard-coded
    ``8 / (sum(1/pre) + sum(1/rec))``; the numerator is now derived from the
    number of classes actually scored, so the result stays in [0, 1] for any
    class count.

    ROC / PR curves are not computed here (left for later).

    Parameters
    ----------
    X_val : features handed to ``model.predict``.
    y_val : true class labels for ``X_val``.
    model : any object exposing ``predict(X)``.
    mapping : dict of label name -> class code. Kept only so existing callers
        keep working; it is no longer used (the old per-class table built
        from it was never displayed and could raise IndexError when a class
        was absent from the fold).

    Returns
    -------
    float
        The harmonic-mean score, or 0.0 when any class has zero precision or
        zero recall (the limit of the harmonic mean), or when there is
        nothing to score.
    """
    y_pred = model.predict(X_val)

    # average=None -> one precision / recall value per class.
    pre = np.asarray(precision_score(y_true=y_val, y_pred=y_pred, average=None), dtype=float)
    rec = np.asarray(recall_score(y_true=y_val, y_pred=y_pred, average=None), dtype=float)

    per_class = np.concatenate([pre, rec])

    # A single zero drives the harmonic mean to zero; return it explicitly
    # instead of relying on a silenced divide-by-zero producing inf.
    if per_class.size == 0 or np.any(per_class <= 0):
        return 0.0

    return float(per_class.size / np.sum(1.0 / per_class))

In [37]:
# Read the training labels and encode each label string as an integer class code.
train_label = pd.read_csv('../lite_data/train_label_lite.csv')
label_map = {
    'retained': 0,
    '2month': 1,
    'month': 2,
    'week': 3,
}
# Reverse lookup: class code -> label string.
inv_map = {code: name for name, code in label_map.items()}
# A label missing from label_map raises KeyError here.
y_train = pd.Series([label_map[name] for name in train_label.label])

In [38]:
# Load the base training feature matrix from a CSV in the working directory.
X_train_base = pd.read_csv('X_train_0902.csv')

In [68]:
# Sanity check: (rows, columns) of the base feature matrix.
X_train_base.shape

(100000, 605)

In [69]:
# Load the play-time feature set and drop its first column via .iloc[:, 1:]
# (presumably an index column written out with the CSV — TODO confirm).
X_train_new_time =pd.read_csv('X_train_real_play_time.csv').iloc[:,1:]

In [70]:
# Number of feature columns in the play-time feature set.
len(X_train_new_time.columns)

55

In [52]:
# Name fragments identifying the time-series summary features; any base column
# whose name contains one of these is filtered out when X_train is assembled.
cols = [
    'median_time_series',
    'mean_time_series',
    'var_time_series',
    'skew_time_series',
    'kurt_time_series',
    'MA_1_time_series',
    'MA_2_time_series',
    'MA_3_time_series',
    'MA_4_time_series',
    'MA_5_time_series',
    'cycle_time_series',
]

In [71]:
# Build the training matrix: keep every base column whose name contains none
# of the fragments in `cols`, then append the play-time features column-wise.
X_train = pd.concat(
    (
        X_train_base.loc[
            :,
            [
                name
                for name in X_train_base.columns.tolist()
                if not any(fragment in name for fragment in cols)
            ],
        ],
        X_train_new_time,
    ),
    axis = 1,
)

In [72]:
# Sanity check: (rows, columns) after swapping in the play-time features.
X_train.shape

(100000, 605)

In [73]:
# Copy of the training matrix with every missing value replaced by 0; this is
# the frame fed to the random forest below.
X_train_rf = X_train.fillna(0)

---

### Random forest로 변수 성능 판단...
* 현재 state-of-the-art(495)   CV F1: 0.720 +/- 0.005
* interaction 추가(525)    CV F1: 0.720 +/- 0.005 성능이 개선되지 않음
* CAT 대신 count 추가(579)    CV F1: 0.722 +/- 0.007

CV F1: 0.722 +/- 0.004

In [74]:
# Sanity check: (rows, columns) of the NaN-filled matrix used for the forest.
X_train_rf.shape

(100000, 605)

In [75]:
# Random forest used to judge feature sets; seeded for repeatable runs and
# parallelised over all available cores (n_jobs=-1).
model = RandomForestClassifier(
    criterion='gini',
    max_depth=19,
    max_features=290,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [76]:
#### cross validation
# 5-fold stratified CV of `model` on X_train_rf / y_train, scored with f1().
# StratifiedKFold.split yields *positional* row indices, so the labels are
# sliced with .iloc just like the features; plain y_train[train] would look the
# integers up as index labels and only works while y_train keeps a default
# 0..n-1 index.
kfold = StratifiedKFold(n_splits = 5 ,random_state = 7,shuffle = True).split(X_train_rf, y_train)
scores = []
for k, (train, test) in enumerate(kfold):
    # fit() retrains the same estimator from scratch on this fold's training rows.
    model.fit(X_train_rf.iloc[train,:], y_train.iloc[train])
    score = f1(X_train_rf.iloc[test,:], y_train.iloc[test], model,label_map)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train.iloc[train]), score))

# Mean and standard deviation of the per-fold scores.
print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold: 1, Class dist.: [20000 20000 20000 20000], F1: 0.721
Fold: 2, Class dist.: [20000 20000 20000 20000], F1: 0.715
Fold: 3, Class dist.: [20000 20000 20000 20000], F1: 0.725
Fold: 4, Class dist.: [20000 20000 20000 20000], F1: 0.724
Fold: 5, Class dist.: [20000 20000 20000 20000], F1: 0.716

CV F1: 0.720 +/- 0.004


__변수의 중요도 파악__

In [33]:
# Show the names of the last 30 columns of the base feature matrix.
X_train_base.columns[-30:]

Index(['('cnt_dt', 'play_time')', '('play_time', 'game_combat_time')',
       '('play_time', 'quest_hongmun')', '('play_time', 'cnt_use_buffitem')',
       '('cnt_dt', 'game_combat_time')', '('play_time', 'quest_exp')',
       '('play_time', 'npc_exp')', '('game_combat_time', 'cnt_use_buffitem')',
       '('npc_exp', 'game_combat_time')', '('quest_exp', 'game_combat_time')',
       '('cnt_dt', 'get_money')', '('cnt_dt', 'item_hongmun')',
       '('game_combat_time', 'get_money')', '('play_time', 'get_money')',
       '('cnt_dt', 'cnt_use_buffitem')', '('cnt_dt', 'npc_exp')',
       '('get_money', 'cnt_use_buffitem')', '('npc_exp', 'cnt_use_buffitem')',
       '('cnt_dt', 'quest_hongmun')', '('npc_exp', 'quest_exp')',
       '('npc_exp', 'get_money')', '('cnt_dt', 'quest_exp')',
       '('whisper_chat', 'party_chat')', '('quest_exp', 'cnt_use_buffitem')',
       '('npc_hongmun', 'item_hongmun')',
       '('quest_hongmun', 'game_combat_time')', '('cnt_dt', 'whisper_chat')',
       '('que

In [9]:
# Per-feature importance scores from the fitted forest.
# NOTE(review): the only fit of `model` visible in this notebook is inside the
# CV loop, so these importances would come from whichever fold was fitted
# last rather than from a fit on all rows — confirm that is intended.
fi= model.feature_importances_

In [10]:
# Feature names in column order, used to label the importances.
# NOTE(review): this rebinds `cols`, which earlier held the time-series name
# fragments used to filter X_train_base when building X_train; re-running that
# earlier cell after this one would filter with this list instead.
cols = X_train_rf.columns.tolist()

In [13]:
# Pair each importance with the feature name at the same position and rank
# the features from most to least important.
results = (
    pd.DataFrame(
        [(cols[pos], weight) for pos, weight in enumerate(fi)],
        columns=['features', 'importance'],
    )
    .sort_values(by='importance', ascending = False)
)

In [15]:
# Renumber the ranked rows from 0; because drop=True is not passed, the
# previous index is kept as an extra column.
# NOTE(review): this reassigns `results` from itself, so running the cell more
# than once keeps adding index columns — re-run the cell that builds `results` first.
results = results.reset_index()