In [1]:
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [2]:
def f1(X_val, y_val, model, mapping):
    """
    Evaluation metric for the multiclass problem.

    The score is the harmonic mean of all per-class precisions and all
    per-class recalls taken together: ``2 * n_classes / (sum(1/precision) +
    sum(1/recall))``.  With the four classes used in this notebook this is the
    original ``8 / (...)`` formula.  (ROC / PR curves: to be considered later.)

    Parameters
    ----------
    X_val : features accepted by ``model.predict``.
    y_val : true encoded labels for ``X_val``.
    model : fitted estimator exposing ``predict``.
    mapping : dict of class name -> encoded label (e.g. ``dic``).  Its values
        fix which classes are scored, so a class that is absent from a fold is
        still counted instead of silently shortening the per-class arrays.

    Returns
    -------
    float
        The score in [0, 1]; a class with zero precision or zero recall drives
        the harmonic mean to 0.
    """
    #### predict the value
    y_pred = model.predict(X_val)

    # Pin the scored classes (and their order) to the mapping.
    labels = list(mapping.values())
    pre = precision_score(y_true=y_val, y_pred=y_pred, labels=labels, average=None)
    rec = recall_score(y_true=y_val, y_pred=y_pred, labels=labels, average=None)

    # 1/0 -> inf is intended here (it yields a score of 0), so silence the
    # numpy divide warning for this expression only.
    with np.errstate(divide='ignore'):
        inverse_sum = np.sum(1 / pre) + np.sum(1 / rec)

    return 2 * len(labels) / inverse_sum

In [3]:
# Raw inputs: training labels and the test activity log.
train_label = pd.read_csv('../lite_data/train_label_lite.csv')
test_activity = pd.read_csv('../lite_data//test_activity_lite.csv')

In [4]:
# One row per test user, in order of first appearance in the activity log;
# only the first column of the grouped result is kept.
X_test_index = (
    test_activity
    .groupby(by='new_id', sort=False, as_index=False)
    .sum()
    .iloc[:, :1]
)


In [5]:
# Feature tables, with the id column glued on as the first column.
# The concat is purely positional (axis=1), so it assumes the feature rows are
# in the same order as train_label / X_test_index — TODO confirm.
X_train = pd.concat(
    [train_label.iloc[:, :1], pd.read_csv('../wook_workspace/X_train_extreme.csv')],
    axis=1,
)
X_test = pd.concat(
    [X_test_index.iloc[:, :1], pd.read_csv('../wook_workspace/X_test_extreme.csv')],
    axis=1,
)

# Activity tables used for the segmentation below.
# NOTE(review): the train file name is spelled 'reaal' — confirm it matches the file on disk.
train_act = pd.read_csv('../transformed/train_reaal_values_int.csv')
test_act = pd.read_csv('../transformed/test_real_datas_int.csv')



### Segmentation

### 유저의 숙련도와 게임을 얼마나 몰입하는 유저인지 분류

>S1 : 홍문 X heavy

>S2 : 홍문 X light

>S3 : 기본 X heavy

>S4 : 기본 X light

>S5 : 8주에 잠깐 들어온 유저

In [6]:
# Classify the remaining "outlier" users.
def _outlier_ids(activity):
    """Return the set of new_id values whose summed activity equals 8.0.

    The activity table is summed per new_id once; a user is an outlier when
    the row-wise total over every summed column from the third onward is
    exactly 8.0.
    NOTE(review): presumably this picks out the S5 users described above —
    the meaning of the constant 8.0 is not visible here; TODO confirm.
    """
    totals = activity.groupby('new_id').sum()  # computed once, reused for mask and index
    return set(totals.loc[totals.iloc[:, 2:].sum(axis=1) == 8.0].index.tolist())


train_outlier = _outlier_ids(train_act)
test_outlier = _outlier_ids(test_act)

In [7]:
# Split users into "hongmun" and "basic" groups on npc_hongmun_by_play_time.
# NOTE(review): a value of exactly 0 is what marks a user as "hongmun" here —
# confirm this is intended and not inverted.

#### train
train_hongmun = set(X_train.loc[X_train['npc_hongmun_by_play_time'] == 0, 'new_id'].values)
train_basic = set(X_train.loc[X_train['npc_hongmun_by_play_time'] != 0, 'new_id'].values)

#### test
test_hongmun = set(X_test.loc[X_test['npc_hongmun_by_play_time'] == 0, 'new_id'].values)
test_basic = set(X_test.loc[X_test['npc_hongmun_by_play_time'] != 0, 'new_id'].values)

In [8]:
# Heavy / light user split.
def _three_week_streak_flag(activity):
    """Flag users with three consecutive active weeks.

    Pivots `activity` to one row per new_id and one column per `wk` holding
    `cnt_dt`, then returns an int Series (indexed by new_id) that is 1 when
    some run of three adjacent week columns all have cnt_dt > 0, else 0.
    Equivalent to the rolling(window=3, axis=1) sum reaching 3.0, but
    vectorized and independent of the deprecated `axis` argument of rolling.
    """
    weekly = activity.pivot(index='new_id', columns='wk', values='cnt_dt')
    played = (weekly > 0).values  # missing weeks (NaN) compare False
    # Window i covers columns i, i+1, i+2.
    streak = played[:, :-2] & played[:, 1:-1] & played[:, 2:]
    return pd.Series(streak.any(axis=1).astype(int), index=weekly.index)


##### train
train_play = _three_week_streak_flag(train_act)

train_heavy = set(train_play.loc[train_play == 1].index.tolist())
train_light = set(train_play.loc[train_play == 0].index.tolist())

##### test
test_play = _three_week_streak_flag(test_act)

test_heavy = set(test_play.loc[test_play == 1].index.tolist())
test_light = set(test_play.loc[test_play == 0].index.tolist())

In [9]:
# S1: heavy users that are also hongmun users.
# NOTE(review): unlike the light segments, outlier ids are not removed here —
# confirm heavy and outlier sets cannot overlap.
train_s1 = train_heavy & train_hongmun
test_s1 = test_heavy & test_hongmun

In [10]:
# Number of training users in segment S1 (heavy and hongmun).
len(train_s1)

21254

In [11]:
# S2: light users, minus outliers, that are hongmun users.
train_s2 = (train_light - train_outlier) & train_hongmun
test_s2 = (test_light - test_outlier) & test_hongmun

In [12]:
# S3: heavy users that are basic users.
train_s3 = train_heavy & train_basic
test_s3 = test_heavy & test_basic

In [13]:
# Number of training users in segment S3 (heavy and basic).
len(train_s3)

36176

In [14]:
# S4: light users, minus outliers, that are basic users.
train_s4 = (train_light - train_outlier) & train_basic
test_s4 = (test_light - test_outlier) & test_basic

## Data_partition

In [15]:
# One feature table per segment: inner-join X_train with the segment's ids on
# new_id, then fill missing feature values with 0.
X_train_s1 = X_train.merge(pd.DataFrame(list(train_s1), columns=['new_id']), on='new_id').fillna(0)

X_train_s2 = X_train.merge(pd.DataFrame(list(train_s2), columns=['new_id']), on='new_id').fillna(0)

X_train_s3 = X_train.merge(pd.DataFrame(list(train_s3), columns=['new_id']), on='new_id').fillna(0)

X_train_s4 = X_train.merge(pd.DataFrame(list(train_s4), columns=['new_id']), on='new_id').fillna(0)

X_train_outlier = X_train.merge(pd.DataFrame(list(train_outlier), columns=['new_id']), on='new_id').fillna(0)

In [16]:
# Encode the string labels as integer class ids.
y_train = train_label.copy()
dic = {'retained': 0, '2month': 1, 'month': 2, 'week': 3}
# Replace the second column with a brand-new integer column. Assigning through
# `.iloc[:, 1] = ...` writes in place on pandas 2.x and leaves an object-dtype
# column, which np.bincount and scikit-learn reject as a target.
# An unknown label still raises KeyError, as before.
y_train[y_train.columns[1]] = [dic[i] for i in y_train.label]

In [17]:
# Labels per segment: inner-join y_train with the ids present in each segment's
# feature table (its first column).
# NOTE(review): the CV cells pair X and y rows by position, which assumes both
# merges yield the same row order — TODO confirm new_id is unique.
y_train_s1 = y_train.merge(X_train_s1.iloc[:, :1], on='new_id')

y_train_s2 = y_train.merge(X_train_s2.iloc[:, :1], on='new_id')

y_train_s3 = y_train.merge(X_train_s3.iloc[:, :1], on='new_id')

y_train_s4 = y_train.merge(X_train_s4.iloc[:, :1], on='new_id')

y_train_outlier = y_train.merge(X_train_outlier.iloc[:, :1], on='new_id')

In [18]:
# NOTE: `f1` is also defined in cell In [2]; this later definition is the one
# in effect for the cross-validation cells below.
def f1(X_val, y_val, model, mapping):
    """
    Evaluation metric for the multiclass problem.

    The score is the harmonic mean of all per-class precisions and all
    per-class recalls taken together: ``2 * n_classes / (sum(1/precision) +
    sum(1/recall))``.  With the four classes used in this notebook this is the
    original ``8 / (...)`` formula.  (ROC / PR curves: to be considered later.)

    Parameters
    ----------
    X_val : features accepted by ``model.predict``.
    y_val : true encoded labels for ``X_val``.
    model : fitted estimator exposing ``predict``.
    mapping : dict of class name -> encoded label (e.g. ``dic``).  Its values
        fix which classes are scored, so a class that is absent from a fold is
        still counted instead of silently shortening the per-class arrays.

    Returns
    -------
    float
        The score in [0, 1]; a class with zero precision or zero recall drives
        the harmonic mean to 0.
    """
    #### predict the value
    y_pred = model.predict(X_val)

    # Pin the scored classes (and their order) to the mapping.
    labels = list(mapping.values())
    pre = precision_score(y_true=y_val, y_pred=y_pred, labels=labels, average=None)
    rec = recall_score(y_true=y_val, y_pred=y_pred, labels=labels, average=None)

    # 1/0 -> inf is intended here (it yields a score of 0), so silence the
    # numpy divide warning for this expression only.
    with np.errstate(divide='ignore'):
        inverse_sum = np.sum(1 / pre) + np.sum(1 / rec)

    return 2 * len(labels) / inverse_sum

# S1집단 모델링

In [19]:
# S1 modelling inputs: every column after the first (the id) as features,
# and the second column of the label frame as the target.
X_train_cv_s1 = X_train_s1.iloc[:, 1:].copy()
y_train_cv_s1 = y_train_s1.iloc[:, 1].copy()

In [20]:
# Extra-trees classifier for segment S1 (unfitted at this point).
clf_etc_s1 = ExtraTreesClassifier(
    verbose=1,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [21]:
#### cross validation (F1)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s1, y_train_cv_s1)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s1.fit(X_train_cv_s1.iloc[train, :], y_train_cv_s1.iloc[train])
    score = f1(X_train_cv_s1.iloc[test, :], y_train_cv_s1.iloc[test], clf_etc_s1, dic)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1, np.bincount(y_train_cv_s1.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [5168 7634 3829  370], F1: 0.600


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [5169 7634 3829  370], F1: 0.625


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [5169 7634 3830  370], F1: 0.606


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [5169 7635 3830  371], F1: 0.635


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [5169 7635 3830  371], F1: 0.591

CV F1: 0.611 +/- 0.016


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [22]:
#### cross validation (accuracy)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s1, y_train_cv_s1)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s1.fit(X_train_cv_s1.iloc[train, :], y_train_cv_s1.iloc[train])
    score = accuracy_score(y_true=y_train_cv_s1.iloc[test],
                           y_pred=clf_etc_s1.predict(X_train_cv_s1.iloc[test, :]))

    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train_cv_s1.iloc[train]), score))

print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [5168 7634 3829  370], Acc: 0.691


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [5169 7634 3829  370], Acc: 0.703


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [5169 7634 3830  370], Acc: 0.691


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.5s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [5169 7635 3830  371], Acc: 0.699


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [5169 7635 3830  371], Acc: 0.689

CV Acc: 0.695 +/- 0.006


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [39]:
# Save the model.
# The cross-validation cells leave the estimator fitted on the last fold's
# training split only, so refit on the whole S1 segment before persisting it.
clf_etc_s1.fit(X_train_cv_s1, y_train_cv_s1)

joblib.dump(clf_etc_s1, 'extra_tree_for_s1.pkl')

['extra_tree_for_s1.pkl']

# S2 집단 모델링

In [23]:
# Extra-trees classifier for segment S2 (unfitted at this point).
clf_etc_s2 = ExtraTreesClassifier(
    verbose=1,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [24]:
# S2 modelling inputs: every column after the first (the id) as features,
# and the second column of the label frame as the target.
X_train_cv_s2 = X_train_s2.iloc[:, 1:].copy()
y_train_cv_s2 = y_train_s2.iloc[:, 1].copy()

In [25]:
#### cross validation (F1)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s2, y_train_cv_s2)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s2.fit(X_train_cv_s2.iloc[train, :], y_train_cv_s2.iloc[train])
    score = f1(X_train_cv_s2.iloc[test, :], y_train_cv_s2.iloc[test], clf_etc_s2, dic)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1, np.bincount(y_train_cv_s2.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [1229 6431 5046 2784], F1: 0.391


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [1229 6431 5046 2784], F1: 0.411


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [1230 6431 5046 2784], F1: 0.423


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.4s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [1230 6431 5047 2784], F1: 0.407


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.0s finished


Fold: 5, Class dist.: [1230 6432 5047 2784], F1: 0.376

CV F1: 0.402 +/- 0.016


[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [26]:
#### cross validation (accuracy)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s2, y_train_cv_s2)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s2.fit(X_train_cv_s2.iloc[train, :], y_train_cv_s2.iloc[train])
    score = accuracy_score(y_true=y_train_cv_s2.iloc[test],
                           y_pred=clf_etc_s2.predict(X_train_cv_s2.iloc[test, :]))

    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train_cv_s2.iloc[train]), score))

print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [1229 6431 5046 2784], Acc: 0.591


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [1229 6431 5046 2784], Acc: 0.597


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [1230 6431 5046 2784], Acc: 0.607


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.5s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [1230 6431 5047 2784], Acc: 0.600


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 5, Class dist.: [1230 6432 5047 2784], Acc: 0.591

CV Acc: 0.597 +/- 0.006


In [40]:
# Save the model.
# The cross-validation cells leave the estimator fitted on the last fold's
# training split only, so refit on the whole S2 segment before persisting it.
clf_etc_s2.fit(X_train_cv_s2, y_train_cv_s2)

joblib.dump(clf_etc_s2, 'extra_tree_for_s2.pkl')

['extra_tree_for_s2.pkl']

# S3 집단 모델링

In [27]:
# Extra-trees classifier for segment S3 (unfitted at this point).
clf_etc_s3 = ExtraTreesClassifier(
    verbose=1,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [28]:
# S3 modelling inputs: every column after the first (the id) as features,
# and the second column of the label frame as the target.
X_train_cv_s3 = X_train_s3.iloc[:, 1:].copy()
y_train_cv_s3 = y_train_s3.iloc[:, 1].copy()

In [29]:
#### cross validation (F1)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s3, y_train_cv_s3)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s3.fit(X_train_cv_s3.iloc[train, :], y_train_cv_s3.iloc[train])
    score = f1(X_train_cv_s3.iloc[test, :], y_train_cv_s3.iloc[test], clf_etc_s3, dic)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1, np.bincount(y_train_cv_s3.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [12742  3027  5868  7302], F1: 0.767


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   20.4s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [12742  3027  5869  7302], F1: 0.764


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.8s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [12742  3027  5869  7302], F1: 0.746


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.9s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [12743  3027  5869  7303], F1: 0.750


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   20.4s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [12743  3028  5869  7303], F1: 0.743

CV F1: 0.754 +/- 0.010


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [30]:
#### cross validation (accuracy)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s3, y_train_cv_s3)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s3.fit(X_train_cv_s3.iloc[train, :], y_train_cv_s3.iloc[train])
    score = accuracy_score(y_true=y_train_cv_s3.iloc[test],
                           y_pred=clf_etc_s3.predict(X_train_cv_s3.iloc[test, :]))

    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train_cv_s3.iloc[train]), score))

print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   20.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [12742  3027  5868  7302], Acc: 0.845


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.6s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [12742  3027  5869  7302], Acc: 0.847


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   20.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [12742  3027  5869  7302], Acc: 0.843


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [12743  3027  5869  7303], Acc: 0.839


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.6s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [12743  3028  5869  7303], Acc: 0.841

CV Acc: 0.843 +/- 0.003


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [41]:
# Save the model.
# The cross-validation cells leave the estimator fitted on the last fold's
# training split only, so refit on the whole S3 segment before persisting it.
clf_etc_s3.fit(X_train_cv_s3, y_train_cv_s3)

joblib.dump(clf_etc_s3, 'extra_tree_for_s3.pkl')

['extra_tree_for_s3.pkl']

# S4 집단 모델링

In [31]:
# Extra-trees classifier for segment S4 (unfitted at this point).
clf_etc_s4 = ExtraTreesClassifier(
    verbose=1,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [32]:
# S4 modelling inputs: every column after the first (the id) as features,
# and the second column of the label frame as the target.
X_train_cv_s4 = X_train_s4.iloc[:, 1:].copy()
y_train_cv_s4 = y_train_s4.iloc[:, 1].copy()

# F1

In [33]:
#### cross validation (F1)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s4, y_train_cv_s4)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s4.fit(X_train_cv_s4.iloc[train, :], y_train_cv_s4.iloc[train])
    score = f1(X_train_cv_s4.iloc[test, :], y_train_cv_s4.iloc[test], clf_etc_s4, dic)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1, np.bincount(y_train_cv_s4.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [ 612  477 2664 8748], F1: 0.652


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [ 612  477 2665 8748], F1: 0.674


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.5s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [ 612  478 2665 8748], F1: 0.671


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [ 612  478 2665 8748], F1: 0.625


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 5, Class dist.: [ 612  478 2665 8748], F1: 0.632

CV F1: 0.651 +/- 0.020


# Accuracy Score

In [34]:
#### cross validation (accuracy)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_s4, y_train_cv_s4)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_s4.fit(X_train_cv_s4.iloc[train, :], y_train_cv_s4.iloc[train])
    score = accuracy_score(y_true=y_train_cv_s4.iloc[test],
                           y_pred=clf_etc_s4.predict(X_train_cv_s4.iloc[test, :]))

    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train_cv_s4.iloc[train]), score))

print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.6s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [ 612  477 2664 8748], Acc: 0.859


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [ 612  477 2665 8748], Acc: 0.864


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.9s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [ 612  478 2665 8748], Acc: 0.863


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.6s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [ 612  478 2665 8748], Acc: 0.861


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [ 612  478 2665 8748], Acc: 0.860

CV Acc: 0.861 +/- 0.002


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [42]:
# Save the model.
# The cross-validation cells leave the estimator fitted on the last fold's
# training split only, so refit on the whole S4 segment before persisting it.
clf_etc_s4.fit(X_train_cv_s4, y_train_cv_s4)

joblib.dump(clf_etc_s4, 'extra_tree_for_s4.pkl')

['extra_tree_for_s4.pkl']

# Outlier 집단 모델링

In [35]:
# Extra-trees classifier for the outlier segment (unfitted at this point).
clf_etc_outlier = ExtraTreesClassifier(
    verbose=1,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_estimators=300,
    random_state=7,
    n_jobs=-1,
)

In [36]:
# Outlier-segment modelling inputs: every column after the first (the id) as
# features, and the second column of the label frame as the target.
X_train_cv_outlier = X_train_outlier.iloc[:, 1:].copy()
y_train_cv_outlier = y_train_outlier.iloc[:, 1].copy()

# F1

In [37]:
#### cross validation (F1)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_outlier, y_train_cv_outlier)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_outlier.fit(X_train_cv_outlier.iloc[train, :], y_train_cv_outlier.iloc[train])
    score = f1(X_train_cv_outlier.iloc[test, :], y_train_cv_outlier.iloc[test], clf_etc_outlier, dic)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1, np.bincount(y_train_cv_outlier.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [ 247 2429 2590  795], F1: 0.098


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.8s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [ 247 2429 2590  795], F1: 0.231


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [ 247 2430 2590  795], F1: 0.169


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [ 247 2430 2591  795], F1: 0.212


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [ 248 2430 2591  796], F1: 0.287

CV F1: 0.200 +/- 0.063


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


# Accuracy Score

In [38]:
#### cross validation (accuracy)
# `random_state` is omitted: without shuffle=True it has no effect (the folds
# are deterministic either way) and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_outlier, y_train_cv_outlier)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row indices, so select with .iloc throughout.
    clf_etc_outlier.fit(X_train_cv_outlier.iloc[train, :], y_train_cv_outlier.iloc[train])
    score = accuracy_score(y_true=y_train_cv_outlier.iloc[test],
                           y_pred=clf_etc_outlier.predict(X_train_cv_outlier.iloc[test, :]))

    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train_cv_outlier.iloc[train]), score))

print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [ 247 2429 2590  795], Acc: 0.494


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [ 247 2429 2590  795], Acc: 0.522


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [ 247 2430 2590  795], Acc: 0.513


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [ 247 2430 2591  795], Acc: 0.500


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s


Fold: 5, Class dist.: [ 248 2430 2591  796], Acc: 0.496

CV Acc: 0.505 +/- 0.011


[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [43]:
# Save the model.
# The cross-validation cells leave the estimator fitted on the last fold's
# training split only, so refit on the whole outlier segment before persisting it.
clf_etc_outlier.fit(X_train_cv_outlier, y_train_cv_outlier)

joblib.dump(clf_etc_outlier, 'extra_tree_for_outlier.pkl')

['extra_tree_for_outlier.pkl']