In [17]:
from sklearn.externals import joblib

## 이 파일은 유저가 게임에 현금을 지불했는지 여부에 따라 Payment 집단과 Non-payment 집단으로
## 나누어, Sklearn 모듈의 ExtraTrees 알고리즘을 이용한 모델링을 진행했습니다.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [2]:
from sklearn.metrics import accuracy_score

In [3]:
# Helper used to check a model's F1 score.
def f1(X_val, y_val, model, mapping):
    """
    Score a fitted multiclass classifier on a validation set.

    The score is the harmonic mean of every per-class precision and every
    per-class recall taken together (2 * n_classes values in total).

    Parameters
    ----------
    X_val : features passed to ``model.predict``.
    y_val : true labels for ``X_val``.
    model : fitted estimator exposing ``predict``.
    mapping : label-name -> code mapping. Kept for backward compatibility;
        it is not needed to compute the score.

    Returns
    -------
    float
        The harmonic mean described above, or 0.0 as soon as any per-class
        precision or recall is 0.
    """
    y_pred = model.predict(X_val)

    # Per-class precision / recall (average=None returns one value per class).
    pre = np.asarray(precision_score(y_true=y_val, y_pred=y_pred, average=None), dtype=float)
    rec = np.asarray(recall_score(y_true=y_val, y_pred=y_pred, average=None), dtype=float)

    # A harmonic mean containing a zero term is zero. Returning early avoids
    # the divide-by-zero warnings the reciprocal below would otherwise emit.
    if (pre == 0).any() or (rec == 0).any():
        return 0.0

    # Number of averaged terms; this used to be hard-coded as 8 (= 2 * 4 classes).
    n_terms = pre.size + rec.size
    return float(n_terms / ((1.0 / pre).sum() + (1.0 / rec).sum()))

In [4]:
# Load the raw train/test feature tables and the training labels. The raw
# tables are used later to pick out the users who never paid.
train_data, test_data, train_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Features/train_every_features_0906.csv',
        '../Features/test_every_features_0906.csv',
        '../lite_data/train_label_lite.csv',
    )
)

In [5]:
# Load the feature matrices used for the train set and the test set.
X_train1, X_test1 = map(
    pd.read_csv,
    ('../wook_workspace/X_train_extreme.csv', '../wook_workspace/X_test_extreme.csv'),
)

In [6]:
# Put the first column of each raw table in front of the matching feature
# matrix (presumably the user id column; confirm against the CSV schema).
X_train2 = pd.concat((train_data.iloc[:, 0], X_train1), axis='columns')
X_test2 = pd.concat((test_data.iloc[:, 0], X_test1), axis='columns')

In [7]:
## First split the users by whether they ever paid.
# Rows whose payment_amount_sum_stat is present in the raw table go to the
# payment segment; rows where it is missing go to the non-payment segment.
X_train_payment = X_train2[train_data['payment_amount_sum_stat'].notna()]
X_test_payment = X_test2[test_data['payment_amount_sum_stat'].notna()]

X_train_non_payment = X_train2[train_data['payment_amount_sum_stat'].isna()]
X_test_non_payment = X_test2[test_data['payment_amount_sum_stat'].isna()]

In [8]:
# Then replace the remaining missing values with 0 in every segment.
X_train_payment, X_test_payment = (
    segment.fillna(0) for segment in (X_train_payment, X_test_payment)
)

X_train_non_payment, X_test_non_payment = (
    segment.fillna(0) for segment in (X_train_non_payment, X_test_non_payment)
)

In [9]:
# The filtered segments keep the row labels of the full table, so the index
# has gaps. Renumber the rows from 0 and separate each segment into its
# first column (the id) and the remaining feature columns.

X_train_payment_id = X_train_payment.iloc[:, 0].reset_index(drop=True)
X_train_payment_feature = X_train_payment.iloc[:, 1:].reset_index(drop=True)


X_test_payment_id = X_test_payment.iloc[:, 0].reset_index(drop=True)
X_test_payment_feature = X_test_payment.iloc[:, 1:].reset_index(drop=True)

In [10]:
# Split the labels into paying and non-paying users with the same masks
# that were used for the features.
label_payment = train_label[train_data['payment_amount_sum_stat'].notna()]
label_non_payment = train_label[train_data['payment_amount_sum_stat'].isna()]

In [11]:
# The filtered label frames have gaps in their index; renumber the rows
# from 0 so they line up positionally with the feature frames.

label_payment = label_payment.reset_index(drop=True)
label_non_payment = label_non_payment.reset_index(drop=True)

In [12]:
# Integer code assigned to each label name.
label_map = {'retained':0,'2month':1,'month':2,'week':3}

# Replace the label strings stored in the second column with their integer codes.
# Assigning through the column name swaps in a new integer column on every
# pandas version. The previous `.iloc[:, 1] = ...` writes in place on
# pandas >= 2.0 and leaves an object-dtype column, which np.bincount and
# scikit-learn reject. Passing the frame's own index to the Series keeps the
# assignment correct even if the index is not 0..n-1.
# An unknown label still raises KeyError.
label_payment[label_payment.columns[1]] = pd.Series(
    [label_map[name] for name in label_payment.label],
    index=label_payment.index,
)
label_non_payment[label_non_payment.columns[1]] = pd.Series(
    [label_map[name] for name in label_non_payment.label],
    index=label_non_payment.index,
)

In [13]:
# Independent copies of the payment-segment features and of the encoded
# labels (second column of label_payment) for cross-validation.
X_train_cv_pay = X_train_payment_feature.copy(deep=True)
y_train_cv_pay = label_payment.iloc[:, 1].copy(deep=True)

In [14]:
# Extra-trees classifier for the payment segment.
clf_etc_pay = ExtraTreesClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=7,
    verbose=1,
)

## 먼저 accuracy score로 평가

In [15]:
#### cross validation

### accuracy score
# Stratified 5-fold CV on the payment segment. The folds are not shuffled, so
# random_state is omitted: it had no effect on the splits, and
# scikit-learn >= 0.24 raises ValueError when it is set with shuffle=False.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_pay, y_train_cv_pay)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row numbers, so always index with .iloc.
    clf_etc_pay.fit(X_train_cv_pay.iloc[train, :], y_train_cv_pay.iloc[train])
    score = accuracy_score(
        y_true=y_train_cv_pay.iloc[test],
        y_pred=clf_etc_pay.predict(X_train_cv_pay.iloc[test, :]),
    )

    scores.append(score)
    # Report each fold once (this line used to be duplicated).
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,np.bincount(y_train_cv_pay.iloc[train]), score))


print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [7608  897  700  634], Acc: 0.791
Fold: 1, Class dist.: [7608  897  700  634], Acc: 0.791


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [7609  897  701  634], Acc: 0.790
Fold: 2, Class dist.: [7609  897  701  634], Acc: 0.790


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [7609  898  701  634], Acc: 0.790
Fold: 3, Class dist.: [7609  898  701  634], Acc: 0.790


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [7609  898  701  635], Acc: 0.792
Fold: 4, Class dist.: [7609  898  701  635], Acc: 0.792


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.7s finished


Fold: 5, Class dist.: [7609  898  701  635], Acc: 0.790
Fold: 5, Class dist.: [7609  898  701  635], Acc: 0.790

CV Acc: 0.791 +/- 0.001


[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


## 다음 F1 스코어로 평가

In [16]:
#### cross validation
# Stratified 5-fold CV on the payment segment, scored with the f1 helper.
# The folds are not shuffled, so random_state is omitted: it had no effect on
# the splits, and scikit-learn >= 0.24 raises ValueError when it is set with
# shuffle=False.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_pay, y_train_cv_pay)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row numbers, so always index with .iloc.
    clf_etc_pay.fit(X_train_cv_pay.iloc[train, :], y_train_cv_pay.iloc[train])
    score = f1(X_train_cv_pay.iloc[test, :], y_train_cv_pay.iloc[test], clf_etc_pay, label_map)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train_cv_pay.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.6s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [7608  897  700  634], F1: 0.092


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 2, Class dist.: [7609  897  701  634], F1: 0.080


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [7609  898  701  634], F1: 0.102


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.2s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [7609  898  701  635], F1: 0.060


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.5s finished


Fold: 5, Class dist.: [7609  898  701  635], F1: 0.069

CV F1: 0.080 +/- 0.015


[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


In [19]:
# Save the trained payment-segment model.
# The cross-validation loops leave clf_etc_pay fitted on the training split of
# the last fold only, so refit on the whole payment segment before persisting.
clf_etc_pay.fit(X_train_cv_pay, y_train_cv_pay)
joblib.dump(clf_etc_pay,'extra_tree_payment.pkl')

['extra_tree_payment.pkl']

## non payment segment 모델링

> 현금 지불 이력이 전혀 없는 유저를 대상으로 모델링 진행

In [20]:
# Same preparation as for the payment segment: renumber the rows from 0 and
# separate the first column (the id) from the remaining feature columns.
X_train_non_payment_id = X_train_non_payment.iloc[:, 0].reset_index(drop=True)
X_train_non_payment_feature = X_train_non_payment.iloc[:, 1:].reset_index(drop=True)


X_test_non_payment_id = X_test_non_payment.iloc[:, 0].reset_index(drop=True)
X_test_non_payment_feature = X_test_non_payment.iloc[:, 1:].reset_index(drop=True)

In [21]:
# Independent copies of the non-payment features and of the encoded labels
# (second column of label_non_payment) for cross-validation.
X_train_cv_nonpay = X_train_non_payment_feature.copy(deep=True)
y_train_cv_nonpay = label_non_payment.iloc[:, 1].copy(deep=True)

In [22]:
# Extra-trees classifier for the non-payment segment.
clf_etc_nonpay = ExtraTreesClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=19,
    max_features=350,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=7,
    verbose=1,
)

## accuracy score로 평가

In [24]:
#### cross validation

### accuracy score
# Stratified 5-fold CV on the non-payment segment. The folds are not shuffled,
# so random_state is omitted: it had no effect on the splits, and
# scikit-learn >= 0.24 raises ValueError when it is set with shuffle=False.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_nonpay, y_train_cv_nonpay)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row numbers, so always index with .iloc.
    clf_etc_nonpay.fit(X_train_cv_nonpay.iloc[train, :], y_train_cv_nonpay.iloc[train])
    score = accuracy_score(
        y_true=y_train_cv_nonpay.iloc[test],
        y_pred=clf_etc_nonpay.predict(X_train_cv_nonpay.iloc[test, :]),
    )

    scores.append(score)
    # Report each fold once (this line used to be duplicated).
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,np.bincount(y_train_cv_nonpay.iloc[train]), score))


print('\nCV Acc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   56.8s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


Fold: 1, Class dist.: [12391 19102 19299 19365], Acc: 0.738
Fold: 1, Class dist.: [12391 19102 19299 19365], Acc: 0.738


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   56.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


Fold: 2, Class dist.: [12391 19102 19299 19365], Acc: 0.743
Fold: 2, Class dist.: [12391 19102 19299 19365], Acc: 0.743


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   56.8s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 3, Class dist.: [12391 19102 19299 19366], Acc: 0.738
Fold: 3, Class dist.: [12391 19102 19299 19366], Acc: 0.738


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   55.4s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 4, Class dist.: [12391 19103 19299 19366], Acc: 0.734
Fold: 4, Class dist.: [12391 19103 19299 19366], Acc: 0.734


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   56.8s finished


Fold: 5, Class dist.: [12392 19103 19300 19366], Acc: 0.732
Fold: 5, Class dist.: [12392 19103 19300 19366], Acc: 0.732

CV Acc: 0.737 +/- 0.004


[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


## F1 스코어로 평가

In [23]:
#### cross validation
# Stratified 5-fold CV on the non-payment segment, scored with the f1 helper.
# The folds are not shuffled, so random_state is omitted: it had no effect on
# the splits, and scikit-learn >= 0.24 raises ValueError when it is set with
# shuffle=False.
kfold = StratifiedKFold(n_splits=5).split(X_train_cv_nonpay, y_train_cv_nonpay)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional row numbers, so always index with .iloc.
    clf_etc_nonpay.fit(X_train_cv_nonpay.iloc[train, :], y_train_cv_nonpay.iloc[train])
    score = f1(X_train_cv_nonpay.iloc[test, :], y_train_cv_nonpay.iloc[test], clf_etc_nonpay, label_map)
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train_cv_nonpay.iloc[train]), score))

print('\nCV F1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   58.0s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 1, Class dist.: [12391 19102 19299 19365], F1: 0.722


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   55.7s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


Fold: 2, Class dist.: [12391 19102 19299 19365], F1: 0.729


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   52.4s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


Fold: 3, Class dist.: [12391 19102 19299 19366], F1: 0.725


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   54.1s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.3s finished


Fold: 4, Class dist.: [12391 19103 19299 19366], F1: 0.719


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   56.3s finished
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 300 out of 300 | elapsed:    0.2s finished


Fold: 5, Class dist.: [12392 19103 19300 19366], F1: 0.716

CV F1: 0.722 +/- 0.005


In [25]:
# Save the non-payment-segment model.
# The cross-validation loops leave clf_etc_nonpay fitted on the training split
# of the last fold only, so refit on the whole non-payment segment before
# persisting.
clf_etc_nonpay.fit(X_train_cv_nonpay, y_train_cv_nonpay)
joblib.dump(clf_etc_nonpay,'extra_tree_nonpayment.pkl')

['extra_tree_nonpayment.pkl']