In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit # 正負（0,1)の比率を変えずにデータを学習、検証用に分割

from sklearn.metrics import confusion_matrix # 混同行列
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score # 混同行列_精度指標値
from sklearn.metrics import roc_auc_score # ROC曲線

import lightgbm as lgb # LightGBM
from tqdm import tqdm # プログレスバー

# アンダーサンプリング
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# データ取得
creditcard = pd.read_csv("./creditcard.csv")

In [3]:
# データ全体の散布図表示
# data = creditcard[creditcard['Amount']<5000][['Time', 'Amount', 'Class']]
# plt.figure(figsize=[10,5])
# plt.grid()
# sns.scatterplot(data = data, x ='Time', y = 'Amount', hue = 'Class', alpha=0.8)

In [4]:
#  混同行列評価出力
def plot_confusion_matrix(predict, y_test):
    pred =  np.where(predict > 0.5, 1, 0)
    cm = confusion_matrix(y_test, pred)
    matrix = pd.DataFrame(cm)
    matrix.columns = [['予測_負例(0)', '予測_正例(1)']]
    matrix.index = [['実際_負例(0)', '実際_正例(1)']]
    return matrix

In [5]:
# LightBGMアルゴリズムで学習する
def lgbm_train(X_train, y_train):
    model = lgb.LGBMClassifier(objective='binary',
                               metric='auc',
                               boosting_type='gbdt',
                               num_leaves=30,
                               learning_rate=0.01,
                               feature_fraction=0.9,
                               subsample=0.8,
                               max_depth=12,
                               min_data_in_leaf=12
                              )
    model.fit(X_train, y_train)
    return model

In [6]:
# アンダーバギングの実施
def bagging(seed):
    # アンダーサンプリング（分類問題のターゲット目的変数が少ない方に数を合わせる）
    sampler = RandomUnderSampler(random_state=seed, replacement=True)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    #X_train2, X_valid, y_train2, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)
    #model_bagging = lgbm_train(X_train2, X_valid, y_train2, y_valid)
    
    #学習
    model_bagging = lgbm_train(X_resampled, y_resampled)
    
    return model_bagging

In [9]:
# データの目的変数の正負比率を変えずに学習、検証用に分割
def imbalanced_data_split(X, y, t_size):
    sss = StratifiedShuffleSplit(n_splits=1,
                                 train_size=0.9,
                                 test_size=t_size,
                                 random_state=42)
    
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        return X_train, X_test, y_train, y_test

In [7]:
# データ分割
X = creditcard.drop(columns='Class', axis=1)
y = creditcard['Class']

# 正負の比率確認
print(len(y[y==0]))
print(len(y[y==1]))

In [10]:
# 目的変数の比率を揃えてデータ分割
X_train, X_test, y_train, y_test = imbalanced_data_split(X, y, t_size=0.1)

# 正負の比率確認
print(len(y_train[y_train==0]))
print(len(y_train[y_train==1]))

print(len(y_test[y_test==0]))
print(len(y_test[y_test==1]))

In [13]:
# 学習
models = []
for i in tqdm(range(50)):
    models.append(bagging(i))

  2%|█▋                                                                                 | 1/50 [00:00<00:06,  7.13it/s]



100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:09<00:00,  5.36it/s]


In [14]:
# 予測
y_preds = []
for m in tqdm(models):
    y_preds.append(m.predict(X_test))

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 21.43it/s]


In [16]:
# 評価　作った学習モデル全てで判断した結果を、全学習モデル数で割って平均の答えを出す。フロート型になってしまうので後ほどintに変換
y_preds_bagging = sum(y_preds)/len(y_preds)
auc = roc_auc_score(y_test, y_preds_bagging)
print(f'AUC： {round(auc, 2)}')

AUC： 0.95


In [17]:
# 混同行列
matrix = plot_confusion_matrix(y_preds_bagging, y_test)
matrix

Unnamed: 0,予測_負例(0),予測_正例(1)
実際_負例(0),27573,859
実際_正例(1),4,45


In [18]:
# 評価

#intとfloatの比較だとエラーになるためintに揃える（切り捨てでなく四捨五入が必要かも）
y_preds_bagging_int = [int(num) for num in y_preds_bagging.tolist()]

print('Accuracy = ', accuracy_score(y_true=y_test, y_pred=y_preds_bagging_int).round(decimals=3))
print('Precision = ', precision_score(y_true=y_test, y_pred=y_preds_bagging_int, zero_division=0).round(decimals=3))
print('Recall = ', recall_score(y_true=y_test, y_pred=y_preds_bagging_int).round(decimals=3))
print('F1 score = ', f1_score(y_true=y_test, y_pred=y_preds_bagging_int).round(decimals=3))

Accuracy =  0.998
Precision =  0.457
Recall =  0.878
F1 score =  0.601


In [24]:
# 特徴量の重要度
importance = []
for m in tqdm(models):
    importance.append(pd.DataFrame(m.feature_importances_, index=X.columns, columns=['importance']))
    

100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 5204.11it/s]


In [37]:
# 各学習モデルの特徴量の重要度から平均値を算出する
importance_sum = sum(importance)/len(importance)

In [38]:
importance_sum.sort_values('importance', ascending=False, inplace=True)
display(importance_sum)

Unnamed: 0,importance
V14,313.22
V4,292.86
V12,167.48
Amount,152.48
V10,145.82
V20,143.48
V19,132.46
V17,117.92
V7,116.74
V8,115.06
