In [None]:
# Load the Credit Card Fraud Detection dataset
import pandas as pd

# Path is relative to the notebook's working directory
data_path = './creditcard.csv'
raw_data = pd.read_csv(data_path)
# Show the first rows as a quick sanity check
raw_data.head()

In [None]:
# Inspect the column names (apart from Time, Amount and Class they are anonymised as Vn)
raw_data.columns

In [None]:
# The fraud rate is about 0.17%, so the Class label is extremely imbalanced.
# value_counts()[1] looks up the count for label 1; the result is a percentage rounded to 2 decimals.
frauds_rate = round(raw_data['Class'].value_counts()[1]/len(raw_data) * 100, 2)
print('Frauds', frauds_rate, '% of the dataset')

In [None]:
# Use a countplot to see how large the gap between the two classes is
import seaborn as sns
import matplotlib.pyplot as plt

# One bar per value of the Class column
sns.countplot(data=raw_data, x='Class')
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)
plt.show()

In [None]:
# Prepare the feature (X) and label (y) variables for a first training run / sanity check.
# X: every column except the first and the last; y: the last column.
# NOTE(review): assumes the first column is Time and the last is Class — confirm against raw_data.columns
X = raw_data.iloc[:, 1:-1]
y = raw_data.iloc[:, -1]

# Display both shapes to confirm the row counts match
X.shape, y.shape

In [None]:
# Split the data into training and validation (test) sets at a 7:3 ratio
from sklearn.model_selection import train_test_split

# test_size=0.3 gives the 7:3 split; stratify=y is passed so the split is stratified on the label;
# random_state=13 fixes the shuffle
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)

In [None]:
# Check that the class ratio was preserved by the split
import numpy as np

# Unique label values in y_train together with how often each occurs
np.unique(y_train, return_counts=True)

In [None]:
# Percentage of training labels that belong to the second unique value
# (presumably class 1 = fraud, given labels 0/1); tmp holds the per-class counts
tmp = np.unique(y_train, return_counts=True)[1]
tmp[1]/len(y_train) * 100

In [None]:
# Unique label values in y_test together with how often each occurs
np.unique(y_test, return_counts=True)

In [None]:
# Percentage of test labels that belong to the second unique value
# (presumably class 1 = fraud, given labels 0/1); tmp holds the per-class counts
tmp = np.unique(y_test, return_counts=True)[1]
tmp[1]/len(y_test) * 100

In [None]:
# 재사용성을 위해 모델의 성능 지표를 반환하는 함수를 미리 준비
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred):
  """Compute the classification metrics for one set of predictions.

  Args:
    y_test: ground-truth labels.
    pred: predicted values; passed unchanged to every metric function.

  Returns:
    Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
  """
  acc = accuracy_score(y_test, pred)
  pre = precision_score(y_test, pred)
  re = recall_score(y_test, pred)
  # Fixed: the original called the undefined name `f1_score_score` (NameError);
  # the function imported from sklearn.metrics is `f1_score`.
  f1 = f1_score(y_test, pred)
  # NOTE(review): AUC is computed from `pred` exactly as given. If callers pass
  # hard class labels rather than scores, this is a single-threshold AUC — confirm intent.
  auc = roc_auc_score(y_test, pred)

  return acc, pre, re, f1, auc

In [None]:
# 모델의 성능지표를 print하는 함수도 미리 준비
def print_clf_eval(y_test, pred):
  """Print the confusion matrix followed by the metrics from get_clf_eval.

  Args:
    y_test: ground-truth labels.
    pred: predicted values for the same samples.
  """
  matrix = confusion_matrix(y_test, pred)
  accuracy, precision, recall, f1, auc = get_clf_eval(y_test, pred)

  # Header, matrix and separator go out as three separate lines
  for line in ("=> 오차 행렬", matrix, "=========="):
    print(line)

  # Every metric is rendered with four decimal places
  print("정확도: {0:.4f}, 정밀도: {1:.4f}".format(accuracy, precision))
  print("재현율: {0:.4f}, F1: {1:.4f}, AUC: {2:.4f}".format(recall, f1, auc))

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lr_clf.fit(X_train, y_train)
# Fixed: the original wrote `lr.clf.predict`, which fails with NameError because
# no object named `lr` exists; the estimator is `lr_clf`.
lr_pred = lr_clf.predict(X_test)

print_clf_eval(y_test, lr_pred)

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Tree depth is capped at 3; random_state fixes the seed
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=3)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

# Print the confusion matrix and metrics for the test-set predictions
print_clf_eval(y_test, dt_pred)

In [None]:
# Random forest (ensemble method)
from sklearn.ensemble import RandomForestClassifier

# 100 trees, a single job (n_jobs=1), fixed seed
rf_clf = RandomForestClassifier(random_state=13, n_jobs=1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)

# Print the confusion matrix and metrics for the test-set predictions
print_clf_eval(y_test, rf_pred)

In [None]:
# LightGBM (ensemble method)
from lightgbm import LGBMClassifier

# 1000 estimators with up to 64 leaves each; verbose=-1 is passed to quiet training output
# NOTE(review): boost_from_average=False is presumably set because of the class imbalance — confirm
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1,
                          boost_from_average=False, verbose=-1)
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)

# Print the confusion matrix and metrics for the test-set predictions
print_clf_eval(y_test, lgbm_pred)

In [None]:
# 미리 학습한 후 모델 성능지표를 반환하는 함수 준비
def get_result(model, X_train, y_train, X_test, y_test):
  """Fit `model` on the training split and score its test-split predictions.

  Args:
    model: estimator exposing ``fit`` and ``predict``; it is (re)fitted in place.
    X_train, y_train: training features and labels.
    X_test, y_test: evaluation features and labels.

  Returns:
    Whatever ``get_clf_eval`` returns for the test-set predictions.
  """
  model.fit(X_train, y_train)
  return get_clf_eval(y_test, model.predict(X_test))

In [None]:
# 여러 모델의 성능을 쉽게 비교하도록 DataFrame으로 만드는 함수 준비
# Models to compare and their display names (the two lists are in the same order)
models = [lr_clf, dt_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'LightGBM']

def get_result_pd(models, model_names, X_train, y_train, X_test, y_test):
  """Train and score several models, collecting the metrics in one DataFrame.

  Args:
    models: estimators to evaluate; each one is refitted via ``get_result``.
    model_names: row labels, one per model, in the same order as `models`.
    X_train, y_train: training features and labels.
    X_test, y_test: evaluation features and labels.

  Returns:
    DataFrame indexed by `model_names` with the columns
    accuracy, precision, recall, f1 and roc_auc.
  """
  # One metrics tuple per model, in input order
  rows = [get_result(clf, X_train, y_train, X_test, y_test) for clf in models]

  return pd.DataFrame(
      rows,
      columns=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
      index=model_names,
  )

# Fit every model on the current split and show the metrics table
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
results

In [None]:
# Is there a way to improve the results? Take another look at Amount, the transaction amount.
plt.figure(figsize=(10,5))
# sns.distplot is deprecated and scheduled for removal; a density-normalised
# histplot with a KDE overlay is the documented replacement.
# bins=50 mirrors distplot's default upper limit on the number of bins.
sns.histplot(raw_data['Amount'], color='r', kde=True, stat='density', bins=50)

plt.show()

In [None]:
# What happens if we apply StandardScaler to the Amount column?
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set statistics leak into the scaling — confirm this is acceptable here.
amount_n = scaler.fit_transform(raw_data['Amount'].values.reshape(-1, 1))

# Take an explicit copy: adding a column to a bare iloc slice of raw_data triggers
# SettingWithCopyWarning and leaves it ambiguous whether raw_data is touched.
# The slice drops the first column and the last two columns.
raw_data_copy = raw_data.iloc[:, 1:-2].copy()
raw_data_copy['Amount_Scaled'] = amount_n
raw_data_copy.head()

In [None]:
# Plot the distribution of the scaled Amount
plt.figure(figsize=(10,5))
# sns.distplot is deprecated and scheduled for removal; a density-normalised
# histplot with a KDE overlay is the documented replacement.
# bins=50 mirrors distplot's default upper limit on the number of bins.
sns.histplot(raw_data_copy['Amount_Scaled'], color='r', kde=True, stat='density', bins=50)

plt.show()

In [None]:
# Split the data into training and test sets again, now using the scaled-Amount features
X = raw_data_copy

# Same split parameters as before (30% test, seed 13, stratify=y); y is reused unchanged
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)

In [None]:
# Check logistic regression, decision tree and LightGBM again (performance barely changes)
models = [lr_clf, dt_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'LightGBM']

# get_result_pd is called with the new split, so the metrics reflect the scaled features
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
results

In [None]:
# ROC 커브를 그려서 모델 확인해보기
from sklearn.metrics import roc_curve

def draw_roc_curve(models, model_names, X_test, y_test):
  """Plot the ROC curve of each fitted model on one shared figure.

  Args:
    models: fitted classifiers exposing ``predict_proba``.
    model_names: legend labels, one per model, in the same order as `models`.
    X_test: features used to score every model.
    y_test: ground-truth labels for `X_test`.
  """
  plt.figure(figsize=(10,10))

  for idx, model in enumerate(models):
    # Column 1 of predict_proba is used as the score for the ROC curve
    pred = model.predict_proba(X_test)[:, 1]
    # Fixed: the original passed the undefined name `y_Test` (NameError)
    fpr, tpr, _ = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label=model_names[idx])

  # Diagonal reference line; legend label typo 'random quess' corrected
  plt.plot([0,1], [0,1], 'k--', label='random guess')
  plt.title('ROC')
  plt.legend()
  plt.grid()
  plt.show()

# Plot the ROC curves of the models on the current test split
draw_roc_curve(models, model_names, X_test, y_test)

In [None]:
# Prepare toy data for a simple experiment
samples = [1, 7, 9, 16, 36, 39, 45, 45, 46, 48, 51, 100, 101]
# One constant y-value per sample so the points can be drawn on a single horizontal line
tmp_y = [1 for _ in samples]
tmp_y

In [None]:
# Scatter plot of the toy data used for the simple boxplot experiment
plt.figure(figsize=(12,4))
plt.scatter(samples, tmp_y)
plt.grid()
plt.show()

In [None]:
# Check a few summary statistics with numpy, starting with the median
np.median(samples)

In [None]:
# 25th percentile (first quartile)
np.percentile(samples, 25)

In [None]:
# 75th percentile (third quartile)
np.percentile(samples, 75)

In [None]:
# Interquartile range: 75th percentile minus 25th percentile
np.percentile(samples, 75) - np.percentile(samples, 25)

In [None]:
# Store the interquartile range and show 1.5 * IQR, the distance used for the fences below
iqr = np.percentile(samples, 75) - np.percentile(samples, 25)
iqr * 1.5

In [None]:
# Values needed to draw a boxplot by hand: quartiles, median and the 1.5 * IQR fences
q1, q3 = (np.percentile(samples, pct) for pct in (25, 75))
q2 = np.median(samples)
# `iqr` is the interquartile range computed in the preceding cell
lower_fence, upper_fence = q1 - iqr * 1.5, q3 + iqr * 1.5

In [None]:
# Hand-made boxplot: the sample points plus vertical lines for the quartiles and fences
plt.figure(figsize=(12,4))
plt.scatter(samples, tmp_y)
# Quartiles in black, median in red
plt.axvline(x=q1, color='black')
plt.axvline(x=q2, color='red')
plt.axvline(x=q3, color='black')
# Fences as dashed lines
plt.axvline(x=upper_fence, color='black', ls='dashed')
plt.axvline(x=lower_fence, color='black', ls='dashed')
plt.grid()
plt.show()

In [None]:
# The same data drawn with seaborn's built-in boxplot
plt.figure(figsize=(2,4))
# NOTE(review): `samples` is passed positionally; how seaborn interprets the first
# positional argument (and thus the box orientation) may depend on the seaborn version — confirm
sns.boxplot(samples)
plt.grid()
plt.show()

In [None]:
# Draw boxplots for the card data as well (columns V13, V14 and V15)
plt.figure(figsize=(10,7))
sns.boxplot(data=raw_data[['V13', 'V14', 'V15']])

In [None]:
# outlier의 인덱스를 찾기 위한 코드
def get_outlier(df=None, column=None, weight=1.5):
  """Return the index labels of class-1 rows whose `column` value is an IQR outlier.

  Only rows with ``df['Class'] == 1`` are considered. A value is an outlier when it
  lies below ``Q1 - weight * IQR`` or above ``Q3 + weight * IQR``, where the
  quartiles are computed on those rows only.

  Args:
    df: DataFrame containing a 'Class' column and `column`.
    column: name of the column to inspect.
    weight: multiplier applied to the IQR to position the fences (default 1.5).

  Returns:
    pandas Index holding the labels of the outlier rows (empty if there are none).
  """
  fraud = df[df['Class']==1][column]
  quantile_25 = np.percentile(fraud.values, 25)
  quantile_75 = np.percentile(fraud.values, 75)

  iqr = quantile_75 - quantile_25
  iqr_weight = iqr * weight
  lowest_val = quantile_25 - iqr_weight
  highest_val = quantile_75 + iqr_weight

  # Fixed: the original indexed the undefined name `frad` (NameError)
  outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index

  return outlier_index

In [None]:
# Outlier removal, step 1: display the index labels flagged as V14 outliers (nothing is dropped here)
get_outlier(df=raw_data, column='V14', weight=1.5)

In [None]:
# Shape of the feature frame before the outlier rows are dropped
raw_data_copy.shape

In [None]:
# Drop the flagged V14 outlier rows from the feature frame and show the new shape.
# NOTE: the drop mutates raw_data_copy in place, so this cell is not idempotent.
outlier_index = get_outlier(df=raw_data, column='V14', weight=1.5)
raw_data_copy.drop(outlier_index, axis=0, inplace=True)
raw_data_copy.shape

In [None]:
# Re-select X, y and the train/test split after outlier removal
X = raw_data_copy

# Build y from a filtered copy instead of dropping rows from raw_data in place:
# the in-place drop made this cell fail with KeyError on a second run and silently
# changed what earlier cells that read raw_data compute when re-executed.
y = raw_data.drop(outlier_index, axis=0).iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)

In [None]:
# Train again on the data with outliers removed
models = [lr_clf, dt_clf, lgbm_clf]
# Fixed label: lr_clf is a LogisticRegression model, so 'LinearReg' was wrong and
# inconsistent with the 'LogisticReg' label used in the earlier comparisons.
model_names = ['LogisticReg', 'DecisionTree', 'LightGBM']

results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
results

In [None]:
# ROC curves for the models on the current (outlier-free) test split
draw_roc_curve(models, model_names, X_test, y_test)