# 개요

* 아래의 목적/이유로 참가한 스터디에 대한 기록
  * SQLD취득 후 장기 미사용 & GPT를 통한 SQL사용 등으로 많이 잊은 SQL을 복기
  * 기존에 사용해 본 Optuna가 아닌 Autogluon이 커리큘럼에 있어 익혀보고자 함
  * 기존에 관심있던 XAI(설명가능한 AI)를 익히고자 함

* 4주차 과제 진행
  * 지정과제에 대한 EDA, 전처리, 데이터마트(CSV파일)만들기
    * 데이터마트는 sqlite3으로 DB형태로 만듦 
  * 변수에 대한 설명 확인
  * 수치/명목형 변수로 나누어 EDA 및 전처리 진행
  * 향후 Test데이터 등에도 사용하기 위해 전처리 함수로 정리

# 4주차 과제 

## 과제 설명

* 과제 : 월간 데이콘 신용카드 사용자 연체 예측 AI 경진대회
  * https://dacon.io/competitions/official/235713/overview/description
* 아래 내용 진행해보기
  * 불균형데이터에 대해 다양한 불균형처리기법 사용해보기
  * 최종 모델 결정해보기

## 전처리 해둔 데이터 읽고 데이터셋 나누기

In [None]:
from pkb_sqlite3 import DB_sqlite3

db_controller = DB_sqlite3('Dacon_creditcard_overdue.db')
df_train = db_controller.search_db_show_df('SELECT * FROM train')
df_train_pre = db_controller.search_db_show_df('SELECT * FROM train_pre')
df_test = db_controller.search_db_show_df('SELECT * FROM test_pre')
df_sample_submission = db_controller.search_db_show_df('SELECT * FROM sample_submission')

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

train = pd.concat([df_train_pre, df_train['credit']], axis=1)
x_test = df_test.copy()

x_train, x_validate = train_test_split(train, test_size=0.3, random_state=42, stratify=train['credit'])

## 불균형데이터 처리실습

### Undersampling - RUS

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

# RandomUnderSampler 적용
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_rus.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (6765, 35)


### Undersampling - ENN

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

# ENN 적용
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_enn.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (5712, 35)


### Undersampling - Tomek Links

In [None]:
from imblearn.under_sampling import TomekLinks

# Tomek Links 적용
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_tomek.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (14117, 35)


### Oversampling - ROS

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random Oversampling 적용
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_ros.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (35631, 35)


### Oversampling - SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE Oversampling
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_smote.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (35631, 35)


### Oversampling - ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

# ADASYN Oversampling
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_adasyn.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (35665, 35)


### Hybrid Method - SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN

# SMOTEENN Oversampling
smoteenn = SMOTEENN(random_state=42)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_smoteenn.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (10892, 35)


### Hybrid Method - SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

# SMOTETomek Oversampling
smotetomek = SMOTETomek(random_state=42)
X_smotetomek, y_smotetomek = smotetomek.fit_resample(x_train.drop(columns=['credit']), x_train['credit'])

In [None]:
print(f"""* 적용 전 {x_train.drop(columns=['credit']).shape}
* 적용 후 {X_smotetomek.shape}""")

* 적용 전 (18519, 35)
* 적용 후 (31997, 35)


## LightGBM활용한 불균형처리별 성능비교

* 수업 중 가장 빠른 모델이었던 LightGBM을 활용해서 비교

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

def compute_for_samplers(train, test, sampler, return_metric_only= False):
  
  X_sampled, y_sampled = sampler.fit_resample(train.drop(['credit'], axis = 1), train["credit"])
  model = LGBMClassifier(random_state = 42)
  model.fit(X_sampled, y_sampled)
  y_pred = model.predict(test.drop(['credit'], axis = 1))
  y_proba = model.predict_proba(test.drop(['credit'], axis = 1))
  y_test = test['credit']

  accuracy = accuracy_score(y_test, y_pred)
  logloss = log_loss(y_test, y_proba)
  cf = confusion_matrix(y_test, y_pred)
  if return_metric_only:
    return accuracy, cf, auc
  else:
    return {'acc':accuracy,
            'logloss':logloss,
            'cf':cf,
            'X_sampled' : X_sampled,
            'y_sampled' : y_sampled,
            'model' : model,
            'y_pred' : y_pred,
            'y_proba' : y_proba}

In [None]:
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

sampler = {'RUS':RandomUnderSampler(random_state=42),
            'ENN':EditedNearestNeighbours(),
            'TOMEKLINKS':TomekLinks(),
            'ROS':RandomOverSampler(random_state=42),
            'SMOTE':SMOTE(random_state=42),
            'ADASYN':ADASYN(random_state=42),
            'SMOTEENN':SMOTEENN(random_state=42),
            'SMOTETomek':SMOTETomek(random_state=42)}
sampler_result = dict()

In [None]:
for each_sampler in sampler:
    print(f'\n{each_sampler}')
    sampler_result[each_sampler] = compute_for_samplers(x_train, 
                                                        x_validate, 
                                                        sampler[each_sampler]
                                                        )


RUS
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1262
[LightGBM] [Info] Number of data points in the train set: 6765, number of used features: 33
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612

ENN
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 5712, number of used features: 33
[LightGBM] [Info] Start training from score -0.929419
[LightGBM] [Info] Start training from score -4.607273
[LightGBM] [Info] Start training from score -0.518794

TOMEKLINKS
[LightGBM] [In

In [None]:
df = pd.DataFrame(sampler_result)
df.loc[['acc','logloss','cf']].transpose().sort_values('logloss')

Unnamed: 0,acc,logloss,cf
ADASYN,0.692618,0.789301,"[[12, 139, 816], [7, 456, 1417], [7, 54, 5030]]"
SMOTE,0.69287,0.793357,"[[5, 137, 825], [3, 455, 1422], [4, 47, 5040]]"
SMOTETomek,0.691358,0.797204,"[[10, 141, 816], [13, 449, 1418], [4, 58, 5029]]"
TOMEKLINKS,0.692996,0.808826,"[[36, 119, 812], [24, 405, 1451], [14, 17, 5060]]"
ROS,0.612875,0.930659,"[[281, 202, 484], [190, 780, 910], [737, 550, ..."
RUS,0.504661,1.011385,"[[378, 255, 334], [412, 834, 634], [1293, 1004..."
SMOTEENN,0.325397,1.237064,"[[387, 426, 154], [558, 1002, 320], [1889, 200..."
ENN,0.589317,2.293334,"[[357, 0, 610], [633, 2, 1245], [772, 0, 4319]]"


* Under/Over/Hybrid Sampling별로 1가지씩 해보는 것으로 결정
  * TOMEKLINKS
  * ADASYN
  * SMOTETomek

## 모델 성능비교 및 선택

* 전처리한 기본 데이터를 기준으로 모델 비교 해보기(Weight 미적용)

In [None]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import time

# 모델 리스트
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
}

rsts = {}

sampler = {'TOMEKLINKS':TomekLinks(),
           'ADASYN':ADASYN(random_state=42),
           'SMOTETomek':SMOTETomek(random_state=42)}

for each_sample_type in sampler:
    X_train, y_train = sampler[each_sample_type].fit_resample(x_train.drop(['credit'], axis = 1), x_train["credit"])
    X_test, y_test = sampler[each_sample_type].fit_resample(x_validate.drop(['credit'], axis = 1), x_validate["credit"])

    # 학습 및 평가
    for name, model in models.items():
        print(f"\n{name} + {each_sample_type}")
        start = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        logloss = log_loss(y_test, y_proba)
        cf = confusion_matrix(y_test, y_pred)
        rsts[f"{name} + {each_sample_type}"] = {'acc':accuracy,
                    'logloss':logloss,
                    'cf':cf,
                    'model' : model,
                    'y_pred' : y_pred,
                    'y_proba' : y_proba,
                    'time' : time.time() - start,
                    'classification_report':classification_report(y_test, y_pred)}
        
        print(f"""* logloss : {logloss}""")


Random Forest + TOMEKLINKS
* logloss : 0.7620037176688708

LightGBM + TOMEKLINKS
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 14117, number of used features: 33
[LightGBM] [Info] Start training from score -1.834230
[LightGBM] [Info] Start training from score -1.759900
[LightGBM] [Info] Start training from score -0.403166
* logloss : 0.7692630280717329

XGBoost + TOMEKLINKS
* logloss : 0.7849133307714561

CatBoost + TOMEKLINKS
* logloss : 0.7740753039975535

Extra Trees + TOMEKLINKS
* logloss : 0.8551848758758333

Random Forest + ADASYN
* logloss : 0.7478335144023597

LightGBM + ADASYN
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003083 seconds.
You can set `force_col_wise=tr

In [None]:
df_results = pd.DataFrame(rsts).loc[['acc','logloss','cf','time'],:]
df_results.transpose().sort_values('logloss')

Unnamed: 0,acc,logloss,cf,time
\nLightGBM + ADASYN,0.637518,0.74722,"[[3159, 1178, 872], [1976, 1551, 1451], [7, 54...",0.650462
\nRandom Forest + ADASYN,0.64354,0.747834,"[[3166, 1339, 704], [1861, 1966, 1151], [126, ...",9.090805
\nRandom Forest + TOMEKLINKS,0.717713,0.762004,"[[138, 120, 709], [29, 278, 662], [79, 60, 3802]]",2.631492
\nLightGBM + SMOTETomek,0.623706,0.765017,"[[2341, 1148, 754], [1543, 1491, 1199], [4, 41...",0.58901
\nXGBoost + ADASYN,0.630384,0.765741,"[[2817, 1532, 860], [1738, 1868, 1372], [21, 1...",0.861137
\nCatBoost + ADASYN,0.635293,0.767444,"[[2834, 1522, 853], [1671, 1919, 1388], [18, 1...",17.005267
\nLightGBM + TOMEKLINKS,0.711247,0.769263,"[[36, 119, 812], [18, 228, 723], [11, 14, 3916]]",0.349786
\nRandom Forest + SMOTETomek,0.629805,0.769979,"[[2352, 1274, 617], [1454, 1832, 947], [98, 22...",7.562343
\nCatBoost + TOMEKLINKS,0.709035,0.774075,"[[57, 116, 794], [34, 232, 703], [23, 40, 3878]]",6.656712
\nCatBoost + SMOTETomek,0.622261,0.781963,"[[2110, 1413, 720], [1313, 1784, 1136], [20, 1...",15.503353


* logloss와 time을 기준으로 아래 2가지 조합 선정
  * LightGBM + ADASYN
  * Random Forest + TOMEKLINKS

## 불균형 처리 + 모델선택 Case별 비교

### 앞서 테스트를 통해 구한 조합에 대해 테스트

* 아래 조합으로 테스트
  * LightGBM (Weighted)
  * LightGBM + ADASYN
  * Random Forest (Weighted)
  * Random Forest + TOMEKLINKS

In [None]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
import time

# 모델 리스트
models = {
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "LightGBM": LGBMClassifier(class_weight="balanced", random_state=42),
}

rsts_weight = {}

sampler = {'TOMEKLINKS':TomekLinks(),
           'ADASYN':ADASYN(random_state=42)
           }

for each_sample_type in sampler:
    X_train, y_train = sampler[each_sample_type].fit_resample(x_train.drop(['credit'], axis = 1), x_train["credit"])
    X_test, y_test = sampler[each_sample_type].fit_resample(x_validate.drop(['credit'], axis = 1), x_validate["credit"])

    # 학습 및 평가
    for name, model in models.items():
        print(f"\n{name} + {each_sample_type}")
        start = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        logloss = log_loss(y_test, y_proba)
        cf = confusion_matrix(y_test, y_pred)
        rsts_weight[f"{name} + {each_sample_type}"] = {'acc':accuracy,
                    'logloss':logloss,
                    'cf':cf,
                    'model' : model,
                    'y_pred' : y_pred,
                    'y_proba' : y_proba,
                    'time' : time.time() - start,
                    'classification_report':classification_report(y_test, y_pred)}


Random Forest + TOMEKLINKS

LightGBM + TOMEKLINKS
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 14117, number of used features: 33
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612

Random Forest + ADASYN

LightGBM + ADASYN
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7336
[LightGBM] [Info] Number of data points in the train set: 35665, number of used features: 33
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [None]:
df_results_weight = pd.DataFrame(rsts_weight).loc[['acc','logloss','cf','time'],:]
df_results_weight.transpose().sort_values('logloss')

Unnamed: 0,acc,logloss,cf,time
LightGBM + ADASYN,0.633198,0.750138,"[[3019, 1317, 873], [1902, 1624, 1452], [6, 54...",1.112855
Random Forest + TOMEKLINKS,0.717543,0.751406,"[[134, 110, 723], [31, 260, 678], [68, 50, 3823]]",2.637113
Random Forest + ADASYN,0.6421,0.757042,"[[3172, 1311, 726], [1891, 1940, 1147], [127, ...",9.652027
LightGBM + TOMEKLINKS,0.602008,0.92511,"[[305, 189, 473], [122, 377, 470], [658, 427, ...",0.504209


### 추가 테스트

* 이진분류가 아닌 경우에 대해 한번 해보고 싶어서 적용
  * Catboost (Weighted)
  * XGBoost (Weighted)

In [None]:
# 클래스 비율 계산
class_counts = x_train['credit'].value_counts()
class_weights = {cls: max(class_counts) / count for cls, count in class_counts.items()}
class_weights

{2.0: 1.0, 1.0: 2.707317073170732, 0.0: 5.266962305986696}

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import time

# 모델 리스트
models = {
    # XGBoost : eval_metric="mlogloss
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
    # CatBoost : {0.0: 1.0, 1.0: 2.707317073170732, 2.0: 5.266962305986696}로 설정
    "CatBoost": CatBoostClassifier(class_weights=[1.0, 2.707317073170732, 5.266962305986696]
                                   , verbose=0, random_state=42),
}

X_train = x_train.drop(['credit'], axis = 1)
y_train = x_train['credit']
X_test = x_validate.drop(['credit'], axis = 1)
y_test = x_validate['credit']
rsts_additional = {}

# 학습 및 평가
for name, model in models.items():
    print(f"\n{name}")
    start = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logloss = log_loss(y_test, y_proba)
    cf = confusion_matrix(y_test, y_pred)
    rsts_additional[name] = {'acc':accuracy,
                  'logloss':logloss,
                  'cf':cf,
                  'model' : model,
                  'y_pred' : y_pred,
                  'y_proba' : y_proba,
                  'time' : time.time() - start,
                  'classification_report':classification_report(y_test, y_pred, zero_division=0)
                  }
    
    print(f"""* logloss : {logloss}""")


XGBoost
* logloss : 0.7854300507522355

CatBoost
* logloss : 0.8862784774127822


In [None]:
df_results_additional = pd.DataFrame(rsts_additional).loc[['acc','logloss','cf','time'],:]
df_results_additional.transpose().sort_values('logloss')

Unnamed: 0,acc,logloss,cf,time
XGBoost,0.696019,0.78543,"[[56, 142, 769], [20, 524, 1336], [26, 120, 49...",1.066831
CatBoost,0.693122,0.886278,"[[0, 134, 833], [0, 444, 1436], [0, 33, 5058]]",10.805283
