In [None]:
import pandas as pd
import numpy as np
from feature_engine.outliers import OutlierTrimmer
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
import keras.models as models
import keras.layers as layers
import keras.metrics as metrics
import keras.regularizers as regularizers
import tensorflow.keras.callbacks as callbacks
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm

# Silence warnings so library chatter does not flood the notebook output.
warnings.simplefilter('ignore', UserWarning)
# NOTE(review): this blanket filter ignores every warning category, which makes the
# UserWarning-only filter above redundant and also hides deprecation warnings.
warnings.filterwarnings(action='ignore')
# Seed NumPy's global RNG; TensorFlow is seeded separately where the models are built.
np.random.seed(0)

In [None]:
def get_clf_eval(y_test, pred):
    """Compute the standard binary-classification metrics for one set of predictions.

    Args:
        y_test: True binary labels.
        pred: Predicted scores (or 0/1 labels). Values strictly greater than 0.5
            are treated as the positive class for the threshold-based metrics.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``. ROC-AUC is computed
        from the raw ``pred`` values, the other four from the thresholded labels.
    """
    hard_labels = pred > 0.5

    accuracy = accuracy_score(y_test, hard_labels)
    precision, recall, f1 = (
        scorer(y_test, hard_labels, pos_label=1)
        for scorer in (precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_test, pred)

    return accuracy, precision, recall, f1, roc_auc

def print_clf_eval(y_test, pred):
    """Print the confusion matrix and the metrics returned by ``get_clf_eval``.

    Args:
        y_test: True binary labels.
        pred: Predicted scores; thresholded at 0.5 for the confusion matrix.
    """
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)
    matrix = confusion_matrix(y_test, pred > 0.5)

    for line in ("=Confusion matrix=", matrix, "=================="):
        print(line)

    print(f"Acc : {acc:.4f}, Pre : {pre:.4f}")
    print(f"Re : {re:.4f}, F1 : {f1:.4f}, AUC : {auc:.4f}")

In [None]:
data = pd.read_csv('wafer_data.csv')
data

# 데이터셋 소개
- 해당 데이터셋은 반도체 불량 검출을 위한 데이터셋으로 반도체 불량 여부가 class(0 or 1)로 포함되어있다.
- 반도체 공정의 기밀 유지를 위해 모든 feature의 이름은 삭제되어있다.
- feature_1~3 : numeric feature
- feature_4~1558 : binary feature
- class : binary class (0 : 양품, 1 : 불량)

In [None]:
data.info()

In [None]:
data.describe()

## *EDA는 생략되어 있습니다.

# 데이터 전처리

## X, Y 분리

In [None]:
# Separate the target column from the feature matrix.
Y = data['Class']
X = data.drop(columns=['Class'])

In [None]:
Y.value_counts()

## Train set, Test set 분리

In [None]:
# Stratified 80/20 split with a fixed seed so the class ratio is kept in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=777, stratify=Y
)
# Re-number the rows of every part from 0 so later positional fold indices line up.
for part in (X_train, X_test, Y_train, Y_test):
    part.reset_index(drop=True, inplace=True)

In [None]:
# 분리 확인
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
# Y 밸런스 확인
Y_train.value_counts()

## 모든 전처리 조합 찾기

In [None]:
class ModelHistory(callbacks.Callback):
    """Keras callback that records loss and accuracy at the end of every epoch.

    The values are accumulated in ``self.history`` under the keys ``loss``,
    ``binary_accuracy``, ``val_loss`` and ``val_binary_accuracy``. The same
    instance keeps appending if it is passed to several ``fit`` calls.
    """

    def __init__(self, logs=None):
        # `logs` is never used; it is kept only so existing call sites stay valid.
        super().__init__()
        self.history = {'loss' : [], 'binary_accuracy' : [], 'val_loss' : [], 'val_binary_accuracy' : []}

    def on_epoch_end(self, batch, logs=None):
        """Append the tracked metrics reported for the epoch that just finished.

        Args:
            batch: Epoch index passed by Keras (the name is kept for compatibility).
            logs: Metric dictionary for the epoch; may be ``None``.
        """
        logs = logs or {}
        for key, values in self.history.items():
            # Skip metrics that were not reported (e.g. no validation data)
            # instead of failing with a KeyError in the middle of training.
            if key in logs:
                values.append(logs[key])

In [None]:
def keras_log(X_train, Y_train, X_test, Y_test, class_weight, l2, patience, fold):
    """Train ten Keras logistic-regression models and return their mean test ROC-AUC.

    One single-unit sigmoid model is built per TensorFlow seed (10, 20, ..., 100).
    Each model is fitted once per stratified fold; the same model instance keeps
    training across folds (weights are not re-initialised between folds), with
    the held-out fold used as validation data for early stopping.

    Args:
        X_train: Training features (DataFrame).
        Y_train: Training labels (Series).
        X_test: Test features.
        Y_test: Test labels.
        class_weight: Mapping ``{class_label: weight}`` forwarded to ``fit``.
        l2: L2 regularisation factor for the dense layer's kernel.
        patience: Early-stopping patience in epochs.
        fold: Number of stratified folds.

    Returns:
        Mean ROC-AUC over the ten models on the test set, rounded to 8 decimals.
    """
    seeds = range(10, 101, 10)
    auc_scores = np.zeros(len(seeds))

    for slot, seed in enumerate(seeds):
        tf.random.set_seed(seed)
        model = models.Sequential(name=f"Keras_log_Random_state_{seed}")
        model.add(layers.Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],),
                               kernel_regularizer=regularizers.l2(l2)))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

        kfold = StratifiedKFold(n_splits=fold)
        cb = [callbacks.EarlyStopping(monitor='val_loss', patience=patience, min_delta=0.01)]

        for train_index, val_index in kfold.split(X_train, Y_train):
            # The splitter yields positional indices, so select rows with .iloc.
            X_fit, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
            Y_fit, Y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]
            # `use_multiprocessing` / `workers` only affect generator input and are
            # rejected by newer Keras releases, so they are not passed.
            model.fit(X_fit, Y_fit, validation_data=(X_val, Y_val),
                      epochs=50, verbose=0, callbacks=cb, class_weight=class_weight)

        auc_scores[slot] = roc_auc_score(Y_test, model.predict(X_test))

    return auc_scores.mean().round(8)

In [None]:
def _drop_constant_columns(train, test):
    """Return copies of both frames without the columns that are constant in ``train``."""
    constant = [col for col in train.columns if len(train[col].unique()) == 1]
    return train.drop(columns=constant), test.drop(columns=constant)


def _drop_correlated_features(train, test, target_corr, feature_corr, threshold):
    """Drop, for each feature pair with |corr| > threshold, the member less correlated with the target."""
    over_threshold = np.triu(feature_corr.to_numpy() > threshold, k=1)
    rows, cols = np.nonzero(over_threshold)
    # The row feature is dropped when its target correlation is <= the column
    # feature's; otherwise (including NaN comparisons) the column feature goes.
    weaker = np.where(target_corr[rows] <= target_corr[cols], rows, cols)
    to_drop = feature_corr.columns[np.unique(weaker)]
    return train.drop(columns=to_drop), test.drop(columns=to_drop)


def infinite_learning(data):
    """Grid-search preprocessing choices for the Keras logistic-regression model.

    Every combination of IQR trimming fold, feature-correlation threshold,
    scaler, imbalance handling ('weight' = class weights only, or an
    oversampler), L2 factor, early-stopping patience and fold count is scored
    with ``keras_log``. Each result line is printed and appended to
    ``infinite_learning.txt``.

    Args:
        data: DataFrame containing the feature columns and a ``Class`` column.

    Returns:
        None.
    """
    numeric_features = ["feature_1", "feature_2", "feature_3"]
    iqr_folds = [1, 1.25, 1.5, 3]
    corr_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
    scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler()]
    oversamplers = [
        'weight',
        SMOTE(random_state=0, k_neighbors=3),
        RandomOverSampler(random_state=0),
        BorderlineSMOTE(random_state=0, k_neighbors=3),
        # `n_jobs` is deprecated for ADASYN in imbalanced-learn, so it is not passed.
        ADASYN(random_state=0),
    ]
    l2s = [0.001, 0.0001]
    patiences = [10, 50]
    folds = [5, 10]

    X = data.drop(columns=['Class'])
    Y = data['Class']

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=777, stratify=Y)
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)

    X_train, X_test = _drop_constant_columns(X_train, X_test)

    # Duplicate rows are always kept in this search; the flag only appears in the log line.
    dup = True

    for iqr in iqr_folds:
        trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=iqr, variables=numeric_features)
        X_train_iqr = trimmer.fit_transform(X_train)
        # Trimming removes rows from the training set only; keep labels aligned.
        Y_train_iqr = Y_train.loc[X_train_iqr.index].reset_index(drop=True)
        X_train_iqr = X_train_iqr.reset_index(drop=True)

        # Both correlation tables depend only on the trimmed data, so they are
        # computed once per IQR fold rather than once per threshold.
        feature_corr = X_train_iqr.corr().abs()
        target_corr = np.array([
            abs(np.corrcoef(X_train_iqr[col], Y_train_iqr)[0][1])
            for col in feature_corr.columns
        ])

        for corr_th in corr_thresholds:
            X_train_corr, X_test_corr = _drop_correlated_features(
                X_train_iqr, X_test, target_corr, feature_corr, corr_th
            )
            # A numeric feature may itself have been pruned as correlated.
            present_numeric = [f for f in numeric_features if f in X_train_corr.columns]

            for scaler in scalers:
                X_train_scale = X_train_corr.copy()
                X_test_scale = X_test_corr.copy()
                for feature in present_numeric:
                    # Fit on the training column only, then apply the same transform to the test column.
                    X_train_scale[feature] = scaler.fit_transform(
                        X_train_corr[feature].to_numpy().reshape(-1, 1)).ravel()
                    X_test_scale[feature] = scaler.transform(
                        X_test_corr[feature].to_numpy().reshape(-1, 1)).ravel()

                for over in oversamplers:
                    # Resampling does not depend on l2/patience/fold, so it is done once here.
                    if isinstance(over, str):
                        X_train_over, Y_train_over = X_train_scale, Y_train_iqr.copy()
                    else:
                        X_train_over, Y_train_over = over.fit_resample(X_train_scale, Y_train_iqr)
                    X_train_over, X_test_over = _drop_constant_columns(X_train_over, X_test_scale)

                    counts = Y_train_over.value_counts()
                    weight_0 = (1 / counts[0]) * (counts.sum() / 2.0)
                    weight_1 = (1 / counts[1]) * (counts.sum() / 2.0)
                    class_weight = {0 : weight_0, 1 : weight_1}

                    for l2 in l2s:
                        for patience in patiences:
                            for fold in folds:
                                score = keras_log(X_train_over, Y_train_over, X_test_over, Y_test,
                                                  class_weight, l2, patience, fold)
                                result = f"dup:{str(dup)}, iqr:{str(iqr)}, corr_th:{str(corr_th)}, scaler:{str(scaler).split('Scaler')[0]}, oversampler:{str(over).split('(')[0]}, l2:{str(l2)}, patience:{str(patience)}, fold:{str(fold)}, score:{str(score)}"
                                print(result)

                                with open("infinite_learning.txt", "a") as f:
                                    f.write(result+'\n')

In [None]:
# Reload the raw CSV and run the exhaustive preprocessing search; every configuration's
# score is printed and appended to infinite_learning.txt.
# NOTE(review): the grid has 4*5*4*5*2*2*2 = 3,200 configurations and each one trains
# ten Keras models, so this cell is very slow to run.
data = pd.read_csv('wafer_data.csv')
infinite_learning(data)

## Train set Feature Scaling

## 중복 레코드 제거
- 특정 레코드에 과적합되는 것을 방지하기위해 중복 레코드 삭제

In [None]:
temp = pd.concat([X_train, Y_train], axis=1)

In [None]:
temp.drop_duplicates(keep='first', inplace=True)

In [None]:
# Split the de-duplicated frame back into features and labels.
X_train = temp.drop(columns=['Class'])
Y_train = temp['Class']

In [None]:
# Re-number rows from 0 in every part (in place).
for part in (X_train, Y_train, X_test, Y_test):
    part.reset_index(drop=True, inplace=True)

In [None]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

## 단일 항목을 가진 feature 제거
- 단일 항목을 가진 feature는 학습에 영향을 미치지 않고 추후 다른 항목을 가진 data를 예측할 때 정확도가 떨어질 수 있기 때문에 해당 feature는 삭제한다.

In [None]:
# Columns holding a single value in the training set are removed from both sets.
single_valued = [col for col in X_train.columns if len(X_train[col].unique()) == 1]
X_train.drop(columns=single_valued, inplace=True)
X_test.drop(columns=single_valued, inplace=True)

In [None]:
# 감소한 피쳐 수 확인
print(X_train.shape, X_test.shape)

## Outlier 처리 #0 안함

In [None]:
toScale = ["feature_1", "feature_2", "feature_3"]

## Outlier 처리 #1 iqr 기준 삭제
- Train set feature_1~3에 대해 IQR 범위 바깥의 이상치들을 제거함.

In [None]:
toScale = ["feature_1", "feature_2" , "feature_3"]

In [None]:
# Trim training rows using IQR-based limits (fold=3, both tails) on the columns in toScale.
# Only X_train is transformed here; the next cell re-aligns Y_train via X_train.index,
# which presumably still holds the surviving original row labels — TODO confirm.
trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=3, variables=toScale)
X_train = trimmer.fit_transform(X_train)

In [None]:
# Keep only the labels whose rows remain in each feature frame.
Y_train = Y_train[X_train.index]
Y_test = Y_test[X_test.index]

# Re-number rows from 0 in every part (in place).
for part in (X_train, Y_train, X_test, Y_test):
    part.reset_index(drop=True, inplace=True)

In [None]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

## Feature간 상관계수 계산
- 각 feature와 class 간 피어슨 상관계수를 계산하여 데이터프레임에 저장한다.
- 이후 각 feature 간 피어슨 상관계수를 계산하고 threshold를 넘는 상관관계를 가진 두 feature의 이름과 상관계수를 데이터프레임에 저장한다.

In [None]:
# Absolute Pearson correlation between each feature and the class label.
corr = [abs(np.corrcoef(X_train[col], Y_train)[0][1]) for col in X_train.columns]

corrX_Y = pd.DataFrame({'features': X_train.columns, 'corr': corr})

In [None]:
corr_mat = X_train.corr()

In [None]:
corr_mat = abs(corr_mat)
#corr_mat.to_csv('./corr_mat.csv', index=True)

In [None]:
# Collect every feature pair (upper triangle only, so each pair appears once)
# whose absolute correlation exceeds the threshold.
CORR_THRESHOLD = 0.7

# A vectorised mask replaces the element-by-element .iloc scan, which is very
# slow for a matrix with well over a thousand columns. np.nonzero returns the
# hits in row-major order, i.e. the same order the nested loops produced.
corr_values = corr_mat.to_numpy()
row_pos, col_pos = np.nonzero(np.triu(corr_values > CORR_THRESHOLD, k=1))

INDEX = corr_mat.index[row_pos].tolist()
COLUMN = corr_mat.columns[col_pos].tolist()
CORR = corr_values[row_pos, col_pos].tolist()

overCorr = pd.DataFrame({'INDEX': INDEX, 'COLUMN': COLUMN, 'CORR': CORR})

In [None]:
overCorr

## Feature간 상관계수 계산 후 제거
- 위에서 계산한 데이터프레임을 바탕으로 두 feature 중 class와의 상관계수가 낮은 feature를 삭제한다.

In [None]:
# For each highly correlated pair, drop the feature that is less correlated with
# the class label (the INDEX feature loses ties). Explicit membership checks
# replace the former bare `except`, which hid any unrelated error as well.
target_corr = dict(zip(corrX_Y['features'], corrX_Y['corr']))

to_drop = []
already_chosen = set()
for index_name, column_name in zip(overCorr['INDEX'], overCorr['COLUMN']):
    if index_name not in target_corr or column_name not in target_corr:
        continue
    if target_corr[index_name] <= target_corr[column_name]:
        weaker = index_name
    else:
        weaker = column_name
    # A feature can be the weaker member of several pairs; drop it only once,
    # and skip it if an earlier run of this cell already removed it.
    if weaker in already_chosen or weaker not in X_train.columns:
        continue
    already_chosen.add(weaker)
    to_drop.append(weaker)

X_train.drop(columns=to_drop, inplace=True)
X_test.drop(columns=to_drop, inplace=True, errors='ignore')

In [None]:
X_train.shape

## Feature Scaling
- Numeric feature인 feature_1~3의 단위를 맞추고 이상치 영향을 줄이기 위해 다양한 scale 방법을 시도한다.
- Test set은 train set을 scale 한 기준을 가지고 scale한다.

## Feature Scaling #0 안함

## Feature Scaling #1 StandardScaler

In [None]:
# Standardise each numeric feature: fit on the training column, reuse the fit for test.
scaler = StandardScaler()
for feature in toScale:
    train_col = X_train[feature].to_numpy().reshape(-1, 1)
    test_col = X_test[feature].to_numpy().reshape(-1, 1)
    X_train.loc[:, feature] = scaler.fit_transform(train_col)
    X_test.loc[:, feature] = scaler.transform(test_col)

## Feature Scaling #2 MinMaxScaler

In [None]:
# Min-max scale each numeric feature: fit on the training column, reuse the fit for test.
mmScaler = MinMaxScaler()
for feature in toScale:
    train_col = X_train[feature].to_numpy().reshape(-1, 1)
    test_col = X_test[feature].to_numpy().reshape(-1, 1)
    X_train.loc[:, feature] = mmScaler.fit_transform(train_col)
    X_test.loc[:, feature] = mmScaler.transform(test_col)

## Feature Scaling #3 RobustScaler

In [None]:
# Robust-scale each numeric feature: fit on the training column, reuse the fit for test.
robustScaler = RobustScaler()
for feature in toScale:
    train_col = X_train[feature].to_numpy().reshape(-1, 1)
    test_col = X_test[feature].to_numpy().reshape(-1, 1)
    X_train.loc[:, feature] = robustScaler.fit_transform(train_col)
    X_test.loc[:, feature] = robustScaler.transform(test_col)

## Feature Scaling #4 MaxAbsScaler

In [None]:
# Max-abs scale each numeric feature: fit on the training column, reuse the fit for test.
maxabsScaler = MaxAbsScaler()
for feature in toScale:
    train_col = X_train[feature].to_numpy().reshape(-1, 1)
    test_col = X_test[feature].to_numpy().reshape(-1, 1)
    X_train.loc[:, feature] = maxabsScaler.fit_transform(train_col)
    X_test.loc[:, feature] = maxabsScaler.transform(test_col)

## 클래스 불균형
- 이 데이터셋은 극심한 클래스 불균형을 이루고 있어 그대로 학습하면 모델이 올바르게 학습하지 못하는 결과를 낳는다.
- 따라서 오버샘플링이나 언더샘플링, class_weight를 사용하는 기법을 통해 클래스의 불균형을 해소할 필요가 있다.

## 오버샘플링 #1 SMOTE

In [None]:
# Oversample the training set with SMOTE (fixed seed, k_neighbors=3); the resampled
# data overwrites X_train / Y_train, so the test set is left untouched.
smote = SMOTE(random_state=2, k_neighbors=3)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

## 오버샘플링 #2 RandomOverSampling

In [None]:
# Oversample the training set with RandomOverSampler (fixed seed); the resampled
# data overwrites X_train / Y_train, so the test set is left untouched.
randomOver = RandomOverSampler(random_state=0)
X_train, Y_train = randomOver.fit_resample(X_train, Y_train)

## 오버샘플링 #3 SMOTENC

In [None]:
# SMOTENC needs the positions of the categorical (binary) columns. Derive them
# from the column names instead of assuming the three numeric features are
# still the first three columns, because correlation pruning may have removed one.
numeric_features = ("feature_1", "feature_2", "feature_3")
categorical_positions = [pos for pos, col in enumerate(X_train.columns)
                         if col not in numeric_features]

# `n_jobs` is deprecated for the SMOTE family in imbalanced-learn, so it is not passed.
smotenc = SMOTENC(random_state=2, categorical_features=categorical_positions, k_neighbors=3)
X_train, Y_train = smotenc.fit_resample(X_train, Y_train)

## 오버샘플링 #4 BorderlineSMOTE

In [None]:
# Oversample the training set with BorderlineSMOTE (fixed seed, k_neighbors=3); the
# resampled data overwrites X_train / Y_train, so the test set is left untouched.
borderSmote = BorderlineSMOTE(random_state=2, k_neighbors=3)
X_train, Y_train = borderSmote.fit_resample(X_train, Y_train)

## 오버샘플링 #5 ADASYN

In [None]:
# Oversample the training set with ADASYN (fixed seed). `n_jobs` is deprecated for
# ADASYN in imbalanced-learn, so it is not passed; it only controlled parallelism.
adasyn = ADASYN(random_state=2)
X_train, Y_train = adasyn.fit_resample(X_train, Y_train)

In [None]:
# Y 밸런스 확인
Y_train.value_counts()

## 오버샘플링 진행 후 단일 값을 가진 피쳐 제거

In [None]:
# Columns holding a single value in the resampled training set are removed from both sets.
single_valued = [col for col in X_train.columns if len(X_train[col].unique()) == 1]
X_train.drop(columns=single_valued, inplace=True)
X_test.drop(columns=single_valued, inplace=True)

In [None]:
# 감소한 피쳐 수 확인
print(X_train.shape, X_test.shape)

In [None]:
# 오버샘플링 진행 후 중복 데이터 제거
#X_train.drop_duplicates(keep='first', inplace=True)
#Y_train = Y_train.loc[X_train.index]
#X_train.reset_index(drop=True, inplace=True)
#Y_train.reset_index(drop=True, inplace=True)

In [None]:
# Y 밸런스 확인
Y_train.value_counts()

# 모델 학습

## class_weight
- 클래스 불균형을 해소하기위한 기법 중 하나로 클래스 비율에 따라 모델 손실 함수에 가중치를 부여하여 모델이 소수 클래스에 더 많은 '관심'을 갖도록 한다.
- 클래스 비율을 구하는 공식은 구글 텐서플로우 공식 레퍼런스에 소개되어 있다.
https://www.tensorflow.org/tutorials/structured_data/imbalanced_data?hl=ko

In [None]:
# weight_c = (1 / n_c) * (n_total / 2): each class contributes equally to the loss.
class_counts = Y_train.value_counts()
half_total = class_counts.sum() / 2.0

weight_0 = (1 / class_counts[0]) * half_total
weight_1 = (1 / class_counts[1]) * half_total
class_weight = {0 : weight_0, 1 : weight_1}
print(class_weight)

## Keras model history callback class
- 케라스 모델 학습 시 한 epoch가 끝날 때 계산된 loss와 accuracy를 저장한다.

In [None]:
class ModelHistory(callbacks.Callback):
    """Keras callback that records loss and accuracy at the end of every epoch.

    The values are accumulated in ``self.history`` under the keys ``loss``,
    ``binary_accuracy``, ``val_loss`` and ``val_binary_accuracy``. The same
    instance keeps appending if it is passed to several ``fit`` calls.
    """

    def __init__(self, logs=None):
        # `logs` is never used; it is kept only so existing call sites stay valid.
        super().__init__()
        self.history = {'loss' : [], 'binary_accuracy' : [], 'val_loss' : [], 'val_binary_accuracy' : []}

    def on_epoch_end(self, batch, logs=None):
        """Append the tracked metrics reported for the epoch that just finished.

        Args:
            batch: Epoch index passed by Keras (the name is kept for compatibility).
            logs: Metric dictionary for the epoch; may be ``None``.
        """
        logs = logs or {}
        for key, values in self.history.items():
            # Skip metrics that were not reported (e.g. no validation data)
            # instead of failing with a KeyError in the middle of training.
            if key in logs:
                values.append(logs[key])

## 모델 학습
- 모든 모델은 random_state(또는 seed)를 10부터 100까지 10 단위로 바꿔 가며 총 10개를 학습하고, 각 모델의 ROC-AUC score를 계산하여 평균을 구하였다.

# Keras Logistic regression

In [None]:
# Train one single-unit sigmoid model per TensorFlow seed and score it on the test set.
seeds = range(10, 101, 10)
n_splits = 5

AUCScores_log = np.zeros(len(seeds))
model_log = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    tf.random.set_seed(seed)
    model = models.Sequential(name=f"Keras_log_Random_state_{seed}")
    model.add(layers.Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],),
                           kernel_regularizer=regularizers.l2(0.0001)))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
    model_log[slot] = model

    kfold = StratifiedKFold(n_splits=n_splits)
    # One history object per seed; it accumulates epochs across all folds.
    modelHistory = ModelHistory()
    cb = [callbacks.EarlyStopping(monitor='val_loss', patience=10, min_delta=0.01),
          modelHistory]
    with tqdm(total=n_splits, ascii=True) as pbar:
        # The same model keeps training from fold to fold (weights are not reset).
        for train_index, val_index in kfold.split(X_train, Y_train):
            # The splitter yields positional indices, so select rows with .iloc.
            X_train_log, X_val_log = X_train.iloc[train_index], X_train.iloc[val_index]
            Y_train_log, Y_val_log = Y_train.iloc[train_index], Y_train.iloc[val_index]
            # `use_multiprocessing` / `workers` only affect generator input and are
            # rejected by newer Keras releases, so they are not passed.
            model.fit(X_train_log, Y_train_log, validation_data=(X_val_log, Y_val_log),
                      epochs=50, verbose=0, callbacks=cb, class_weight=class_weight)
            pbar.update(1)

    model_log_pred = model.predict(X_test)
    AUCScores_log[slot] = roc_auc_score(Y_test, model_log_pred)

print(AUCScores_log.mean().round(8))

In [None]:
# Loss (left axis) and accuracy (right axis) recorded by the most recent ModelHistory.
fig, loss_ax = plt.subplots()
acc_ax = loss_ax.twinx()
history = modelHistory.history

for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'g', 'val loss')):
    loss_ax.plot(history[key], fmt, label=label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='upper left')

for key, fmt, label in (('binary_accuracy', 'b', 'train acc'), ('val_binary_accuracy', 'r', 'val acc')):
    acc_ax.plot(history[key], fmt, label=label)
acc_ax.set_ylabel('accuracy')
acc_ax.legend(loc='lower left')

plt.show()

# Keras MLP

In [None]:
# Train one single-hidden-layer MLP per TensorFlow seed and score it on the test set.
seeds = range(10, 101, 10)
n_splits = 5

AUCScores_mlp = np.zeros(len(seeds))
model_mlp = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    tf.random.set_seed(seed)
    model = models.Sequential(name=f"Keras_mlp_Random_state_{seed}")
    model.add(layers.Dense(100, activation='relu', input_shape=(X_train.shape[1],),
                           kernel_regularizer=regularizers.l2(0.0001)))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
    model_mlp[slot] = model

    kfold = StratifiedKFold(n_splits=n_splits)
    # One history object per seed; it accumulates epochs across all folds.
    modelHistory = ModelHistory()
    cb = [callbacks.EarlyStopping(monitor='val_loss', patience=10, min_delta=0.01),
          modelHistory]
    with tqdm(total=n_splits, ascii=True) as pbar:
        # The same model keeps training from fold to fold (weights are not reset).
        for train_index, val_index in kfold.split(X_train, Y_train):
            # The splitter yields positional indices, so select rows with .iloc.
            X_train_mlp, X_val_mlp = X_train.iloc[train_index], X_train.iloc[val_index]
            Y_train_mlp, Y_val_mlp = Y_train.iloc[train_index], Y_train.iloc[val_index]
            # `use_multiprocessing` / `workers` only affect generator input and are
            # rejected by newer Keras releases, so they are not passed.
            model.fit(X_train_mlp, Y_train_mlp, validation_data=(X_val_mlp, Y_val_mlp),
                      epochs=50, verbose=0, callbacks=cb, class_weight=class_weight)
            pbar.update(1)

    model_mlp_pred = model.predict(X_test)
    AUCScores_mlp[slot] = roc_auc_score(Y_test, model_mlp_pred)

print(AUCScores_mlp.mean().round(3))

In [None]:
# Loss (left axis) and accuracy (right axis) recorded by the most recent ModelHistory.
fig, loss_ax = plt.subplots()
acc_ax = loss_ax.twinx()
history = modelHistory.history

for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'g', 'val loss')):
    loss_ax.plot(history[key], fmt, label=label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='upper left')

for key, fmt, label in (('binary_accuracy', 'b', 'train acc'), ('val_binary_accuracy', 'r', 'val acc')):
    acc_ax.plot(history[key], fmt, label=label)
acc_ax.set_ylabel('accuracy')
acc_ax.legend(loc='lower left')

plt.show()

# Keras DNN

In [None]:
# Train one 7-hidden-layer network per TensorFlow seed and score it on the test set.
seeds = range(10, 101, 10)
n_splits = 5
n_hidden_layers = 7

AUCScores_dnn = np.zeros(len(seeds))
model_dnn = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    tf.random.set_seed(seed)
    model = models.Sequential()
    model.add(layers.Dense(100, activation='relu', input_shape=(X_train.shape[1],)))
    for _ in range(n_hidden_layers - 1):
        model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
    model_dnn[slot] = model

    kfold = StratifiedKFold(n_splits=n_splits)
    # One history object per seed; it accumulates epochs across all folds.
    modelHistory = ModelHistory()
    cb = [callbacks.EarlyStopping(monitor='val_loss', patience=10, min_delta=0.01),
          modelHistory]
    with tqdm(total=n_splits, ascii=True) as pbar:
        # The same model keeps training from fold to fold (weights are not reset).
        for train_index, val_index in kfold.split(X_train, Y_train):
            # The splitter yields positional indices, so select rows with .iloc.
            X_train_dnn, X_val_dnn = X_train.iloc[train_index], X_train.iloc[val_index]
            Y_train_dnn, Y_val_dnn = Y_train.iloc[train_index], Y_train.iloc[val_index]
            # `use_multiprocessing` / `workers` only affect generator input and are
            # rejected by newer Keras releases, so they are not passed.
            model.fit(X_train_dnn, Y_train_dnn, validation_data=(X_val_dnn, Y_val_dnn),
                      epochs=50, verbose=0, callbacks=cb, class_weight=class_weight)
            pbar.update(1)

    model_dnn_pred = model.predict(X_test)
    AUCScores_dnn[slot] = roc_auc_score(Y_test, model_dnn_pred)

print(AUCScores_dnn.mean().round(3))

In [None]:
# Loss (left axis) and accuracy (right axis) recorded by the most recent ModelHistory.
fig, loss_ax = plt.subplots()
acc_ax = loss_ax.twinx()
history = modelHistory.history

for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'g', 'val loss')):
    loss_ax.plot(history[key], fmt, label=label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='upper left')

for key, fmt, label in (('binary_accuracy', 'b', 'train acc'), ('val_binary_accuracy', 'r', 'val acc')):
    acc_ax.plot(history[key], fmt, label=label)
acc_ax.set_ylabel('accuracy')
acc_ax.legend(loc='lower left')

plt.show()

# Sklearn Logistic regression

In [None]:
# Fit one logistic regression per random_state and score it on the test set.
seeds = range(10, 101, 10)

AUCScores_sk_log = np.zeros(len(seeds))
sk_log = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    clf = LogisticRegression(random_state=seed, solver='liblinear', class_weight=class_weight)
    clf.fit(X_train, Y_train)
    sk_log[slot] = clf
    # ROC-AUC needs a ranking score, not hard 0/1 labels: use the predicted
    # probability of class 1 (the same kind of output the Keras models are scored on).
    sk_log_pred = clf.predict_proba(X_test)[:, 1]
    AUCScores_sk_log[slot] = roc_auc_score(Y_test, sk_log_pred)

print(AUCScores_sk_log.mean().round(3))

# Sklearn Decision Tree

In [None]:
# Fit one decision tree per random_state and score it on the test set.
seeds = range(10, 101, 10)

AUCScores_sk_tree = np.zeros(len(seeds))
sk_tree = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    clf = DecisionTreeClassifier(random_state=seed, max_depth=30, class_weight=class_weight)
    clf.fit(X_train, Y_train)
    sk_tree[slot] = clf
    # ROC-AUC needs a ranking score, not hard 0/1 labels: use the predicted
    # probability of class 1 (the same kind of output the Keras models are scored on).
    sk_tree_pred = clf.predict_proba(X_test)[:, 1]
    AUCScores_sk_tree[slot] = roc_auc_score(Y_test, sk_tree_pred)

print(AUCScores_sk_tree.mean().round(3))

# Sklearn RandomForest

In [None]:
# Fit one random forest per random_state and score it on the test set.
seeds = range(10, 101, 10)

AUCScores_sk_rf = np.zeros(len(seeds))
sk_rf = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    clf = RandomForestClassifier(random_state=seed, n_jobs=-1, n_estimators=10, class_weight=class_weight)
    clf.fit(X_train, Y_train)
    sk_rf[slot] = clf
    # ROC-AUC needs a ranking score, not hard 0/1 labels: use the predicted
    # probability of class 1 (the same kind of output the Keras models are scored on).
    sk_rf_pred = clf.predict_proba(X_test)[:, 1]
    AUCScores_sk_rf[slot] = roc_auc_score(Y_test, sk_rf_pred)

print(AUCScores_sk_rf.mean().round(3))

# LightGBM

In [None]:
# Fit one LightGBM classifier per random_state and score it on the test set.
seeds = range(10, 101, 10)

AUCScores_lgbm = np.zeros(len(seeds))
lgbm = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    clf = LGBMClassifier(n_estimators=300, num_leaves=100, n_jobs=-1,
                         boost_from_average=False, random_state=seed, class_weight=class_weight)
    clf.fit(X_train, Y_train)
    lgbm[slot] = clf
    # ROC-AUC needs a ranking score, not hard 0/1 labels: use the predicted
    # probability of class 1 (the same kind of output the Keras models are scored on).
    lgbm_pred = clf.predict_proba(X_test)[:, 1]
    AUCScores_lgbm[slot] = roc_auc_score(Y_test, lgbm_pred)

print(AUCScores_lgbm.mean().round(3))

# XGBoost

In [None]:
# Fit one XGBoost classifier per random_state and score it on the test set.
seeds = range(10, 101, 10)

# scale_pos_weight is meant to be (#negative / #positive) so that the positive
# class (1 = defective) is up-weighted when it is the minority. The previous
# ratio was inverted, which down-weighted the minority class instead.
class_counts = Y_train.value_counts()
neg_pos_ratio = class_counts[0] / class_counts[1]

AUCScores_xgb = np.zeros(len(seeds))
xgb = [None] * len(seeds)
for slot, seed in enumerate(seeds):
    # NOTE(review): max_depth=0 looks like "no depth limit" in XGBoost and may not be
    # accepted by every tree method — confirm this is intended.
    clf = XGBClassifier(random_state=seed, eta=0.2, max_depth=0, min_child_weight=1,
                        n_estimators=500, objective='binary:logistic',
                        scale_pos_weight=neg_pos_ratio)
    clf.fit(X_train, Y_train)
    xgb[slot] = clf
    # ROC-AUC needs a ranking score, not hard 0/1 labels: use the predicted
    # probability of class 1 (the same kind of output the Keras models are scored on).
    xgb_pred = clf.predict_proba(X_test)[:, 1]
    AUCScores_xgb[slot] = roc_auc_score(Y_test, xgb_pred)

print(AUCScores_xgb.mean().round(3))

# Summary

In [None]:
# Accuracy / precision / recall / F1 come from the last-seed predictions of each model;
# the ROC-AUC column is then overwritten with the mean over all ten seeds.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC-AUC']

# (row label, last-seed predictions, per-seed AUC array) in display order.
model_outputs = [
    ('Keras MLP', model_mlp_pred, AUCScores_mlp),
    ('Keras DNN', model_dnn_pred, AUCScores_dnn),
    ('Keras logistic regression', model_log_pred, AUCScores_log),
    ('Sklearn logistic regression', sk_log_pred, AUCScores_sk_log),
    ('Sklearn Decision Tree', sk_tree_pred, AUCScores_sk_tree),
    ('Sklearn RandomForest', sk_rf_pred, AUCScores_sk_rf),
    ('LightGBM', lgbm_pred, AUCScores_lgbm),
    ('XGBoost', xgb_pred, AUCScores_xgb),
]

results = pd.DataFrame(
    [get_clf_eval(Y_test, pred) for _, pred, _ in model_outputs],
    index=[label for label, _, _ in model_outputs],
    columns=metric_names,
)
for label, _, auc_scores in model_outputs:
    results.loc[label, 'ROC-AUC'] = auc_scores.mean().round(8)

results.sort_values(by=['ROC-AUC'], axis=0, ascending=False)