# Optuna를 이용해서 parameter tuning하기

In [1]:
# Import Module

import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import optuna

In [2]:
# Useful Variable

random_state = 475

info = ["LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE"]
delay_n = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
bill_n = ["BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6"]
pay_n = ["PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]

In [3]:
train = pd.read_csv("C:/Users/chowonjae/Desktop/내부 프로젝트/uci_creditcard-train-0.0-0.0 (1).csv")
test = pd.read_csv("C:/Users/chowonjae/Desktop/내부 프로젝트/uci_creditcard-test-0.0-0.0 (1).csv")

length = len(train)

#Change the column name "default payment next month" -> "default"
train = train.rename(columns = {"default payment next month":"default"})
test = test.rename(columns = {"default payment next month":"default"})

train = train.drop(["ID","sep_idx"], axis = 1)
train_drop_info = train.drop(info, axis = 1)

test = test.drop(["ID","sep_idx"], axis = 1)

# Change Type

train["SEX"] = train["SEX"].astype(np.int)
train["EDUCATION"] = train["EDUCATION"].astype(np.int)
train["MARRIAGE"] = train["MARRIAGE"].astype(np.int)
train["AGE"] = train["AGE"].astype(np.int)
train["default"] = train["default"].astype(np.int)
train[delay_n] = train[delay_n].astype(np.int)

test["SEX"] = test["SEX"].astype(np.int)
test["EDUCATION"] = test["EDUCATION"].astype(np.int)
test["MARRIAGE"] = test["MARRIAGE"].astype(np.int)
test["AGE"] = test["AGE"].astype(np.int)
test["default"] = test["default"].astype(np.int)
test[delay_n] = test[delay_n].astype(np.int)

In [4]:
X_train = train.drop("default", axis = 1)
y_train = train["default"]

X_test = test.drop("default", axis = 1)
y_test = test["default"]

# Function

In [5]:
def f1_score(y_true, y_pred):
    
    true_positive_mask = np.logical_and((y_true == 1), (y_pred == 1))
    true_negative_mask = np.logical_and((y_true == 0), (y_pred == 0))
    condition_positive = (y_true == 1)
    predicted_positive = (y_pred == 1)
    
    precision = np.sum(true_positive_mask) / np.sum(predicted_positive)
    recall = np.sum(true_positive_mask) / np.sum(condition_positive)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    
    acc = (np.sum(true_positive_mask) + np.sum(true_negative_mask)) / len(y_true)
    
    return precision, recall, f1, acc

In [6]:
apr = ["PAY_6", "BILL_AMT6", "PAY_AMT6"]
may = ["PAY_5", "BILL_AMT5", "PAY_AMT5"]
jun = ["PAY_4", "BILL_AMT4", "PAY_AMT4"]
jul = ["PAY_3", "BILL_AMT3", "PAY_AMT3"]
aug = ["PAY_2", "BILL_AMT2", "PAY_AMT2"]
sep = ["PAY_0", "BILL_AMT1", "PAY_AMT1"]
month_list = [apr, may, jun, jul, aug, sep]

def LSTM_Input(df):
    
    total = len(df) * 6 * 3 # 숫자 총 갯수
    df_preprocessing = np.array([.0] * total).reshape(-1, 6 ,3) # 처리한 데이터 저장할 array
    df_unit = np.array([.0] * (18)).reshape(6, 3)
    for i in range(len(df)):
        X_df_i = df.loc[i]
        for j in range(6):
            df_unit[j] = df.loc[i][month_list[j]].values
        df_preprocessing[i] = df_unit
    
    return df_preprocessing

def preprocessing(df, test = False):
    
    if not test:
        y = df["default"]

        one_hot = np.unique([0, 1]).shape[0]
        y_preprocessing = np.eye(one_hot)[y.to_numpy()].reshape(-1, 2)

    ##MinMaxScaling
    from sklearn.preprocessing import MinMaxScaler

    mms = MinMaxScaler()
    mms.fit(df[["LIMIT_BAL"] + bill_n + pay_n])

    X_scaling = mms.transform(df[["LIMIT_BAL"] + bill_n + pay_n])

    X_scaling_df = pd.DataFrame(X_scaling, columns = ["LIMIT_BAL"] + bill_n + pay_n)

    for delay in delay_n:    
        X_scaling_df[delay] = df[delay].reset_index(drop = True)

    X_add_feature = df[info]
    X_add_feature.loc[:,"LIMIT_BAL"] = X_scaling_df["LIMIT_BAL"].copy()
    X_add_feature = X_add_feature.to_numpy()

    X_preprocessing = LSTM_Input(X_scaling_df)
    
    if test:
        return X_preprocessing, X_add_feature
    
    y = df["default"]

    one_hot = np.unique([0, 1]).shape[0]
    y_preprocessing = np.eye(one_hot)[y.to_numpy()].reshape(-1, 2)
        
    return X_preprocessing, X_add_feature, y, y_preprocessing

In [7]:
def load_sampling(train, proportion, size = None, return_default = False):
    
    if return_default:
        return train
    
    if isinstance(size, int) or isinstance(size, str):
        
        if size == "same":
            size = train[train['default'] == 0].shape[0]
        
        succeed_sample = train[train['default'] == 0].shape[0]
        succeed = train[train['default'] == 0].reset_index(drop = True)

        default_sample = train[train['default'] == 1].shape[0]
        idx = np.random.randint(0, default_sample, size)
        default = train[train['default'] == 1].reset_index(drop = True).iloc[idx]
        
        df = pd.concat([succeed, default])
        
        return df.reset_index(drop = True)
    
    
    succeed_sample = train[train['default'] == 0].shape[0]
    succeed = train[train['default'] == 0].reset_index(drop = True)

    default_sample = train[train['default'] == 1].shape[0]
    idx = np.random.randint(0, default_sample, int(len(train) * proportion))
    default = train[train['default'] == 1].reset_index(drop = True).iloc[idx]

    df = pd.concat([succeed, default])
    
    return df.reset_index(drop = True)

In [8]:
train_df, val_df = train_test_split(train, test_size = 0.2, stratify = y_train)
train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)

X_train_preprocessing, X_train_add_feature, y_train_before, y_train_preprocessing = preprocessing(train_df)
X_val_preprocessing, X_val_add_feature, y_val_before, y_val_preprocessing = preprocessing(val_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [9]:
train_list = [X_train_preprocessing, X_train_add_feature, y_train_before, y_train_preprocessing]
val_list = [X_val_preprocessing, X_val_add_feature, y_val_before, y_val_preprocessing]

In [10]:
with open('D:/원영/Habit_project/train_test_split/train.pkl', 'wb') as f:
    pickle.dump(train_list, f)
    
with open('D:/원영/Habit_project/train_test_split/val.pkl', 'wb') as f:
    pickle.dump(val_list, f)

In [29]:
def load_data():
    with open('D:/원영/Habit_project/train_test_split/train.pkl', 'rb') as f:
        train_list = pickle.load(f)
        
    with open('D:/원영/Habit_project/train_test_split/val.pkl', 'rb') as f:
        val_list = pickle.load(f)
    
    return train_list, val_list

In [34]:
def load_skf_data():
    
    with open('D:/원영/Habit_project/Sampling_pkl/skf_sampling_train_list.pkl', 'rb') as f:
        skf_oversam_train_list = pickle.load(f)
        
    with open('D:/원영/Habit_project/Sampling_pkl/skf_sampling_val_list.pkl', 'rb') as f:
        skf_oversam_val_list = pickle.load(f)
        
    for i in range(5):
        skf_oversam_val_list[i][0][1] =  skf_oversam_val_list[i][0][1].to_numpy()
        for j in range(6):
            skf_oversam_train_list[i][j][1] =  skf_oversam_train_list[i][j][1].to_numpy()
    
    return skf_oversam_train_list, skf_oversam_val_list

# Model

In [40]:
shape = (6,3)

In [41]:
CLASSES = 2
EPOCHS = 100

In [44]:
## For tuning hyperparameter

def optunaModel(trial):
    # Input layer
    input_layer = layers.Input(shape = shape, name = "input1")
    add_feature = layers.Input(shape = (5), name = "input2")
    
    #n_layers = trial.suggest_int("n_layers", 0, 1)
    n_layers = 0
    if n_layers == 0:
        lstm = layers.LSTM(3)(input_layer)
    else:
        lstm = layers.LSTM(3, return_sequences = True)(input_layer)
        for i in range(n_layers):
            if i == n_layers - 1:
                lstm = layers.LSTM(3)(lstm)
            else:
                lstm = layers.LSTM(3, return_sequences = True)(lstm)

    # Add features
    add = layers.Concatenate()([lstm, add_feature])
    
    num_hidden = trial.suggest_int("n_units", 64, 512, log=True)
    num_rate = trial.suggest_int("dropout_rate", 3, 5)
    # output layer
    fc1 = layers.Dense(num_hidden, activation = "relu")(add)
    do1 = layers.Dropout(num_rate * 0.1)(fc1)
    fc2 = layers.Dense(num_hidden, activation = "relu")(do1)
    do2 = layers.Dropout(num_rate * 0.1)(fc2)
    output = layers.Dense(CLASSES, activation = 'sigmoid')(do2)  ## linear output
    
    model = keras.Model(inputs = [input_layer, add_feature], outputs = output)
    
    return model

def create_optimizer(trial):
    # We optimize the choice of optimizers as well as their parameters.
    kwargs = {}
    optimizer_options = ["Adam"]
    optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options)
    if optimizer_selected == "Adam":
        kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True)

    optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)
    return optimizer

def learn(model, optimizer, dataset, mode="eval"):
    binary_crossentropy = tf.metrics.BinaryCrossentropy("binary_crossentropy'", dtype=tf.float32)

    for batch, (features, labels) in enumerate(dataset):
        with tf.GradientTape() as tape:
            y_pred = model(features, training=(mode == "train"))
            loss_value = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(labels, y_pred, from_logits=False))
            if mode == "eval":
                binary_crossentropy(labels, y_pred)
            else:
                grads = tape.gradient(loss_value, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))

    if mode == "eval":
        return binary_crossentropy
    
    
def get_data(trial):
    train_list, val_list = load_skf_data()
    
    skf_train_list_1 = train_list[0][0]
    skf_val_list_1 = val_list[0][0]
    
    BATCHSIZE = trial.suggest_int("batchsize", 32, 256,  log=True)
    
    train_ds = tf.data.Dataset.from_tensor_slices(({'input1': skf_train_list_1[0], 'input2': skf_train_list_1[1]}, skf_train_list_1[3]))
    train_ds = train_ds.shuffle(len(skf_train_list_1[0])).batch(BATCHSIZE)

    valid_ds = tf.data.Dataset.from_tensor_slices(({'input1': skf_val_list_1[0], 'input2': skf_val_list_1[1]}, skf_val_list_1[3]))
    valid_ds = valid_ds.shuffle(len(skf_val_list_1[0])).batch(BATCHSIZE)
    return train_ds, valid_ds

def objective(trial):
    # Get data.
    train_ds, valid_ds = get_data(trial)

    # Build model and optimizer.
    model = optunaModel(trial)
    optimizer = create_optimizer(trial)

    # Training and validating cycle.
    with tf.device("/cpu:0"):
        for _ in range(EPOCHS):
            learn(model, optimizer, train_ds, "train")

        binary_crossentropy = learn(model, optimizer, valid_ds, "eval")

    # Return last validation accuracy.
    return binary_crossentropy.result()

In [45]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials = 15)

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-11-23 13:15:26,203][0m A new study created in memory with name: no-name-604f6adc-b7b9-4e7b-8eaf-7dfae52a2038[0m
[32m[I 2020-11-23 13:31:12,039][0m Trial 0 finished with value: 7.666619300842285 and parameters: {'batchsize': 62, 'n_units': 375, 'dropout_rate': 5, 'optimizer': 'Adam', 'adam_learning_rate': 0.045779610539576604}. Best is trial 0 with value: 7.666619300842285.[0m
[32m[I 2020-11-23 13:45:49,289][0m Trial 1 finished with value: 7.666621208190918 and parameters: {'batchsize': 58, 'n_units': 261, 'dropout_rate': 5, 'optimizer': 'Adam', 'adam_learning_rate': 0.0007419912844442669}. Best is trial 0 with value: 7.666619300842285.[0m
[32m[I 2020-11-23 13:55:43,939][0m Trial 2 finished with value: 0.5376139283180237 and parameters: {'batchsize': 91, 'n_units': 130, 'dropout_rate': 3, 'optimizer': 'Adam', 'adam_learning_rate': 9.863877842868509e-05}. Best is trial 2 with value: 0.5376139283180237.[0m
[32m[I 2020-11-23 13:59:54,025][0m Trial 3 finished with 

Number of finished trials:  15
Best trial:
  Value:  0.5109953880310059
  Params: 
    batchsize: 33
    n_units: 235
    dropout_rate: 3
    optimizer: Adam
    adam_learning_rate: 8.119477310627347e-05
