In [29]:
import numpy as np
import pandas as pd
import json

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.metrics import explained_variance_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

# ignore Warnings
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

In [25]:
train_df = pd.read_csv("train_change.csv", encoding='euc-kr')

In [26]:
train_df2 = train_df.drop(['Unnamed: 0', '분석데이터'], axis=1)

In [27]:
data_copy = train_df2.copy()

In [28]:
def remove_outlier_0(df, column):
    fraud_column_data = df[df['label']==0][column]
    quantile_25 = np.percentile(fraud_column_data.values, 25)
    quantile_75 = np.percentile(fraud_column_data.values, 75)
    
    IQR = quantile_75 - quantile_25
    IQR_weight = IQR * 1.5
    
    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight
    
    outlier_idx = fraud_column_data[ (fraud_column_data < lowest) | (fraud_column_data > highest) ].index
    print(len(outlier_idx))
    df.drop(outlier_idx, axis=0, inplace=True)
    print(df.shape)
    return df

In [7]:
train_df_y = data_copy['label']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(data_copy, train_df_y, test_size=0.2, random_state=42)

In [18]:
def objective(trial: Trial) -> float:
    params_lgb = {
        "verbosity": -1,
        "learning_rate": 0.01,
        "n_estimators": 10000,
        "objective": "binary",
        "metric": "binary_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 40),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    x_train, x_test, y_train, y_test = train_test_split(data_copy, train_df_y, test_size=0.2, random_state=42)
    x_train = x_train.drop(['label'], axis=1)
    x_test = x_test.drop(['label'], axis=1)

    
    model = lgb.LGBMClassifier(**params_lgb)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_test, y_test)],
        early_stopping_rounds=100,
        verbose=False,
    )

    lgb_pred = model.predict_proba(x_test)
    log_score = log_loss(y_test, lgb_pred)
    
    return log_score

In [19]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-10-10 20:28:01,930][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2021-10-10 20:28:28,654][0m Trial 0 finished with value: 0.18803987012732024 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 30, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.18803987012732024.[0m
[32m[I 2021-10-10 20:29:09,718][0m Trial 1 finished with value: 0.18302181820202978 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 39, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 1 with value: 0.18302181820202978.[0m
[32m[I 2021-10-10 20:29:38,657][0m Trial 2 finished with value: 0.18141440357018968 and paramet

Best Score: 0.17483358170286697
Best trial: {'reg_alpha': 1.987904330777592e-05, 'reg_lambda': 0.028054003730936226, 'max_depth': 21, 'num_leaves': 141, 'colsample_bytree': 0.5109126733153162, 'subsample': 0.9787092394351908, 'subsample_freq': 8, 'min_child_samples': 95, 'max_bin': 469}


In [20]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

Accuracy: 0.17483358170286697


In [62]:
'''lgbm_model = lgb.LGBMClassifier(
        learning_rate = 0.01,
        max_depth = -1,
        boosting_type = 'gbdt',
        objective = 'binary',
        metric = 'binary_logloss',
        is_training_metric = True,
        num_leaves = 40,
        min_child_samples = 20,
        colsample_bytree = 0.9,
        subsample = 0.8,
        subsample_freq = 5,
        seed = 42)'''

In [63]:
'''lgbm_model_cv = lgb.LGBMClassifier()'''

In [64]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [82]:
'''lgbm_param_grid = {'max_depth': [-1, 3, 5, 7, 9],
                  'subsample': [0.6, 0.8, 1.0],
                  'subsample_freq' : [0,1,3,5],
                  'min_child_weight': [1, 3, 5, 10],
                  'colsample_bytree': [0.6, 0.8,0.9, 1.0],
                  }

hr_grid = GridSearchCV(estimator=lgbm_model,
                       param_grid=lgbm_param_grid,
                       scoring='roc_auc',
                       n_jobs=-1,
                       cv=3,
                       refit=True, 
                       return_train_score=True)

start_time = timer(None)
hr_grid.fit(x_train, y_train)
timer(start_time)'''


 Time taken: 3 hours 12 minutes and 18.82 seconds.


In [None]:
## 최고성능
best_score = hr_grid.best_score_
# 최고성능을 내는 행을 찾아냄
best_row = hr_grid.best_index_

# 최적 초모수: max_depth, subsample
best_max_depth = hr_grid.best_params_["max_depth"]
best_max_subsample = hr_grid.best_params_["subsample"]
best_min_child_weight = hr_grid.best_params_["min_child_weight"]
best_colsample_bytree = hr_grid.best_params_["colsample_bytree"]
best_gamma = hr_grid.best_params_["gamma"]


nl = '\n'
print(f'예측모형성능(AUC):  \t {best_score:.3f}{nl}\
        인덱스:           \t {best_row}{nl}\
        max_depth:      \t {best_max_depth}{nl}\
        subsample:      \t {best_max_subsample}{nl}\
        colsample_bytree:      \t {best_colsample_bytree}{nl}\
        min_child_weight:      \t {best_min_child_weight}{nl}\
        gamma:      \t {best_gamma}')

In [83]:
hr_grid_df = pd.DataFrame(hr_grid.cv_results_)
hr_grid_df.loc[:, ['mean_test_score', "params"]]

Unnamed: 0,mean_test_score,params
0,0.991220,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
1,0.990361,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
2,0.990773,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
3,0.990728,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
4,0.991220,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
...,...,...
4795,0.988314,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
4796,0.988595,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
4797,0.988595,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
4798,0.988595,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."


In [84]:
hr_grid_df[hr_grid_df['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_subsample,param_subsample_freq,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
5,18.285647,0.709245,0.119361,0.010211,0.6,0.5,-1,1,0.8,1,...,0.992577,0.990481,0.991327,0.000902,1,0.999798,0.999787,0.999854,0.999813,2.9e-05
245,19.224772,0.487691,0.11569,0.010343,0.6,1.0,-1,1,0.8,1,...,0.992577,0.990481,0.991327,0.000902,1,0.999798,0.999787,0.999854,0.999813,2.9e-05
485,18.8757,0.351941,0.114023,0.015749,0.6,1.5,-1,1,0.8,1,...,0.992577,0.990481,0.991327,0.000902,1,0.999798,0.999787,0.999854,0.999813,2.9e-05
725,18.986684,0.18274,0.118696,0.003859,0.6,2.0,-1,1,0.8,1,...,0.992577,0.990481,0.991327,0.000902,1,0.999798,0.999787,0.999854,0.999813,2.9e-05
965,19.282173,0.176566,0.121694,0.012714,0.6,5.0,-1,1,0.8,1,...,0.992577,0.990481,0.991327,0.000902,1,0.999798,0.999787,0.999854,0.999813,2.9e-05


In [85]:
pred = hr_grid.predict(x_test)
accuracy_score(y_test, pred)

0.9732847601700061

In [77]:
y_pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [21]:
train_ds = lgb.Dataset(x_train, label = y_train)
test_ds = lgb.Dataset(x_test, label = y_test)

params = {'learning_rate':0.01,
         'max_depth':-1,
         'boosting':'gbdt',
         'objective': 'binary',
         'metric': 'binary_logloss',
         'is_training_metric': True,
         'num_leaves':40,
         'min_data_in_leaf':20,
         'feature_fraction':0.9,
         'bagging_fraction':0.7,
         'bagging_freq':5,
         'seed':42}

model = lgb.train(params, train_ds, 1000, test_ds, verbose_eval=100, early_stopping_rounds=100)
y_pred = model.predict(x_test)

[LightGBM] [Info] Number of positive: 4400, number of negative: 3600
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151569
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 617
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550000 -> initscore=0.200671
[LightGBM] [Info] Start training from score 0.200671
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.208154
[200]	valid_0's binary_logloss: 0.0760582
[300]	valid_0's binary_logloss: 0.0287798
[400]	valid_0's binary_logloss: 0.0109601
[500]	valid_0's binary_logloss: 0.00420341
[600]	valid_0's binary_logloss: 0.00168517
[700]	valid_0's binary_logloss: 0.00065506
[800]	valid_0's binary_logloss: 0.000251634
[900]	valid_0's binary_logloss: 9.87153e-05
[1000]	valid_0's binary_logloss: 3.82843e-05
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 3.82843e-05


In [22]:
for i in range(len(y_pred)):
    if y_pred[i]>=.5:
         y_pred[i]=1
    else:
        y_pred[i]=0

In [30]:
accuracy = accuracy_score(y_pred, y_test)

accuracy

1.0