In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Config

In [None]:
# Path of the preprocessed CSV that load_data reads.
csv_path = '../csv/preprocessed/7_label_blood_sugar.csv'

# Columns used as model inputs.
input_features = [
    '총콜레스테롤',
    '트리글리세라이드',
    'HDL콜레스테롤',
    'LDL콜레스테롤',
    '혈색소',
    '(혈청지오티)AST',
    '(혈청지오티)ALT',
    '감마지티피',
    'BMI',
]
# Column used as the prediction target.
output_features = ['식전혈당(공복혈당)']

# Estimator handed to the random search, created with the library's default settings.
model = RandomForestClassifier()

# Search space for the random search: each entry is a discrete uniform
# distribution over the integers low .. high - 1.
param_distributions = dict(
    n_estimators=stats.randint(10, 100),
    max_depth=stats.randint(4, 10),
    min_samples_leaf=stats.randint(8, 18),
    min_samples_split=stats.randint(8, 20),
)

# Number of parameter settings the random search samples.
n_iter = 10

# Load Data

In [None]:
def load_data(csv_path, input_features, output_features,
              label_column='식전혈당(공복혈당)', random_state=None):
    """Read a CSV and build class-balanced train and test sets.

    Rows whose ``label_column`` equals 1 (diabetes) and rows whose value equals 0
    (non-diabetes) are sampled separately; rows with any other label value are
    ignored. With ``k = (number of label-1 rows) // 5``, the train set receives
    ``4 * k`` rows of each class and the test set receives ``k`` rows of each
    class. Train and test rows never overlap, and both sets are shuffled and
    re-indexed from 0.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file passed to ``pandas.read_csv``.
    input_features : list of str
        Columns returned as model inputs.
    output_features : list of str
        Columns returned as model targets.
    label_column : str, optional
        Column holding the 0/1 class label used for balancing.
    random_state : int or None, optional
        Seed for all sampling and shuffling. ``None`` keeps pandas' default
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_inputs, train_outputs, test_inputs, test_outputs)``.

    Raises
    ------
    ValueError
        If the label-0 class has fewer than ``5 * k`` rows, so that balanced
        sampling without replacement is impossible.
    """
    print('Reading csv file')
    df = pd.read_csv(csv_path)

    # One generator is shared by every draw so that a single seed fixes the whole split.
    rng = None if random_state is None else np.random.RandomState(random_state)

    diabetes_df = df[df[label_column] == 1]
    no_diabetes_df = df[df[label_column] == 0]

    testset_size = len(diabetes_df) // 5
    if len(no_diabetes_df) < 5 * testset_size:
        raise ValueError(
            f'Need at least {5 * testset_size} rows with {label_column!r} == 0 '
            f'to build balanced splits, found {len(no_diabetes_df)}'
        )

    print('Splitting trainset')
    # Draw the diabetes part of the train set, then remove those rows by index
    # label so the test draw below cannot pick them again.
    trainset_diabetes = diabetes_df.sample(n=4 * testset_size, random_state=rng)
    diabetes_df = diabetes_df.drop(index=trainset_diabetes.index)

    # Same for the non-diabetes part of the train set.
    trainset_non_diabetes = no_diabetes_df.sample(n=4 * testset_size, random_state=rng)
    no_diabetes_df = no_diabetes_df.drop(index=trainset_non_diabetes.index)

    # Merge both classes into one shuffled train set.
    trainset = pd.concat([trainset_diabetes, trainset_non_diabetes])
    trainset = trainset.sample(frac=1, random_state=rng).reset_index(drop=True)

    print('Splitting testset')
    # Draw the test rows of each class from what the train set left over.
    testset_diabetes = diabetes_df.sample(n=testset_size, random_state=rng)
    testset_non_diabetes = no_diabetes_df.sample(n=testset_size, random_state=rng)

    # Merge both classes into one shuffled test set.
    testset = pd.concat([testset_diabetes, testset_non_diabetes])
    testset = testset.sample(frac=1, random_state=rng).reset_index(drop=True)

    return trainset[input_features], trainset[output_features], testset[input_features], testset[output_features]

# Random Search

In [None]:
def main(model, param_distributions, n_iter):
    """Tune ``model`` with a randomized search and report test-set metrics.

    This function reads the notebook-level variables ``trainset_inputs``,
    ``trainset_outputs``, ``testset_inputs`` and ``testset_outputs``, so the
    cell that assigns them must be executed first.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``RandomizedSearchCV`` as ``estimator``.
    param_distributions : dict
        Search space passed to ``RandomizedSearchCV``.
    n_iter : int
        Number of parameter settings to sample.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    random_cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='f1',
        cv=4,
        n_jobs=4,
        verbose=5,
        return_train_score=True,
        error_score='raise',  # fail immediately instead of scoring a failed fit
        random_state=42)

    # The labels arrive as a one-column frame; scikit-learn estimators expect a
    # 1-D target and otherwise emit a DataConversionWarning on every CV fit.
    train_labels = np.asarray(trainset_outputs)
    if train_labels.ndim == 2 and train_labels.shape[1] == 1:
        train_labels = train_labels.ravel()

    print('Fitting random search')
    search = random_cv.fit(trainset_inputs, train_labels)
    print(f'Best parameters is {search.best_params_}')

    print('== Classification Metrics ==')
    print(classification_report(testset_outputs, search.predict(testset_inputs)))

    # NOTE(review): plot_confusion_matrix and plot_roc_curve were deprecated in
    # scikit-learn 1.0 and removed in 1.2. Moving to
    # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator
    # also requires updating the notebook's import cell.
    plot_confusion_matrix(estimator=search.best_estimator_,
                          X=testset_inputs,
                          y_true=testset_outputs)

    plot_roc_curve(estimator=search.best_estimator_,
                   X=testset_inputs,
                   y=testset_outputs)

# Execute

In [None]:
# Build the balanced train/test splits that main() reads as notebook-level variables.
(
    trainset_inputs,
    trainset_outputs,
    testset_inputs,
    testset_outputs,
) = load_data(csv_path, input_features, output_features)


In [None]:
# Run the hyper-parameter search and print/plot the test-set evaluation.
main(model=model, param_distributions=param_distributions, n_iter=n_iter)