In [29]:
import pandas as pd

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours

import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [30]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier()
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),2),round(roc_auc_score(y_train,y_hat),2))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),2),
                        round(precision_score(y_test,y_hat),2),
                        round(recall_score(y_test,y_hat),2),
                        round(f1_score(y_test,y_hat),2),
                        round(roc_auc_score(y_test,y_hat),2))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [31]:
# def model_basic(x_train, y_train, x_test, y_test):
#     models = [
#         LogisticRegression(),
#         LinearDiscriminantAnalysis(),
#         GaussianNB(),
#         RandomForestClassifier(),
#         XGBClassifier(),
        
#     ]

#     rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

#     for clf in models:
#         clf = clf.fit(x_train, y_train)
#         pred = clf.predict(x_test)
#         pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
#         auc_score = roc_auc_score(y_test, pred_prob_rf)
#         results = (
#             round(accuracy_score(y_test, pred), 4),
#             round(precision_score(y_test, pred), 4),
#             round(recall_score(y_test, pred), 4),
#             round(f1_score(y_test, pred), 4),
#             round(auc_score, 4)
#         )

#         rdict['model'].append(clf)
#         rdict['accuracy'].append(results[0])
#         rdict['precision'].append(results[1])
#         rdict['recall'].append(results[2])
#         rdict['f1_score'].append(results[3])
#         rdict['auc_score'].append(results[4])

#         confusion = confusion_matrix(y_test, pred)
#         print(confusion)

#     rdf = pd.DataFrame(data=rdict)
#     return rdf

In [32]:
def over(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target
    
    if method == 'random':
        # RandomOverSampler 객체 생성, sampling_strategy 값 설정
        oversampler = RandomOverSampler(sampling_strategy=sampling_strategy)

    elif method == 'smote':
        # SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTE(sampling_strategy=sampling_strategy)

    elif method == 'adasyn':
        # adasyn 객체 생성, sampling_strategy 값 설정
        oversampler = ADASYN(sampling_strategy=sampling_strategy)

    elif method == 'Borderline-SMOTE':
        # Borderline-SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = BorderlineSMOTE(sampling_strategy=sampling_strategy)

    elif method == 'SMOTENC':
        # SMOTENC 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTENC(sampling_strategy=sampling_strategy)

    
    # 오버샘플링 수행
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # 오버샘플링된 데이터 프레임 생성
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [33]:
def under(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target

    if method == 'random':
        # RandomUnderSampler 객체 생성, sampling_strategy 값 설정
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy)

    elif method == 'tomek':
        # TomekLinks 객체 생성, sampling_strategy 값 설정
        undersampler = TomekLinks(sampling_strategy=sampling_strategy)

    elif method == 'NearMiss':
        # NearMiss 객체 생성, sampling_strategy 값 설정
        undersampler = NearMiss(sampling_strategy=sampling_strategy)

    elif method == 'cluster_centroids':
        # ClusterCentroids 객체 생성, sampling_strategy 값 설정
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy)

    elif method == 'edited_nn':
        # EditedNearestNeighbours 객체 생성, sampling_strategy 값 설정
        undersampler = EditedNearestNeighbours(sampling_strategy=sampling_strategy)

    # 언더샘플링 수행
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    # 언더샘플링된 데이터 프레임 생성
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

In [34]:
def evaluate_resampling_model(x_train, y_train, x_test, y_test, method, num_iterations=10, over_ratio = 0.5 ,under_ratio=0.5,random_state=None):
    

    # 모델 성능 저장을 위한 리스트
    score = {"accuracy_scores_mean" : [] ,"f1_scores_mean" :[], "recall_scores_mean" : [] , "precision_scores_mean" : [],  "auc_mean" : []}
    std_score = {"accuracy_scores_std" : [] ,"f1_scores_std" :[], "recall_scores_std" : [] , "precision_scores_std" : [],  "auc_std" : []}  

    recall_scores=[]
    accuracy_scores=[]
    f1_scores=[]
    precision_scores=[]
    auc_scores=[]
    
    
    # 랜덤샘플러 생성
    sampler_over = BorderlineSMOTE(sampling_strategy=over_ratio, random_state=random_state)
    sampler_under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=random_state)

    

    # 랜덤 샘플링 반복 수행
    for i in range(num_iterations):
        
        x_train_over, y_train_over = sampler_over.fit_resample(x_train, y_train)
        x_test_under, y_test_under = sampler_under.fit_resample(x_test, y_test)

        if method == "Logistic":
            model = LogisticRegression().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)

        if method == "LDA":
            model = LinearDiscriminantAnalysis().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)
        
        if method == "Gaussian":
            model = GaussianNB().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)
        
        #모델 성능 평가 
        accuracy = accuracy_score(y_test_under, y_pred)
        f1 = f1_score(y_test_under, y_pred)
        recall = recall_score(y_test_under, y_pred)
        precision = precision_score(y_test_under, y_pred)
        auc = roc_auc_score(y_test_under, y_pred_proba)
        
        # 리스트에 결과값 담아주기
        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
        recall_scores.append(recall)
        precision_scores.append(precision)
        auc_scores.append(auc)

    

    # 모델 성능 평균 및 표준편차 계산
    accuracy_mean = np.mean(accuracy_scores)
    f1_mean = np.mean(f1_scores)
    recall_mean = np.mean(recall_scores)
    precision_mean = np.mean(precision_scores)
    auc_scores_mean = np.mean(auc_scores)


    score["accuracy_scores_mean"].append(accuracy_mean)
    score["f1_scores_mean"].append(f1_mean)
    score["recall_scores_mean"].append(recall_mean)
    score["precision_scores_mean"].append(precision_mean)
    score["auc_mean"].append(auc_scores_mean)

    accuracy_std = np.std(accuracy_scores)
    f1_std = np.std(f1_scores)
    recall_std = np.std(recall_scores)
    precision_std = np.std(precision_scores)
    auc_scores_std = np.std(auc_scores)

    std_score["accuracy_scores_std"].append(accuracy_std)
    std_score["f1_scores_std"].append(f1_std)
    std_score["recall_scores_std"].append(recall_std)
    std_score["precision_scores_std"].append(precision_std)
    std_score["auc_std"].append(auc_scores_std)
    
    print(score)
    print(std_score)

---
# target1

In [35]:
train = pd.read_csv("./datasets/통계검증완료/코스피_power_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_power_test.csv", index_col=0)


최종_col_1=['이익잉여금', '누적수익성비율', 'EBITDA마진율', '자기자본순이익률', '금융비용부담률',
       'abs(영업현금흐름-당기순이익)/매출액', '총자본증가율', '매출액총이익률']

x_train = train[최종_col_1]
y_train = train["target_1"]

x_test = test[최종_col_1]
y_test = test["target_1"]




In [36]:
y_train.value_counts()

0.0    3669
1.0      50
Name: target_1, dtype: int64

In [37]:
y_test.value_counts()

0.0    2004
1.0       9
Name: target_1, dtype: int64

In [38]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1959   45]
 [   6    3]]
[[1956   48]
 [   5    4]]
[[1839  165]
 [   3    6]]
[[1965   39]
 [   6    3]]
[[1974   30]
 [   6    3]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.99,0.82,0.97,0.06,0.33,0.11,0.66
1,LinearDiscriminantAnalysis(),0.99,0.83,0.97,0.08,0.44,0.13,0.71
2,GaussianNB(),0.97,0.89,0.92,0.04,0.67,0.07,0.79
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.98,0.07,0.33,0.12,0.66
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.98,0.09,0.33,0.14,0.66


In [39]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], x_test, y_test)

[[1914   90]
 [   2    7]]
[[1888  116]
 [   1    8]]
[[1827  177]
 [   1    8]]
[[1961   43]
 [   5    4]]
[[1951   53]
 [   6    3]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.95,0.07,0.78,0.13,0.87
1,LinearDiscriminantAnalysis(),0.97,0.97,0.94,0.06,0.89,0.12,0.92
2,GaussianNB(),0.97,0.97,0.91,0.04,0.89,0.08,0.9
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.98,0.09,0.44,0.14,0.71
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.97,0.05,0.33,0.09,0.65


In [40]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_1],test["target_1"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], test_1[최종_col_1], test_1["target_1"])


[[17  1]
 [ 2  7]]
[[17  1]
 [ 1  8]]
[[16  2]
 [ 1  8]]
[[18  0]
 [ 5  4]]
[[18  0]
 [ 6  3]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.89,0.88,0.78,0.82,0.86
1,LinearDiscriminantAnalysis(),0.97,0.97,0.93,0.89,0.89,0.89,0.92
2,GaussianNB(),0.97,0.97,0.89,0.8,0.89,0.84,0.89
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.81,1.0,0.44,0.62,0.72
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.78,1.0,0.33,0.5,0.67


In [41]:
# 반복샘플링
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8985925925925926], 'f1_scores_mean': [0.838731639146887], 'recall_scores_mean': [0.7800000000000002], 'precision_scores_mean': [0.9122540404040405], 'auc_mean': [0.9739629629629628]}
{'accuracy_scores_std': [0.031369211575081865], 'f1_scores_std': [0.04092430342914875], 'recall_scores_std': [0.015555555555555546], 'precision_scores_std': [0.0906482886840834], 'auc_std': [0.02786028624151094]}


In [42]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.924], 'f1_scores_mean': [0.8888870947422033], 'recall_scores_mean': [0.8888888888888892], 'precision_scores_mean': [0.893685780885781], 'auc_mean': [0.9713024691358025]}
{'accuracy_scores_std': [0.036651624299422356], 'f1_scores_std': [0.04679010042834448], 'recall_scores_std': [3.3306690738754696e-16], 'precision_scores_std': [0.0913672832353379], 'auc_std': [0.025689544818876538]}


--- 
# targer2

In [43]:
train = pd.read_csv("./datasets/통계검증완료/코스피_power_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_power_test.csv", index_col=0)


최종_col_2= ['이익잉여금', '누적수익성비율', '금융비용부담률', '자기자본회전률', '매출액증가율', '자기자본증가율',
       '정상영업이익증가율']

x_train = train[최종_col_2]
y_train = train["target_2"]

x_test = test[최종_col_2]
y_test = test["target_2"]



In [44]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1969   33]
 [   8    3]]
[[1963   39]
 [   8    3]]
[[1849  153]
 [   8    3]]
[[1977   25]
 [   8    3]]
[[1985   17]
 [   9    2]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.99,0.79,0.98,0.08,0.27,0.13,0.63
1,LinearDiscriminantAnalysis(),0.99,0.77,0.98,0.07,0.27,0.11,0.63
2,GaussianNB(),0.98,0.86,0.92,0.02,0.27,0.04,0.6
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.98,0.11,0.27,0.15,0.63
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.99,0.11,0.18,0.13,0.59


In [45]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], x_test, y_test)

[[1898  104]
 [   4    7]]
[[1892  110]
 [   6    5]]
[[1817  185]
 [   6    5]]
[[1948   54]
 [   8    3]]
[[1949   53]
 [   7    4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.95,0.06,0.64,0.11,0.79
1,LinearDiscriminantAnalysis(),0.97,0.97,0.94,0.04,0.45,0.08,0.7
2,GaussianNB(),0.97,0.96,0.91,0.03,0.45,0.05,0.68
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.97,0.05,0.27,0.09,0.62
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.97,0.07,0.36,0.12,0.67


In [46]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_2],test["target_2"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], test_1[최종_col_2], test_1["target_2"])

[[20  2]
 [ 4  7]]
[[20  2]
 [ 6  5]]
[[20  2]
 [ 6  5]]
[[21  1]
 [ 8  3]]
[[21  1]
 [ 7  4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.82,0.78,0.64,0.7,0.77
1,LinearDiscriminantAnalysis(),0.97,0.97,0.76,0.71,0.45,0.56,0.68
2,GaussianNB(),0.96,0.96,0.76,0.71,0.45,0.56,0.68
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.73,0.75,0.27,0.4,0.61
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.76,0.8,0.36,0.5,0.66


In [47]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8417272727272725], 'f1_scores_mean': [0.7289625367087407], 'recall_scores_mean': [0.6320909090909093], 'precision_scores_mean': [0.8689210733710734], 'auc_mean': [0.8608181818181817]}
{'accuracy_scores_std': [0.032963109301752794], 'f1_scores_std': [0.04181123970359448], 'recall_scores_std': [0.01923989484688023], 'precision_scores_std': [0.10604837872447266], 'auc_std': [0.03646840560826274]}


In [48]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.7824545454545455], 'f1_scores_mean': [0.5841707415821192], 'recall_scores_mean': [0.45454545454545436], 'precision_scores_mean': [0.8308731962481962], 'auc_mean': [0.8747355371900826]}
{'accuracy_scores_std': [0.03159312905505627], 'f1_scores_std': [0.034119638538753196], 'recall_scores_std': [1.6653345369377348e-16], 'precision_scores_std': [0.13085803940615176], 'auc_std': [0.04049953685520321]}


----
# target3 


In [49]:
test.columns

Index(['회사명', '거래소코드', '회계년도', '산업군', 'target_1', 'target_2', 'target_3',
       '현금흐름 대 자산', '현금흐름 대 매출액', '현금흐름/총부채비율', '총자본정상영업이익률', 'EBIT/총자산',
       '타인자본회전률', '이익잉여금', '당기전기영업손익', '누적수익성비율', 'EBITDA마진율', '순운전자본비율',
       '자기자본순이익률', '금융비용부담률', '자기자본회전률', '자본금회전률', '시총/총자산', '외국인지분율',
       'abs(영업현금흐름-당기순이익)/매출액', '총자본증가율', '매출액총이익률', '매출액증가율', '대주주지분율',
       '자기자본증가율', 'ROA변화율', '정상영업이익증가율'],
      dtype='object')

In [50]:
train = pd.read_csv("./datasets/통계검증완료/코스피_power_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_power_test.csv", index_col=0)

최종_col_3= ['이익잉여금', '누적수익성비율', '금융비용부담률', '자기자본회전률', '매출액증가율', '자기자본증가율',
       '정상영업이익증가율']
id_col = ['회사명', '거래소코드', '회계년도', '산업군','target_3']

x_train = train[최종_col_3]
y_train = train["target_3"]

x_test = test[최종_col_3]
y_test = test["target_3"]



In [51]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1974   33]
 [   3    3]]
[[1968   39]
 [   3    3]]
[[1854  153]
 [   3    3]]
[[1979   28]
 [   3    3]]
[[1990   17]
 [   4    2]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.99,0.79,0.98,0.08,0.5,0.14,0.74
1,LinearDiscriminantAnalysis(),0.99,0.77,0.98,0.07,0.5,0.12,0.74
2,GaussianNB(),0.98,0.86,0.92,0.02,0.5,0.04,0.71
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.98,0.1,0.5,0.16,0.74
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.99,0.11,0.33,0.16,0.66


In [58]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], x_test, y_test)

[[1901  106]
 [   0    6]]
[[1899  108]
 [   2    4]]
[[1825  182]
 [   2    4]]
[[1954   53]
 [   4    2]]
[[1956   51]
 [   3    3]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.95,0.05,1.0,0.1,0.97
1,LinearDiscriminantAnalysis(),0.97,0.97,0.95,0.04,0.67,0.07,0.81
2,GaussianNB(),0.97,0.97,0.91,0.02,0.67,0.04,0.79
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.97,0.04,0.33,0.07,0.65
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.97,0.06,0.5,0.1,0.74


In [53]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_3],test["target_3"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], test_1[최종_col_3], test_1["target_3"])


[[9 3]
 [0 6]]
[[9 3]
 [2 4]]
[[9 3]
 [2 4]]
[[11  1]
 [ 4  2]]
[[11  1]
 [ 3  3]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.98,0.83,0.67,1.0,0.8,0.88
1,LinearDiscriminantAnalysis(),0.97,0.97,0.72,0.57,0.67,0.62,0.71
2,GaussianNB(),0.96,0.96,0.72,0.57,0.67,0.62,0.71
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.72,0.67,0.33,0.44,0.62
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.78,0.75,0.5,0.6,0.71


In [54]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9647222222222224], 'f1_scores_mean': [0.9522478854478855], 'recall_scores_mean': [0.9923333333333333], 'precision_scores_mean': [0.9207892857142858], 'auc_mean': [0.97375]}
{'accuracy_scores_std': [0.043096754482368875], 'f1_scores_std': [0.05655007748388134], 'recall_scores_std': [0.034914180500192175], 'precision_scores_std': [0.09542017290738757], 'auc_std': [0.036777767287678946]}


In [55]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8531666666666669], 'f1_scores_mean': [0.7554827838827839], 'recall_scores_mean': [0.6666666666666665], 'precision_scores_mean': [0.8832904761904763], 'auc_mean': [0.9625416666666667]}
{'accuracy_scores_std': [0.04365800734044408], 'f1_scores_std': [0.05187539231183165], 'recall_scores_std': [1.1102230246251565e-16], 'precision_scores_std': [0.13071943948552728], 'auc_std': [0.043517218016422984]}


----
## target3 - 정상:부실 = 1:0.5    - boderine_smote 했을때 성능이 가장 좋음 


In [56]:
가져가는_col = id_col + 최종_col_3
train_1.to_csv('./datasets/불균형처리/boder_코스피_power_train.csv',encoding='utf-8-sig')
test[가져가는_col].to_csv('./datasets/불균형처리/코스피_power_test.csv',encoding='utf-8-sig')