In [37]:
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours

import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [38]:
def model_basic(x_train, y_train, x_test, y_test, models=None, random_state=None):
    """Fit a set of binary classifiers and tabulate their test-set metrics.

    Each model is fitted on ``(x_train, y_train)``, used to predict
    ``x_test``, and scored against ``y_test``. The confusion matrix of every
    model is printed in the order the models are evaluated.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : evaluation features and labels.
    models : list of estimators, optional
        Estimators exposing ``fit``, ``predict`` and ``predict_proba``.
        When omitted, the default baseline set below is used.
        The estimators are fitted in place.
    random_state : int, optional
        When given, it is applied to every estimator that has a top-level
        ``random_state`` parameter so that results are repeatable. ``None``
        (default) leaves the estimators untouched.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model`` (the fitted estimator),
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc_score``; every metric is rounded to 4 decimals.
    """
    if models is None:
        models = [
            LogisticRegression(),
            LinearDiscriminantAnalysis(),
            GaussianNB(),
            RandomForestClassifier(class_weight={0: 1, 1: 10}),
            XGBClassifier(),
        ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        # Seed only the estimators that actually expose the parameter.
        if random_state is not None and 'random_state' in clf.get_params(deep=False):
            clf.set_params(random_state=random_state)

        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        # Second column of predict_proba is the score of the positive class;
        # roc_auc_score accepts the 1-D vector directly.
        positive_proba = clf.predict_proba(x_test)[:, 1]

        rdict['model'].append(clf)
        rdict['accuracy'].append(round(accuracy_score(y_test, pred), 4))
        rdict['precision'].append(round(precision_score(y_test, pred), 4))
        rdict['recall'].append(round(recall_score(y_test, pred), 4))
        rdict['f1_score'].append(round(f1_score(y_test, pred), 4))
        rdict['auc_score'].append(round(roc_auc_score(y_test, positive_proba), 4))

        print(confusion_matrix(y_test, pred))

    return pd.DataFrame(data=rdict)

In [39]:
def over(df, target, method='random', sampling_strategy=1.0,
         categorical_features=None, random_state=None):
    """Oversample ``df``/``target`` and return them joined in one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    target : pandas.Series
        Label column aligned with ``df``.
    method : str
        One of ``'random'``, ``'smote'``, ``'adasyn'``,
        ``'Borderline-SMOTE'`` or ``'SMOTENC'``.
    sampling_strategy : float, str, dict or callable
        Forwarded unchanged to the imbalanced-learn sampler.
    categorical_features : optional
        Forwarded to ``SMOTENC``; required when ``method='SMOTENC'`` and
        ignored by every other method.
    random_state : int, optional
        Forwarded to the sampler so that the resampling is repeatable.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if
        ``method='SMOTENC'`` is requested without ``categorical_features``.
    """
    if method == 'random':
        oversampler = RandomOverSampler(sampling_strategy=sampling_strategy,
                                        random_state=random_state)
    elif method == 'smote':
        oversampler = SMOTE(sampling_strategy=sampling_strategy,
                            random_state=random_state)
    elif method == 'adasyn':
        oversampler = ADASYN(sampling_strategy=sampling_strategy,
                             random_state=random_state)
    elif method == 'Borderline-SMOTE':
        oversampler = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                      random_state=random_state)
    elif method == 'SMOTENC':
        # SMOTENC cannot be constructed without knowing which columns are
        # categorical, so fail early with an actionable message.
        if categorical_features is None:
            raise ValueError(
                "method='SMOTENC' requires the 'categorical_features' argument"
            )
        oversampler = SMOTENC(categorical_features=categorical_features,
                              sampling_strategy=sampling_strategy,
                              random_state=random_state)
    else:
        raise ValueError(
            "unknown oversampling method {!r}; expected one of 'random', "
            "'smote', 'adasyn', 'Borderline-SMOTE', 'SMOTENC'".format(method)
        )

    # Run the oversampling.
    X_resampled, y_resampled = oversampler.fit_resample(df, target)

    # Re-attach the target as a column next to the resampled features.
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [40]:
def under(df, target, method='random', sampling_strategy=1.0, random_state=None):
    """Undersample ``df``/``target`` and return them joined in one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    target : pandas.Series
        Label column aligned with ``df``.
    method : str
        One of ``'random'``, ``'tomek'``, ``'NearMiss'``,
        ``'cluster_centroids'`` or ``'edited_nn'``.
    sampling_strategy : float, str, list, dict or callable
        Forwarded to the imbalanced-learn sampler. ``'tomek'`` and
        ``'edited_nn'`` are cleaning methods that do not accept a numeric
        ratio; for them a numeric value (including the default ``1.0``) is
        replaced by ``'auto'`` and a warning is issued.
    random_state : int, optional
        Forwarded to the samplers that take a seed (``'random'`` and
        ``'cluster_centroids'``); ignored by the other methods.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    import warnings
    from numbers import Real

    if method == 'random':
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                          random_state=random_state)
    elif method == 'NearMiss':
        undersampler = NearMiss(sampling_strategy=sampling_strategy)
    elif method == 'cluster_centroids':
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy,
                                        random_state=random_state)
    elif method in ('tomek', 'edited_nn'):
        # Cleaning samplers remove samples by rule and reject a target ratio,
        # so a numeric strategy would make fit_resample raise.
        if isinstance(sampling_strategy, Real):
            warnings.warn(
                "method={!r} does not support a numeric sampling_strategy; "
                "using 'auto' instead".format(method)
            )
            sampling_strategy = 'auto'
        if method == 'tomek':
            undersampler = TomekLinks(sampling_strategy=sampling_strategy)
        else:
            undersampler = EditedNearestNeighbours(sampling_strategy=sampling_strategy)
    else:
        raise ValueError(
            "unknown undersampling method {!r}; expected one of 'random', "
            "'tomek', 'NearMiss', 'cluster_centroids', 'edited_nn'".format(method)
        )

    # Run the undersampling.
    X_resampled, y_resampled = undersampler.fit_resample(df, target)

    # Re-attach the target as a column next to the resampled features.
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

---
# target1

In [41]:
# Load the train/test CSV files; the first CSV column becomes the index.
train = pd.read_csv("./datasets/통계검증완료/코스피_standar_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standar_test.csv", index_col=0)

# Feature columns used for target_1.
최종_col_1 = [
    '총자본정상영업이익률',
    'EBIT/총자산',
    'TMD',
    '누적수익성비율',
    '금융비용부담률',
    '자기자본순이익률',
    '정상영업이익증가율',
]

# Split each frame into its feature matrix and the target_1 label column.
x_train, y_train = train[최종_col_1], train["target_1"]
x_test, y_test = test[최종_col_1], test["target_1"]




In [42]:
# Class counts of the training labels, to show how imbalanced the target is.
y_train.value_counts()

0.0    2753
1.0      41
Name: target_1, dtype: int64

In [43]:
# Class counts of the test labels, to show how imbalanced the target is.
y_test.value_counts()

0.0    2004
1.0       9
Name: target_1, dtype: int64

In [44]:
# Oversample the training split with Borderline-SMOTE (sampling_strategy=0.1).
train_1 = over(
    train[최종_col_1],
    train["target_1"],
    method='Borderline-SMOTE',
    sampling_strategy=0.1,
)

# NOTE(review): the evaluation split is randomly undersampled too, so the
# metrics below are computed on a class ratio that differs from the raw test
# data, and no seed is set, so they vary between runs. Confirm this is intended.
test_1 = under(
    test[최종_col_1],
    test["target_1"],
    method='random',
    sampling_strategy=0.1,
)

# Fit the baseline models on the resampled training data and score them on
# the resampled test data; the returned metrics table is the cell output.
model_basic(
    x_train=train_1[최종_col_1],
    y_train=train_1["target_1"],
    x_test=test_1[최종_col_1],
    y_test=test_1["target_1"],
)


[[85  5]
 [ 1  8]]
[[86  4]
 [ 3  6]]
[[75 15]
 [ 1  8]]
[[86  4]
 [ 2  7]]
[[86  4]
 [ 2  7]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9394,0.6154,0.8889,0.7273,0.9704
1,LinearDiscriminantAnalysis(),0.9293,0.6,0.6667,0.6316,0.9679
2,GaussianNB(),0.8384,0.3478,0.8889,0.5,0.9309
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9394,0.6364,0.7778,0.7,0.9796
4,"XGBClassifier(base_score=None, booster=None, c...",0.9394,0.6364,0.7778,0.7,0.9741


--- 
# target2

In [45]:
# Load the train/test CSV files; the first CSV column becomes the index.
train = pd.read_csv("./datasets/통계검증완료/코스피_standar_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standar_test.csv", index_col=0)

# Feature columns used for target_2.
최종_col_2 = [
    '총자본정상영업이익률',
    'TMD',
    '누적수익성비율',
    '부채비율',
    '금융비용부담률',
    '자기자본순이익률',
]

# Split each frame into its feature matrix and the target_2 label column.
x_train, y_train = train[최종_col_2], train["target_2"]
x_test, y_test = test[최종_col_2], test["target_2"]



In [46]:
# Oversample the training split with Borderline-SMOTE (sampling_strategy=0.1).
train_1 = over(
    train[최종_col_2],
    train["target_2"],
    method='Borderline-SMOTE',
    sampling_strategy=0.1,
)

# NOTE(review): the evaluation split is randomly undersampled too, so the
# metrics below are computed on a class ratio that differs from the raw test
# data, and no seed is set, so they vary between runs. Confirm this is intended.
test_1 = under(
    test[최종_col_2],
    test["target_2"],
    method='random',
    sampling_strategy=0.1,
)

# Fit the baseline models on the resampled training data and score them on
# the resampled test data; the returned metrics table is the cell output.
model_basic(
    x_train=train_1[최종_col_2],
    y_train=train_1["target_2"],
    x_test=test_1[최종_col_2],
    y_test=test_1["target_2"],
)



[[102   8]
 [  5   6]]
[[106   4]
 [  5   6]]
[[97 13]
 [ 4  7]]
[[107   3]
 [  5   6]]
[[107   3]
 [  5   6]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8926,0.4286,0.5455,0.48,0.8066
1,LinearDiscriminantAnalysis(),0.9256,0.6,0.5455,0.5714,0.7785
2,GaussianNB(),0.8595,0.35,0.6364,0.4516,0.7851
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9339,0.6667,0.5455,0.6,0.7752
4,"XGBClassifier(base_score=None, booster=None, c...",0.9339,0.6667,0.5455,0.6,0.7764


----
# target3

In [47]:
# Load the train/test CSV files; the first CSV column becomes the index.
train = pd.read_csv("./datasets/통계검증완료/코스피_standar_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standar_test.csv", index_col=0)

# Feature columns used for target_3.
최종_col_3 = [
    '총자본정상영업이익률',
    'TMD',
    '누적수익성비율',
    '부채비율',
    '금융비용부담률',
    '자기자본순이익률',
]

# Split each frame into its feature matrix and the target_3 label column.
x_train, y_train = train[최종_col_3], train["target_3"]
x_test, y_test = test[최종_col_3], test["target_3"]



In [48]:
# Oversample the training split with Borderline-SMOTE (sampling_strategy=0.1).
train_1 = over(
    train[최종_col_3],
    train["target_3"],
    method='Borderline-SMOTE',
    sampling_strategy=0.1,
)

# NOTE(review): the evaluation split is randomly undersampled too, so the
# metrics below are computed on a class ratio that differs from the raw test
# data, and no seed is set, so they vary between runs. Confirm this is intended.
test_1 = under(
    test[최종_col_3],
    test["target_3"],
    method='random',
    sampling_strategy=0.1,
)

# Fit the baseline models on the resampled training data and score them on
# the resampled test data; the returned metrics table is the cell output.
model_basic(
    x_train=train_1[최종_col_3],
    y_train=train_1["target_3"],
    x_test=test_1[최종_col_3],
    y_test=test_1["target_3"],
)



[[57  3]
 [ 0  6]]
[[58  2]
 [ 1  5]]
[[56  4]
 [ 0  6]]
[[58  2]
 [ 1  5]]
[[59  1]
 [ 1  5]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9545,0.6667,1.0,0.8,0.9833
1,LinearDiscriminantAnalysis(),0.9545,0.7143,0.8333,0.7692,0.9833
2,GaussianNB(),0.9394,0.6,1.0,0.75,0.9736
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9545,0.7143,0.8333,0.7692,0.9889
4,"XGBClassifier(base_score=None, booster=None, c...",0.9697,0.8333,0.8333,0.8333,0.9833
