In [63]:
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours

import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')


np.random.RandomState(1)

RandomState(MT19937) at 0x28C61AF5540

In [64]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(class_weight={0:1,1:10}),
        XGBClassifier(),
        
    ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
        auc_score = roc_auc_score(y_test, pred_prob_rf)
        results = (
            round(accuracy_score(y_test, pred), 4),
            round(precision_score(y_test, pred), 4),
            round(recall_score(y_test, pred), 4),
            round(f1_score(y_test, pred), 4),
            round(auc_score, 4)
        )

        rdict['model'].append(clf)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['auc_score'].append(results[4])

        confusion = confusion_matrix(y_test, pred)
        print(confusion)

    rdf = pd.DataFrame(data=rdict)
    return rdf

In [65]:
def over(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target
    
    if method == 'random':
        # RandomOverSampler 객체 생성, sampling_strategy 값 설정
        oversampler = RandomOverSampler(sampling_strategy=sampling_strategy)

    elif method == 'smote':
        # SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTE(sampling_strategy=sampling_strategy)

    elif method == 'adasyn':
        # adasyn 객체 생성, sampling_strategy 값 설정
        oversampler = ADASYN(sampling_strategy=sampling_strategy)

    elif method == 'Borderline-SMOTE':
        # Borderline-SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = BorderlineSMOTE(sampling_strategy=sampling_strategy)

    elif method == 'SMOTENC':
        # SMOTENC 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTENC(sampling_strategy=sampling_strategy)

    
    # 오버샘플링 수행
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # 오버샘플링된 데이터 프레임 생성
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [66]:
def under(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target

    if method == 'random':
        # RandomUnderSampler 객체 생성, sampling_strategy 값 설정
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy)

    elif method == 'tomek':
        # TomekLinks 객체 생성, sampling_strategy 값 설정
        undersampler = TomekLinks(sampling_strategy=sampling_strategy)

    elif method == 'NearMiss':
        # NearMiss 객체 생성, sampling_strategy 값 설정
        undersampler = NearMiss(sampling_strategy=sampling_strategy)

    elif method == 'cluster_centroids':
        # ClusterCentroids 객체 생성, sampling_strategy 값 설정
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy)

    elif method == 'edited_nn':
        # EditedNearestNeighbours 객체 생성, sampling_strategy 값 설정
        undersampler = EditedNearestNeighbours(sampling_strategy=sampling_strategy)

    # 언더샘플링 수행
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    # 언더샘플링된 데이터 프레임 생성
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

---
# target1

In [67]:
train = pd.read_csv("./datasets/통계검증완료/코스닥_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_standard_test.csv", index_col=0)

최종_col_1= ['EBIT/총자산', '현금흐름 대 자산', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액', '총자본회전률',
       '금융비용부담률', 'TMD', '자기자본순이익률']


x_train = train[최종_col_1]
y_train = train["target_1"]

x_test = test[최종_col_1]
y_test = test["target_1"]



In [68]:
y_train.value_counts()

0    5086
1     174
Name: target_1, dtype: int64

In [69]:
y_test.value_counts()

0    3503
1      89
Name: target_1, dtype: int64

In [70]:
train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_1],test["target_1"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], test_1[최종_col_1], test_1["target_1"])


[[161  17]
 [ 17  72]]
[[160  18]
 [ 17  72]]
[[151  27]
 [ 13  76]]
[[174   4]
 [ 35  54]]
[[173   5]
 [ 32  57]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8727,0.809,0.809,0.809,0.9284
1,LinearDiscriminantAnalysis(),0.8689,0.8,0.809,0.8045,0.9288
2,GaussianNB(),0.8502,0.7379,0.8539,0.7917,0.9252
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8539,0.931,0.6067,0.7347,0.9465
4,"XGBClassifier(base_score=None, booster=None, c...",0.8614,0.9194,0.6404,0.755,0.9357


--- 
# targer2

In [71]:
train = pd.read_csv("./datasets/통계검증완료/코스닥_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_standard_test.csv", index_col=0)

train.loc[~(train['산업군'].str.contains('제조업') | train['산업군'].str.contains('건설업') | train['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
dummy_df = pd.get_dummies(train['산업군'])
train = pd.concat([train,dummy_df],axis=1)
test.loc[~(test['산업군'].str.contains('제조업') | test['산업군'].str.contains('건설업') | test['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
dummy_df = pd.get_dummies(test['산업군'])
test = pd.concat([test,dummy_df],axis=1)


최종_col_2= ['총자본정상영업이익률', '순운전자본비율', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액', '금융비용부담률', 'TMD', '매출액총이익률']


x_train = train[최종_col_2]
y_train = train["target_2"]

x_test = test[최종_col_2]
y_test = test["target_2"]



In [72]:
train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_2],test["target_2"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], test_1[최종_col_2], test_1["target_2"])



[[147  27]
 [ 18  69]]
[[147  27]
 [ 20  67]]
[[143  31]
 [ 19  68]]
[[163  11]
 [ 38  49]]
[[160  14]
 [ 33  54]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8276,0.7188,0.7931,0.7541,0.8432
1,LinearDiscriminantAnalysis(),0.8199,0.7128,0.7701,0.7403,0.8433
2,GaussianNB(),0.8084,0.6869,0.7816,0.7312,0.8356
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8123,0.8167,0.5632,0.6667,0.8521
4,"XGBClassifier(base_score=None, booster=None, c...",0.8199,0.7941,0.6207,0.6968,0.8447


----
# target3

In [73]:
train = pd.read_csv("./datasets/통계검증완료/코스닥_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_standard_test.csv", index_col=0)

최종_col_3= ['EBIT/총자산', '현금흐름 대 자산', '순운전자본비율', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액',
       '총자본회전률', '금융비용부담률', '당기전기영업손익', '총자본증가율', 'TMD']


x_train = train[최종_col_3]
y_train = train["target_3"]

x_test = test[최종_col_3]
y_test = test["target_3"]



In [104]:
train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_3],test["target_3"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], test_1[최종_col_3], test_1["target_3"])

[[129  15]
 [ 11  61]]
[[130  14]
 [ 10  62]]
[[118  26]
 [  8  64]]
[[140   4]
 [ 33  39]]
[[138   6]
 [ 21  51]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8796,0.8026,0.8472,0.8243,0.9402
1,LinearDiscriminantAnalysis(),0.8889,0.8158,0.8611,0.8378,0.9564
2,GaussianNB(),0.8426,0.7111,0.8889,0.7901,0.9361
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8287,0.907,0.5417,0.6783,0.9504
4,"XGBClassifier(base_score=None, booster=None, c...",0.875,0.8947,0.7083,0.7907,0.9495


In [78]:
# 확인 
estimator= LinearDiscriminantAnalysis()
estimator.fit(train_1[최종_col_3], train_1["target_3"])
pd.DataFrame(estimator.coef_,columns=최종_col_3)

Unnamed: 0,EBIT/총자산,현금흐름 대 자산,순운전자본비율,누적수익성비율,abs(영업현금흐름-당기순이익)/매출액,총자본회전률,금융비용부담률,당기전기영업손익,총자본증가율,TMD
0,-0.546261,-0.944629,-0.13122,-0.817811,0.310797,-0.567727,0.314356,-0.451339,-0.373292,0.595246


In [101]:
estimator= LinearDiscriminantAnalysis()
estimator.fit(train_1[최종_col_3], train_1["target_3"])
scores = estimator.transform(train_1[최종_col_3])


group1_score = scores[train_1["target_3"] == 0].mean()
group2_score = scores[train_1["target_3"] == 1].mean()

In [102]:
group1_score

-0.8373358644800822

In [103]:
group2_score

1.675000482734989