In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix,roc_auc_score
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [2]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(),
        DecisionTreeClassifier(),
        SVC(probability=True),
        RandomForestClassifier(),
        XGBClassifier(),
        KNeighborsClassifier(),
        LGBMClassifier()
    ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
        auc_score = roc_auc_score(y_test, pred_prob_rf)
        results = (
            round(accuracy_score(y_test, pred), 4),
            round(precision_score(y_test, pred), 4),
            round(recall_score(y_test, pred), 4),
            round(f1_score(y_test, pred), 4),
            round(auc_score, 4)
        )

        rdict['model'].append(clf)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['auc_score'].append(results[4])

        confusion = confusion_matrix(y_test, pred)
        print(confusion)

    rdf = pd.DataFrame(data=rdict)
    return rdf


In [3]:
def eval(test,pred):
    acc = accuracy_score(test,pred)
    f1 = f1_score(test,pred)
    precision = precision_score(test,pred)
    recall = recall_score(test,pred)
    print('##############\n',confusion_matrix(test,pred),
    "\n############\n",f'acc_score: {acc}\n f1_score: {f1} \n precision: {precision} \n recall: {recall}')


In [6]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)

map = {'보통주식비율':'대주주지분율'}
train.rename(columns = map,inplace=True)
test.rename(columns = map,inplace=True)


list_int = ['대주주지분변화분', '외국인지분분변화', '자산', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율','매출액순이익률', '총자본순이익률', '자기자본순이익률',
       '매출액증가율', '광고선전비대매출액비율', '비유동비율', '당좌비율', '현금비율', '부채비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '1인년간평균급여(직원)', '토빈Q', 'WW지수',
       'RDS', '업력', '산업평균_총자산증가율차분', '대주주지분율', '외국인_주식분포비율']
x_train = train[list_int]
y_train = train[['target']]

x_test = test[list_int]
y_test = test[['target']]



col_int = ['유동자산증가율', '자기자본증가율', '산업평균_총자산증가율차분',
       '정상영업이익증가율', '매출액순이익률', '자기자본순이익률','투자집중도',
         '유보액대납입자본배율', '1인년간평균급여(직원)', '토빈Q',
         '업력', '대주주지분율', '외국인_주식분포비율']

x_train = x_train[col_int]
y_train = y_train[['target']]

x_test = x_test[col_int]
y_test = y_test[['target']]

In [7]:
print(x_train.shape,x_test.shape)

(8368, 13) (1539, 13)


In [8]:
model_basic(x_train,y_train,x_test,y_test)

[[729 197]
 [345 268]]
[[637 289]
 [312 301]]
[[859  67]
 [477 136]]
[[799 127]
 [354 259]]
[[724 202]
 [316 297]]
[[720 206]
 [338 275]]
[[751 175]
 [290 323]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.6478,0.5763,0.4372,0.4972,0.7005
1,DecisionTreeClassifier(),0.6095,0.5102,0.491,0.5004,0.5895
2,SVC(probability=True),0.6465,0.67,0.2219,0.3333,0.6795
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.6875,0.671,0.4225,0.5185,0.7548
4,"XGBClassifier(base_score=None, booster=None, c...",0.6634,0.5952,0.4845,0.5342,0.7181
5,KNeighborsClassifier(),0.6465,0.5717,0.4486,0.5027,0.6616
6,LGBMClassifier(),0.6979,0.6486,0.5269,0.5815,0.7452


In [11]:
# LGBM은 컬럼명에 특수문자 있으면 안돌아가서 rename해줌
X_train_re = x_train.copy()
X_test_re = x_test.copy()
X_train_re.columns = list(range(1,len(x_train.columns)+1))
X_test_re.columns = list(range(1,len(x_train.columns)+1))

In [21]:
### stacking
# 개별모델 내부에서 CV 적용해 Stacking하는 함수 구현(k=5)
from sklearn.model_selection import KFold

def get_stacking_datasets(model):

    # 최종 메타 모델이 사용할 학습 데이터 반환을 위해서 넘파이 배열을 0으로 만들어서 초기화
    train_fold_pred = np.zeros((X_train_re.shape[0], 1)) # 2차원으로
    test_pred = np.zeros((X_test_re.shape[0], 5)) # n_splits=5
    print(model.__class__.__name__, '모델 시작')
    
    for folder_counter, (train_idx, valid_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=42).split(X_train_re)):
        # 개별 모델 내부에서 학습하고 1개의 fold로 예측할 데이터 셋 추출
        print(f" Fold 횟수 : {folder_counter+1}")
        X_train_ = X_train_re.iloc[train_idx]
        y_train_ = y_train.iloc[train_idx]
        X_test_ = X_train_re.iloc[valid_idx]
        
        # 개별 모델이 학습한 후 1개의 fold데이터셋으로 예측값 반환 후 최종 메타모델이 학습할 데이터셋에 첨가
        model.fit(X_train_, y_train_)
        train_fold_pred[valid_idx, :] = model.predict(X_test_).reshape(-1,1)
        # 개별 모델이 원본 데이터셋의 검증 데이터셋을 기반으로 예측 결과값 반환 후 최종 메타모델이 검증할 데이터셋에 첨가
        test_pred[:, folder_counter] = model.predict(X_test_re)
    
    # 개별모델안에서 테스트 데이터셋을 기반으로 예측한 결과값들 mean취해주고 2차원으로 바꾸어주기
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)
    
    return train_fold_pred, test_pred_mean

In [23]:
model_LR = LogisticRegression()
model_DT = DecisionTreeClassifier()
model_RF = RandomForestClassifier()
model_LGBM = LGBMClassifier()
model_XGB = XGBClassifier()

LR_train, LR_test = get_stacking_datasets(model_LR)
RF_train, RF_test = get_stacking_datasets(model_RF)
LGBM_train, LGBM_test = get_stacking_datasets(model_LGBM)
XGB_train, XGB_test = get_stacking_datasets(model_XGB)

new_X_train = np.concatenate((LR_train, RF_train, LGBM_train), axis = 1)
new_X_test = np.concatenate(( LR_test, RF_test, LGBM_test), axis = 1)

# meta learner
model_XGB.fit(new_X_train, y_train) # 최종모델 
y_test = pd.DataFrame(y_test)
y_hat_train = pd.DataFrame(model_XGB.predict(new_X_train))
y_hat = pd.DataFrame(model_XGB.predict(new_X_test))

LogisticRegression 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
RandomForestClassifier 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
LGBMClassifier 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
XGBClassifier 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5


In [24]:
from sklearn.metrics import accuracy_score, recall_score, precision_score , f1_score, roc_auc_score, confusion_matrix

rdict={'model':[], "acc_train":[], "auc_train":[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}

results_train  = (round(accuracy_score(y_train,y_hat_train),2),round(roc_auc_score(y_train,y_hat_train),2))

results = (round(accuracy_score(y_test,y_hat),2),
                round(precision_score(y_test,y_hat),2),
                round(recall_score(y_test,y_hat),2),
                round(f1_score(y_test,y_hat),2),
                round(roc_auc_score(y_test,y_hat),2))

rdict['model'].append("model"); 
rdict['acc_train'].append(results_train[0])
rdict['auc_train'].append(results_train[1])
rdict['acc_test'].append(results[0])
rdict['precision'].append(results[1])
rdict['recall'].append(results[2])
rdict['f1_score'].append(results[3])
rdict['AUC_test'].append(results[4])

rdf_stacking = pd.DataFrame(data=rdict)
rdf_stacking

Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,model,0.75,0.63,0.69,0.68,0.43,0.53,0.65


## GLM

In [6]:
map = {'1인년간평균급여(직원)' : '직원년간평균급여',}
train.rename(columns = map,inplace=True)

In [7]:
train.columns

Index(['대주주지분변화분', '외국인지분분변화', '자산', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율', '순이익증가율', '총포괄이익증가율', '매출액순이익률', '총자본순이익률', '자기자본순이익률',
       '매출액증가율', '광고선전비대매출액비율', '비유동비율', '당좌비율', '현금비율', '부채비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '직원년간평균급여', '토빈Q', 'WW지수', 'RDS',
       '업력', '산업평균_총자산증가율차분', '대주주지분율', '외국인_주식분포비율', '도입기', '성숙기', '성장기',
       '쇠퇴기', 'target'],
      dtype='object')

In [8]:
formula = 'target ~ 유동자산증가율+자기자본증가율+투자집중도+정상영업이익증가율+매출액순이익률+자기자본순이익률+산업평균_총자산증가율차분+유보액대납입자본배율+직원년간평균급여+토빈Q+업력+대주주지분율+외국인_주식분포비율'

In [9]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
model = smf.glm(formula, train,family=sm.families.Binomial()).fit()

In [10]:
model.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,8368.0
Model:,GLM,Df Residuals:,8354.0
Model Family:,Binomial,Df Model:,13.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4469.1
Date:,"Sat, 25 Mar 2023",Deviance:,8938.2
Time:,17:33:11,Pearson chi2:,8100.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.1255
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.5699,0.128,-27.961,0.000,-3.820,-3.320
유동자산증가율,0.3190,0.153,2.078,0.038,0.018,0.620
자기자본증가율,-0.6746,0.180,-3.749,0.000,-1.027,-0.322
투자집중도,-0.4658,0.132,-3.534,0.000,-0.724,-0.207
정상영업이익증가율,1.1359,0.114,9.984,0.000,0.913,1.359
매출액순이익률,0.7786,0.211,3.690,0.000,0.365,1.192
자기자본순이익률,2.4406,0.243,10.045,0.000,1.964,2.917
산업평균_총자산증가율차분,0.2114,0.168,1.259,0.208,-0.118,0.540
유보액대납입자본배율,0.6248,0.104,6.016,0.000,0.421,0.828
