In [None]:
import pandas as pd
# Load the credit dataset and drop four columns that are not used as inputs
# below (presumably outputs of earlier model runs -- confirm against the CSV).
columns_to_discard = ['score', 'prediction', 'DL_pred', 'ensemble']
df = pd.read_csv('final_credit_dataset.csv').drop(columns=columns_to_discard)

In [None]:
from sklearn.model_selection import train_test_split 

# Features are every column from the third onwards; the label is the second column.
data_feature = df.iloc[:, 2:]
data_label = df.iloc[:, 1]

# Label encoding.
# The original cell called fit_transform() and threw the result away, so the
# labels were never actually encoded. Keep the encoded labels and use them.
from sklearn.preprocessing import LabelEncoder

oh_encoder = LabelEncoder()
data_label_encoded = oh_encoder.fit_transform(data_label)

# Hold out 10% as the test set, then 10% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(data_feature, data_label_encoded, test_size=0.1, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=True)

# Standardisation (scaling).
# The original cell also discarded the scaler output, so the features were never
# scaled. Fit on the training split only (no information from the validation or
# test rows leaks into the statistics) and apply the same transform to all splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print(X_train.shape, X_test.shape,  y_train.shape , y_test.shape)
print(X_val.shape, y_val.shape)

In [None]:
# 분류 평가함수 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score


def get_clf_evaluation(y_test, pred=None, pred_proba=None):
  """Print the confusion matrix and summary classification metrics.

  Args:
    y_test: true labels.
    pred: predicted labels; required by every metric except AUC.
    pred_proba: predicted scores/probabilities handed to ``roc_auc_score``.
      When omitted, AUC is reported as ``nan`` instead of raising.

  Returns:
    None. Results are printed.

  Note:
    precision/recall/F1 are called with their default arguments; with
    scikit-learn's defaults that presumably means binary targets -- confirm
    before using this on multi-class labels.
  """
  # f1_score is not part of the cell-level sklearn.metrics import, so the
  # original raised NameError; import it locally to keep the function usable.
  from sklearn.metrics import f1_score

  confusion = confusion_matrix(y_test, pred)
  accuracy = accuracy_score(y_test, pred)
  precision = precision_score(y_test, pred)
  recall = recall_score(y_test, pred)
  f1 = f1_score(y_test, pred)

  # The original wrote `roc=auc = ...` (typo) and never passed the value to the
  # format call, so the `{4}` placeholder raised IndexError.
  if pred_proba is not None:
    roc_auc = roc_auc_score(y_test, pred_proba)
  else:
    roc_auc = float('nan')

  print('오차행렬')
  print(confusion)

  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율 : {2:.4f}, \
        F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc) )
  
from sklearn.metrics import precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline 

def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: true labels passed to ``precision_recall_curve``.
        pred_proba_c1: scores for the positive class passed to
            ``precision_recall_curve``.

    The precision and recall arrays are truncated to the number of thresholds
    before plotting. Precision is drawn dashed, recall solid.
    """
    prec, rec, thr = precision_recall_curve(y_test, pred_proba_c1)
    n_thr = thr.shape[0]

    plt.figure(figsize=(8, 6))
    curves = (
        (prec, {'linestyle': '--', 'label': 'precision'}),
        (rec, {'label': 'recall'}),
    )
    for values, line_kwargs in curves:
        plt.plot(thr, values[:n_thr], **line_kwargs)

    # Put an x tick every 0.1 across the current threshold range.
    x_lo, x_hi = plt.xlim()
    tick_positions = np.round(np.arange(x_lo, x_hi, 0.1), 2)
    plt.xticks(tick_positions)

    # Axis labels, legend and grid.
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

def roc_curve_plot(y_test , pred_proba_c1):
    """Draw the ROC curve together with the diagonal reference line.

    Args:
        y_test: true labels passed to ``roc_curve``.
        pred_proba_c1: scores for the positive class passed to ``roc_curve``.
    """
    # Only the FPR/TPR arrays are plotted; the thresholds are not needed.
    fpr_values, tpr_values, _ = roc_curve(y_test, pred_proba_c1)

    plt.plot(fpr_values, tpr_values, label='ROC')
    # Dashed black diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    # X ticks every 0.1 over the current x range, then clamp both axes to [0, 1].
    x_lo, x_hi = plt.xlim()
    tick_positions = np.round(np.arange(x_lo, x_hi, 0.1), 2)
    plt.xticks(tick_positions)
    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plt.xlabel('FPR( 1 - Sensitivity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()
  

In [None]:
# 분류 
# 로지스틱 회귀, KNN, DecisionTreeClassifier, RandomForestClassifier 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier 
import warnings 
warnings.filterwarnings('ignore')

# Base estimators.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=8)
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier(n_estimators=10000, random_state=0, max_depth=8)

# Soft-voting ensemble over logistic regression, KNN and the decision tree.
voting_members = [('LR', lr_clf), ('KNN', knn_clf), ('DT', dt_clf)]
vo_clf = VotingClassifier(estimators=voting_members, voting='soft')

# Voting classifier: fit, predict, report accuracy.
vo_clf.fit(X_train, y_train)
pred_vo = vo_clf.predict(X_test)
acc_vo = accuracy_score(y_test, pred_vo)
print(f'Voting 분류기 정확도: {acc_vo:.4f}')

# Random forest: fit, predict, report accuracy.
rf_clf.fit(X_train, y_train)
pred_rf = rf_clf.predict(X_test)
acc_rf = accuracy_score(y_test, pred_rf)
print(f'RandomForest 분류기 정확도: {acc_rf:.4f}')

# Each base estimator on its own: fit, predict, report accuracy.
classifiers = [lr_clf, knn_clf, dt_clf]
for classifier in classifiers:
  classifier.fit(X_train, y_train)
  pred = classifier.predict(X_test)
  class_name = type(classifier).__name__
  acc_single = accuracy_score(y_test, pred)
  print(f'{class_name} 정확도: {acc_single:.4f}')





In [None]:
# 회귀 

from sklearn.model_selection import train_test_split 

# 레이블 인코딩 

def scoring(data):
    """Convert credit-rating grades in ``data.target`` to numeric scores.

    Grades run from 'AAA' (70) down to 'D' (7) in steps of 3.

    Args:
        data: any object exposing an iterable ``target`` attribute of grade
            strings (for example a DataFrame with a ``target`` column).

    Returns:
        list: one entry per element of ``data.target``, in order. A grade that
        is not in the table is printed and recorded as ``nan``; the original
        skipped it, which made the result shorter than the input and broke
        ``df['score'] = scoring(df)``.

    Note:
        A later cell of this notebook defines another function that is also
        named ``scoring`` and shadows this one once it has been executed.
    """
    grade_to_score = {
        'AAA': 70, 'AA+': 67, 'AA': 64, 'AA-': 61,
        'A+': 58, 'A': 55, 'A-': 52,
        'BBB+': 49, 'BBB': 46, 'BBB-': 43,
        'BB+': 40, 'BB': 37, 'BB-': 34,
        'B+': 31, 'B': 28, 'B-': 25,
        'CCC+': 22, 'CCC': 19, 'CCC-': 16,
        'CC': 13, 'C': 10, 'D': 7,
    }

    fix_credit = []
    for credit in data.target:
        # Non-string entries (e.g. NaN) can never be a known grade.
        score = grade_to_score.get(credit) if isinstance(credit, str) else None
        if score is None:
            # Keep the original diagnostic, but keep the output aligned too.
            print(credit)
            score = float('nan')
        fix_credit.append(score)
    return fix_credit

In [None]:
# Append the numeric score as the last column; it is the regression target.
# NOTE: this relies on the grade -> score `scoring` function; a later cell
# rebinds the name `scoring`, so this cell must run before that one.
df['score'] = scoring(df)
data_feature = df.iloc[:, 2:-1]
data_label = df.iloc[:, -1]

# The first N_TRAIN rows are used for training/validation, the rest for testing.
N_TRAIN = 99

# Feature scaling.
# Fit the scaler on the training rows only so that the min/max of the hold-out
# test rows do not leak into the transform (the original fitted on every row).
from sklearn.preprocessing import MinMaxScaler 

scaler = MinMaxScaler()
scaler.fit(data_feature.iloc[:N_TRAIN, :])
data_feature_scaled = scaler.transform(data_feature)
data_feature = pd.DataFrame(data_feature_scaled, columns=data_feature.columns)

X_train = data_feature.iloc[:N_TRAIN, :]
X_test = data_feature.iloc[N_TRAIN:, :]
y_train = data_label.iloc[:N_TRAIN]
y_test = data_label.iloc[N_TRAIN:]

# Carve a 10% validation set out of the training rows (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=121)

print(X_train.shape, X_test.shape,  y_train.shape , y_test.shape)
print(X_val.shape, y_val.shape)

In [None]:
pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error , r2_score
import numpy as np

import warnings 
warnings.filterwarnings('ignore')

#lgbm_wrapper로 학습/예측/평가

# Baseline LightGBM regressor: fit, predict, evaluate.
# `fit(early_stopping_rounds=...)` is not accepted by LightGBM 4.x; the
# early_stopping callback provides the same behaviour.
from lightgbm import early_stopping

# NOTE(review): `subsample` presumably has no effect unless `subsample_freq` is
# also set -- confirm against the LightGBM parameter documentation.
lgbm_wrapper = LGBMRegressor(num_leaves=31, max_depth= 100, subsample= 0.4, learning_rate=0.01,   n_estimators=2000)

# Early stopping: stop when the validation metric has not improved for 50 rounds.
evals = [(X_val, y_val)]
lgbm_wrapper.fit(X_train, y_train, eval_metric=['mse'], eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=50)])
preds_lgbm = lgbm_wrapper.predict(X_test)
mse_lgbm = mean_squared_error(y_test, preds_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)

print('LGBMRegressor 결과:')
print('MSE : {0:.3f} , RMSE : {1:.3F}'.format(mse_lgbm , rmse_lgbm))
# The value printed as "Variance score" is r2_score.
print('Variance score : {0:.3f}'.format(r2_score(y_test, preds_lgbm)))











In [None]:
# Korean feature names -> short English abbreviations.
# Two of the keys (and their replacements) intentionally keep a trailing space,
# exactly as in the source column names.
feature_name_map = {
    '자기자본순이익률(ROE)': 'ROE',
    '매출액경상이익률': 'CPMS',
    '이자보상배율': 'ICM',
    '매출액영업이익률': 'SOPR',
    '잉여현금흐름/총차입금(%)': 'SCF/TB(%)',
    '매출액증가율 ': 'SGR ',
    '영업현금흐름/총부채(%)': 'OCF/TL(%)',
    '자기자본증가율': 'RIEC',
    '영업이익증가율': 'OPGR',
    '고정장기적합률': 'FLTFR',
    '순금융비용부담률 ': 'NFCBR ',
    '총자산증가율': 'TAGR',
    '부채비율': 'DR(%)',
    '매입채무 회전기일(DPO)': 'DPO',
    '매출채권 회전기일(DSO)': 'DSO',
    '금융비용부담률': 'FCBR(%)',
    '차입금의존도': 'BD',
    '재고자산 회전기일(DIO)': 'DIO',
}

# Scaled feature frame with English column names.
data_feature = data_feature.rename(columns=feature_name_map)

# Features plus the 'score' column (which keeps its name), used for plotting.
df1 = df.drop(columns=['company', 'target']).rename(columns=feature_name_map)

In [None]:
#칼럼별 score 영향도 조사 

import seaborn as sns

# One panel per feature on a 3-row x 6-column grid (18 axes in total).
fig, axs = plt.subplots(figsize=(24,12) , ncols=6 , nrows=3)
lm_features = [
    'DR(%)', 'BD', 'FLTFR', 'ROE', 'SOPR', 'ICM',
    'FCBR(%)', 'NFCBR ', 'SGR ', 'CPMS', 'TAGR', 'OPGR',
    'RIEC', 'DSO', 'DIO', 'DPO', 'SCF/TB(%)', 'OCF/TL(%)',
]
for i, feature in enumerate(lm_features):
    # Fill the grid row by row, six panels per row.
    row, col = divmod(i, 6)
    # seaborn regplot: scatter plot plus a fitted linear regression line.
    sns.regplot(x=feature, y='score', data=df1, ax=axs[row][col])

In [None]:
# 베이지안 

from hyperopt import hp

# Hyperopt search space for the LightGBM regressor.
# quniform draws are rounded to the given step (last argument); uniform draws are continuous.
lgbm_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 100, 1),
    num_leaves=hp.quniform('num_leaves', 30, 100, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    n_estimators=hp.quniform('n_estimators', 2000, 20000, 1000),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    reg_lambda=hp.uniform('reg_lambda', 0.5, 0.99),
)



In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error , r2_score 
from hyperopt import STATUS_OK

def objective_func(search_space):
  """Hyperopt objective: validation RMSE of a LightGBM regressor.

  Args:
    search_space: dict of sampled hyper-parameters with the keys
      'n_estimators', 'max_depth', 'num_leaves', 'learning_rate',
      'colsample_bytree' and 'reg_lambda'. The first three are cast to int.

  Returns:
    dict with 'loss' (RMSE on the validation split) and 'status' (STATUS_OK).

  Reads the notebook globals X_train, y_train, X_val and y_val.
  """
  # fit(early_stopping_rounds=...) is not accepted by LightGBM 4.x; the
  # callback provides the same behaviour.
  from lightgbm import early_stopping

  lgbm_re = LGBMRegressor(n_estimators=int(search_space['n_estimators']),
                          max_depth=int(search_space['max_depth']),
                          num_leaves=int(search_space['num_leaves']),
                          learning_rate=search_space['learning_rate'],
                          colsample_bytree=search_space['colsample_bytree'],
                          reg_lambda=search_space['reg_lambda'],
                          )
  evals = [(X_val, y_val)]
  lgbm_re.fit(X_train, y_train, eval_metric=['mse'], eval_set=evals,
              callbacks=[early_stopping(stopping_rounds=30)])

  # Score on the validation split. The original scored on X_test/y_test, which
  # tunes the hyper-parameters on the hold-out set and makes the final test
  # metric optimistic.
  preds_lgbm = lgbm_re.predict(X_val)
  mse_lgbm = mean_squared_error(y_val, preds_lgbm)
  rmse_lgbm = np.sqrt(mse_lgbm)

  return {'loss': rmse_lgbm, 'status': STATUS_OK}  
    
    

In [None]:
from hyperopt import fmin, tpe, Trials

# Search with TPE for at most 50 evaluations; per-trial results are recorded in `trial_val`.
trial_val = Trials()
best = fmin(
    objective_func,
    lgbm_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
)
print('best:', best)

In [None]:
# Show the best hyper-parameters: floats rounded to 5 decimals, integer-valued ones cast to int.
best_colsample = round(best['colsample_bytree'], 5)
best_lr = round(best['learning_rate'], 5)
best_depth = int(best['max_depth'])
best_estimators = int(best['n_estimators'])
best_leaves = int(best['num_leaves'])
best_lambda = round(best['reg_lambda'], 5)
print(
    f'colsample_bytree:{best_colsample}, learning_rate:{best_lr}, '
    f'max_depth:{best_depth}, n_estimators:{best_estimators}, '
    f'num_leaves:{best_leaves}, reg_lambda:{best_lambda}'
)

In [None]:

from sklearn.base import clone
from sklearn.model_selection import KFold
import numpy as np
from lightgbm import early_stopping

# Cross-validate on the training partition only (first 99 rows). The rows from
# 99 onwards are the hold-out test set; the original split the whole of
# `data_feature`, so the test rows were used for fitting.
y_train = data_label.iloc[:99]
X_train = data_feature.iloc[:99, :]

# Template estimator built from the best hyper-parameters.
# The original passed best['num_leaves'] as n_estimators and never set
# num_leaves; each tuned value now goes to its own parameter.
lgbm_wrapper = LGBMRegressor(colsample_bytree=round(best['colsample_bytree'], 5),
                             learning_rate=round(best['learning_rate'],5),
                             max_depth = int(best['max_depth']),
                             num_leaves = int(best['num_leaves']),
                             n_estimators = int(best['n_estimators']),
                             reg_lambda = round(best['reg_lambda'], 5)
                             )

# 5-fold cross-validation.
kfold = KFold(n_splits=5)
lr_mse_lgbm = []
lr_rmse_lgbm = []
lr_Variance_score=[]
lgb_models = []

n_iter = 0

for train_index, val_index in kfold.split(X_train):
  # kfold.split() yields positional indices for the fold's train/validation rows.
  print('-'*58)
  print(f'Fold:{n_iter}')

  # Fit on plain arrays, as the original did.
  X_fold_train, X_fold_val = X_train.values[train_index], X_train.values[val_index]
  # Positional selection (.iloc) so this does not depend on the index labels.
  y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

  # Fresh, unfitted copy per fold. The original refitted and appended the same
  # object, so every entry of lgb_models was the last fold's model.
  fold_model = clone(lgbm_wrapper)
  evals = [(X_fold_val, y_fold_val)]
  # fit(early_stopping_rounds=...) is not accepted by LightGBM 4.x; use the callback.
  fold_model.fit(X_fold_train, y_fold_train, eval_metric=['mse'], eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=50)])

  lgb_models.append(fold_model)
  preds_lgbm = fold_model.predict(X_fold_val)
  n_iter += 1

  mse_lgbm = mean_squared_error(y_fold_val, preds_lgbm)
  rmse_lgbm = np.sqrt(mse_lgbm)
  fold_r2 = r2_score(y_fold_val, preds_lgbm)

  lr_mse_lgbm.append(mse_lgbm)
  lr_rmse_lgbm.append(rmse_lgbm)
  lr_Variance_score.append(fold_r2)

  print('LGBMRegressor 결과:')
  print('MSE : {0:.3f} , RMSE : {1:.3F}'.format(mse_lgbm , rmse_lgbm))
  print('Variance score : {0:.3f}'.format(fold_r2))

print('\n## 평균 rmse 정확도:', np.mean(lr_rmse_lgbm))
print('\n## 평균 r2_score 정확도 :', np.mean(lr_Variance_score))


  

In [None]:
# Take the entry stored at index 4 of lgb_models (the fifth fold) as the final model.
lgbm_wrapper= lgb_models[4]

# Evaluate on the hold-out rows; `.score` is presumably R^2 (scikit-learn
# regressor convention) -- confirm for LGBMRegressor.
lgbm_wrapper.score(X_test, y_test)

In [None]:
# Predict the score for the hold-out rows and display the raw predictions.
pred = lgbm_wrapper.predict(X_test)
pred

In [None]:
# Identification and target columns for the rows from position 99 onwards
# (the same positional cut that is used for X_test / y_test).
df_test = df.iloc[99:][['company','target','score']]
df_test

In [None]:
# Side-by-side table of the true score and the predicted score for the hold-out rows.
con = pd.DataFrame({'score': y_test.values, 'pred': pred})
con

In [None]:
# Attach the predictions to df_test as a new column; `pred` is assigned
# positionally, so it must have one value per row of df_test.
df_test['pred_score']=pred
df_test

In [None]:
# 회귀 

from sklearn.model_selection import train_test_split 

# 레이블 인코딩 

def scoring_back(data):
    """Convert numeric scores in ``data.pred_score`` back to rating grades.

    Each grade covers the scores from its lower bound up to (but excluding) the
    next grade's lower bound: 'AAA' is >= 70, 'AA+' is >= 67, ... 'D' is >= 7.

    Args:
        data: any object exposing an iterable ``pred_score`` attribute of
            numbers (for example a DataFrame with a ``pred_score`` column).

    Returns:
        list: one entry per element of ``data.pred_score``, in order. A value
        below 7 (or NaN) is printed and recorded as ``None``; the original
        skipped it, which made the result shorter than the input.
    """
    # (lower bound, grade), highest first. The original tested `credit == 28`
    # for grade 'B', so every score strictly between 28 and 31 fell through to
    # 'B-'; the table uses `>=` for every grade.
    grade_floors = (
        (70, 'AAA'), (67, 'AA+'), (64, 'AA'), (61, 'AA-'),
        (58, 'A+'), (55, 'A'), (52, 'A-'),
        (49, 'BBB+'), (46, 'BBB'), (43, 'BBB-'),
        (40, 'BB+'), (37, 'BB'), (34, 'BB-'),
        (31, 'B+'), (28, 'B'), (25, 'B-'),
        (22, 'CCC+'), (19, 'CCC'), (16, 'CCC-'),
        (13, 'CC'), (10, 'C'), (7, 'D'),
    )

    fix_credit = []
    for credit in data.pred_score:
        for floor, grade in grade_floors:
            if credit >= floor:
                fix_credit.append(grade)
                break
        else:
            # No grade matched: keep the diagnostic and keep the output aligned.
            print(credit)
            fix_credit.append(None)
    return fix_credit


In [None]:
# Rename the four columns positionally; the last one (previously 'pred_score')
# becomes 'financial_pred_score'.
df_test.columns = ['company', 'target', 'score', 'financial_pred_score']
df_test
# Disabled export code kept for reference. Note that scoring_back reads a
# `pred_score` attribute, which no longer exists after the rename above.
#df_test['pred_rank'] = scoring_back(df_test)
#df_test[['target', 'pred_rank']].to_excel('rank_predrank.xlsx')
#df_test.to_csv('model_predict_result.csv', encoding='cp949')

In [None]:
# Renumber the rows from 0 and drop the 'target' column.
df_test = df_test.reset_index(drop=True).drop(columns=['target'])



In [None]:
# Rename the three remaining columns positionally.
df_test.columns = ['company', 'financial_score', 'financial_model_pred']

# Plain assignment: df_test_financial refers to the same DataFrame object as df_test (no copy).
df_test_financial = df_test

# Company names in row order; used by a later cell to select and order the
# rows of the non-financial results.
com_name_list = df_test_financial.company.values.tolist()

In [None]:
# Load the results of the non-financial model (cp949-encoded CSV) and name its four columns.
df_test_non = pd.read_csv('비재무모델결과.csv', encoding='cp949')
df_test_non.columns = ['company', 'target', 'non_financial_score', 'non_financial_score_pred']
#df_test_non= df_test_non.iloc[99:]
#df_test_non.index = range(0, len(df_test_non))


#df_result

# Strip the "(주)" marker and any remaining parentheses from the company names.
# regex=False makes every pattern a literal on all pandas versions: where
# str.replace defaults to regex=True, '(주)' is a capture group that removes
# every '주' character, including those inside the company name itself.
df_test_non.company = (
    df_test_non.company
    .str.replace('(주)', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Index both frames by company, keeping only the test companies in test order.
# `.loc` raises KeyError if a name in com_name_list is missing from the CSV.
df_test_non = df_test_non.set_index('company')
df_test_non = df_test_non.loc[com_name_list]
df_test_financial = df_test_financial.set_index('company')
df_test_financial = df_test_financial.set_index('company')



In [None]:
# Join the non-financial and financial results column-wise (axis=1); rows are aligned on the index.
df_result = pd.concat([df_test_non, df_test_financial], axis=1)

In [None]:
# Combined score = financial part + non-financial part, for the predictions and for the actual scores.
df_result['pred_score'] = df_result['financial_model_pred'] +  df_result['non_financial_score_pred']
df_result['real_score'] = df_result['financial_score'] + df_result['non_financial_score']
df_result

In [None]:
def scoring(score2):
    """Convert combined numeric scores to rating grades.

    The 22 grades from 'AAA' down to 'D' use the lower bounds listed in
    ``grade_floors``; a score receives the first grade whose bound it reaches.

    Args:
        score2: iterable of numbers.

    Returns:
        list: one entry per input score, in order. A negative (or NaN) score is
        printed and recorded as ``None``. The original printed the previous
        grade instead of the offending score (raising UnboundLocalError when
        the very first score was negative) and skipped the entry, which made
        the result shorter than the input.

    Note:
        This definition reuses the name ``scoring`` of the grade -> score
        function defined earlier in the notebook and replaces it from this
        cell onwards.
    """
    # (lower bound, grade), highest first.
    grade_floors = (
        (95.45, 'AAA'), (90.91, 'AA+'), (86.36, 'AA'), (81.82, 'AA-'),
        (77.27, 'A+'), (72.73, 'A'), (68.18, 'A-'),
        (63.64, 'BBB+'), (59.09, 'BBB'), (54.55, 'BBB-'),
        (50.00, 'BB+'), (45.45, 'BB'), (40.91, 'BB-'),
        (36.36, 'B+'), (31.82, 'B'), (27.27, 'B-'),
        (22.73, 'CCC+'), (18.18, 'CCC'), (13.64, 'CCC-'),
        (9.09, 'CC'), (4.55, 'C'), (0, 'D'),
    )

    fix_credit = []
    for score in score2:
        for floor, credit in grade_floors:
            if score >= floor:
                fix_credit.append(credit)
                break
        else:
            # No grade matched: report the score itself and keep the output aligned.
            print(score)
            fix_credit.append(None)
    return fix_credit

In [None]:
# Grade implied by the combined predicted score. This needs the score -> grade
# `scoring` defined in the preceding cell, not the earlier grade -> score one.
df_result['pred_target'] = scoring(df_result['pred_score'])

In [None]:
# Rename in place. Going by the column names assigned in the earlier cells, only
# 'pred_score' -> 'sum_pred_score' matches an existing column; the 'score' and
# 'prediction' keys look like leftovers -- TODO confirm and remove.
df_result.rename(columns={'score':'non_financial_score','prediction':'non_financial_score_pred','pred_score':'sum_pred_score'},inplace=True)

In [None]:
# Select the report columns in their final order.
df_result = df_result[['non_financial_score', 'non_financial_score_pred',
       'financial_score', 'financial_model_pred', 'sum_pred_score',
       'real_score', 'pred_target', 'target']]

In [None]:
# test_set에 대한 Loss값

from sklearn.metrics import mean_squared_error, mean_squared_log_error

# RMSE and MSLE of the combined predicted score against the combined actual score.
# NOTE(review): mean_squared_log_error presumably rejects negative values --
# confirm the predictions cannot go below zero.
actual_total = df_result['real_score']
predicted_total = df_result['sum_pred_score']
Test_loss = np.sqrt(mean_squared_error(actual_total, predicted_total))
msle = mean_squared_log_error(actual_total, predicted_total)
print(f'RMSE:{Test_loss:.3f}, MSLE:{msle:.3f}')

In [None]:
# Display the final result table; the CSV export below is disabled.
df_result
#df_result.to_csv('result.csv',index=False, encoding='cp949')



In [None]:
# Write the final result table (including the company index) to an Excel file.
df_result.to_excel('result1.xlsx')

In [None]:
# Display the final result table again (same frame as shown two cells above).
df_result