In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

# Metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report,accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import roc_curve, auc

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier 

# Model Tuning 
from bayes_opt import BayesianOptimization

# Feature Importance 
import shap

# Ignore Warnings 
import warnings
warnings.filterwarnings('ignore')

In [3]:
#!pip install lightgbm
#!pip3 install bayesian-optimization
#!pip install shap

# Import data

In [8]:
# Load the raw dataset. A forward slash is used as the separator: the previous
# 'DataSet\data.csv' relied on the invalid escape sequence "\d" and only
# resolved on Windows, while "/" works on Windows, Linux and macOS alike.
data_csv = pd.read_csv('DataSet/data.csv')

In [10]:
# Short aliases for the columns: the first column becomes "Y" and every
# following column becomes "X1", "X2", ... in file order.
column_names = ["Y", *(f"X{position}" for position in range(1, len(data_csv.columns)))]

# Lookup table from each alias back to the header it replaces. It is built
# before the headers are overwritten on the next statement.
column_names_df = pd.DataFrame(
    {
        "Var_name": column_names,
        "Description": data_csv.columns,
    }
)

data_csv.columns = column_names
data_csv.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y       6819 non-null   int64  
 1   X1      6819 non-null   float64
 2   X2      6819 non-null   float64
 3   X3      6819 non-null   float64
 4   X4      6819 non-null   float64
 5   X5      6819 non-null   float64
 6   X6      6819 non-null   float64
 7   X7      6819 non-null   float64
 8   X8      6819 non-null   float64
 9   X9      6819 non-null   float64
 10  X10     6819 non-null   float64
 11  X11     6819 non-null   float64
 12  X12     6819 non-null   float64
 13  X13     6819 non-null   float64
 14  X14     6819 non-null   float64
 15  X15     6819 non-null   float64
 16  X16     6819 non-null   float64
 17  X17     6819 non-null   float64
 18  X18     6819 non-null   float64
 19  X19     6819 non-null   float64
 20  X20     6819 non-null   float64
 21  X21     6819 non-null   float64
 22  

In [11]:
# Display the alias -> original-header lookup table built when the columns
# were renamed (rendered through the DataFrame's Styler).
column_names_df.style

Unnamed: 0,Var_name,Description
0,Y,Bankrupt?
1,X1,ROA(C) before interest and depreciation before interest
2,X2,ROA(A) before interest and % after tax
3,X3,ROA(B) before interest and depreciation after tax
4,X4,Operating Gross Margin
5,X5,Realized Sales Gross Margin
6,X6,Operating Profit Rate
7,X7,Pre-tax net Interest Rate
8,X8,After-tax net Interest Rate
9,X9,Non-industry income and expenditure/revenue


# Preprocessing Data

In [14]:
# Integer-typed columns are the candidates for flags / constant features, so
# print the frequency of every distinct value for each of them.
int_features = data_csv.select_dtypes(include="int64").columns
for int_column in int_features:
    value_frequencies = data_csv[int_column].value_counts()
    print(value_frequencies)
    print("\n")



0    6599
1     220
Name: Y, dtype: int64


0    6811
1       8
Name: X85, dtype: int64


1    6819
Name: X94, dtype: int64




- select_dtypes(include="int64") -> data_csv에서 datatype이 int64인 feature를 선택한다.
- value_counts -> 각 값 별로 count해서 보여준다.

In [15]:
# X94 holds a single constant value, so it carries no information for the model.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
data_csv = data_csv.drop(columns=["X94"], errors="ignore")

- X94 항목은 모든 행(6819개)에서 값이 1로 동일하여 예측에 사용할 정보가 없으므로(useless) Drop 시킨다.

# Train-Test Split

In [16]:
# Target vector: the "Y" column.
y = data_csv["Y"]
# Feature matrix: every column except the target.
X = data_csv.drop(columns=["Y"])

In [17]:
# Hold out 20% of the rows as the test set. The split is shuffled with a fixed
# seed for reproducibility and stratified on y so both parts keep the same
# class proportions.
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [18]:
# Sanity check: shapes of the train/test features followed by the train/test targets.
print(*(part.shape for part in (X_train_all, X_test, y_train_all, y_test)))

(5455, 94) (1364, 94) (5455,) (1364,)


In [20]:
import matplotlib.pyplot as plt
import itertools
import numpy as np

def plot_confusion_matrix_custom(cm,target_names,title='Confusion Matrix',cmap=None,normalize=True):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix of raw counts. Rows are drawn on the "True Label"
        axis and columns on the "Predicted label" axis.
    target_names : sequence of str or None
        Tick labels for both axes; pass None to keep the default numeric ticks.
    title : str
        Figure title.
    cmap : matplotlib colormap or None
        Colormap for the heat map; defaults to 'Blues' when None.
    normalize : bool
        When True each row is divided by its row total and the cells are
        annotated with 4-decimal rates; when False the raw values are shown.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Accept lists as well as arrays; the original called ``cm.astype`` and so
    # failed for a plain list of lists.
    counts = np.asarray(cm)

    # Accuracy is always computed from the raw counts, never the normalised rates.
    total = counts.sum()
    accuracy = np.trace(counts) / float(total) if total else float("nan")
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Row-wise rates. Rows with no samples stay at 0 instead of becoming NaN.
        row_totals = counts.sum(axis=1, keepdims=True)
        shown = np.divide(
            counts,
            row_totals,
            out=np.zeros(counts.shape, dtype=float),
            where=row_totals != 0,
        )
    else:
        shown = counts

    plt.figure(figsize=(8,6))
    # Draw the same matrix that is annotated below, so that the white/black
    # text threshold is compared against the values that colour the cells.
    plt.imshow(shown,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold are dark enough to need white text.
    thresh = shown.max() / 1.5 if normalize else shown.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"

    for i,j in itertools.product(range(shown.shape[0]), range(shown.shape[1])):
        plt.text(j,i,cell_format.format(shown[i,j]),
                horizontalalignment="center",
                color="white" if shown[i,j] > thresh else "black")

    plt.ylabel('True Label')
    # The accuracy field previously used '{:0,4f}', an invalid format spec that
    # raised ValueError on every call; '{:0.4f}' is what was intended.
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so that they are taken into account.
    plt.tight_layout()
    plt.show()
    
    
def binarizeArray(array, threshold = 0.5):
    """Convert an iterable of scores into a list of 0/1 labels.

    A score strictly below ``threshold`` maps to 0; every other score maps
    to 1. The result is always a new Python list.
    """
    labels = []
    for score in array:
        is_below = score < threshold
        labels.append(0 if is_below else 1)
    return labels
    

# Model

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Number of cross-validation folds handed to GridSearchCV via its `cv` argument.
cv=5

In [23]:
# Metric that GridSearchCV uses (via its `scoring` argument) to rank candidates.
# NOTE(review): the target is heavily imbalanced (6599 vs 220 rows), so accuracy
# can look high for a model that never predicts the positive class.
# NOTE(review): scikit-learn's scorer string for log loss appears to be
# 'neg_log_loss' rather than 'log_loss' - confirm before switching.
scoring ='accuracy' # alternatives considered: f1, accuracy, log_loss

In [None]:

# Inverse regularisation strengths to search: 20 values log-spaced from 1e-4 to 1e5.
C=np.logspace(-4,5,20)
# `solver` was used below without being defined in any earlier cell, so this
# cell raised NameError on a fresh kernel. Both solvers listed here support
# LogisticRegression's default L2 penalty; extend the list to search more.
solver = ["liblinear", "lbfgs"]
param_grid =[{"C":C,"solver":solver}]

# Exhaustive search over C x solver with `cv`-fold cross-validation, ranked by
# `scoring`; train scores are kept so over-fitting can be inspected afterwards.
gs_lr = GridSearchCV(LogisticRegression(),param_grid=param_grid,cv=cv, scoring=scoring,return_train_score=True)