In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [2]:
train_df = pd.read_csv('./datasets/train_ss.csv', index_col=0)
test_df = pd.read_csv('./datasets/test_ss.csv', index_col=0)

In [4]:
X_train = train_df[['전국스크린수','전국매출액','개봉일관객수','개봉일매출비율','배우가중치','감독가중치','등급_15','등급_청불','국내배급사','주요배급사']]
y_train = train_df[['Label']]
X_test = test_df[['전국스크린수','전국매출액','개봉일관객수','개봉일매출비율','배우가중치','감독가중치','등급_15','등급_청불','국내배급사','주요배급사']]
y_test = test_df[['Label']]

In [7]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(random_state=0),
        SVC(random_state=0),
        DecisionTreeClassifier(random_state=0),
        RandomForestClassifier(random_state=0),
        XGBClassifier(random_state=0),
        LGBMClassifier(random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [8]:
model_basic(X_train, y_train, X_test, y_test)

[[54 75]
 [56 91]]
[[52 77]
 [57 90]]
[[76 53]
 [83 64]]
[[56 73]
 [59 88]]
[[62 67]
 [62 85]]
[[59 70]
 [64 83]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(random_state=0),0.5289,0.5262,0.5254,0.5482,0.619,0.5815,0.5188
1,SVC(random_state=0),0.6022,0.5988,0.5145,0.5389,0.6122,0.5732,0.5077
2,DecisionTreeClassifier(random_state=0),1.0,1.0,0.5072,0.547,0.4354,0.4848,0.5123
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.5217,0.5466,0.5986,0.5714,0.5164
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.5326,0.5592,0.5782,0.5686,0.5294
5,LGBMClassifier(random_state=0),0.9922,0.9922,0.5145,0.5425,0.5646,0.5533,0.511


In [340]:
def eval(test,pred):
    acc = accuracy_score(test,pred)
    f1 = f1_score(test,pred)
    precision = precision_score(test,pred)
    recall = recall_score(test,pred)
    print('##############\n',confusion_matrix(test,pred),
    "\n############\n",f'acc_score: {acc}\n f1_score: {f1} \n precision: {precision} \n recall: {recall}')

## 임계값 =[] 리스트생성 후 for 구문
def get_eval_by_threshold(y_test , pred_proba_c1, thresholds=[0.4,0.41,0.42,0.43,0.44,0.45,0.]):
    # thresholds list객체내의 값을 차례로 iteration하면서 Evaluation 수행.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1) 
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:',custom_threshold)
        eval(y_test , custom_predict)


def precision_recall_curve_plot(y_test , pred_proba_c1):
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()
    

## roc curve_plot    
def roc_curve_plot(y_test,pred_proba):
    fprs, tprs, thresholds = roc_curve(y_test,pred_proba) ## 입력시 1레이블 컬럼만 추출
    
    # Roc curve를 plot 곡선으로 기름
    plt.plot(fprs,tprs,label='ROC')
    ## 가운데 대각선 직선을 그림
    plt.plot([0,1],[0,1],"k--",label="ramdom")
    
    ## fpr x축을 scale을 0.1단위로 변경 x,y축 명 설정
    start,end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.1),2))
    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.xlabel('FPR(1-sensitivity)')
    plt.ylabel('TPR(recall)')
    plt.legend()
    plt.show()

---
# 하이퍼 파라미터 튜닝
## 1. LogisticRegression

In [50]:
### LogisticRegression

from sklearn.model_selection import GridSearchCV
param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2', 'elasticnet']
}

estimator = LogisticRegression(n_jobs=-1)

In [51]:
# define grid_search
lr_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           n_jobs=-1,
                           cv=5, 
                           verbose=0
                          )

# fit with (x_train, y_train)
lr_grid_search.fit(X_train, y_train)

In [52]:
# 최적 하이퍼 파라미터 조합
lr_grid_search.best_params_

{'C': 0.001, 'penalty': 'l2'}

## 2. SVC

In [53]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.001, 0.01, 0.1, 1, 10]
}

estimator = SVC()

In [54]:
# define grid_search
svc_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           n_jobs=-1,
                           cv=5,
                           verbose=0
                          )

# fit with (x_train, y_train)
svc_grid_search.fit(X_train, y_train)

In [55]:
# 최적 하이퍼 파라미터 조합
svc_grid_search.best_params_

{'C': 10, 'kernel': 'rbf'}

## 3. DecisionTreeClassifier

In [56]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'max_depth':[2,3,4,5,6,7,8],
        'min_samples_leaf':[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
}

estimator = DecisionTreeClassifier()

In [57]:
# define grid_search
dt_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           cv=5,
                           verbose=0
                          )

# fit with (x_train, y_train)
dt_grid_search.fit(X_train, y_train)

In [58]:
# 최적 하이퍼 파라미터 조합
dt_grid_search.best_params_

{'max_depth': 3, 'min_samples_leaf': 9}

## 4. RandomForestClassifier

In [59]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'criterion': ['entropy', 'gini'],
        'max_depth': [6,7,8],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [4,5],
        'min_samples_split': [7,8],
        'n_estimators': [20]
        }

estimator = RandomForestClassifier(n_jobs=-1)

In [60]:
# define grid_search
rf_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid,
                           n_jobs=-1, 
                           cv=5,
                           verbose=0
                          )

# fit with (x_train, y_train)
rf_grid_search.fit(X_train, y_train)

In [61]:
# 최적 하이퍼 파라미터 조합
rf_grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 7,
 'n_estimators': 20}

## 5. XGBClassifier

In [62]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'n_estimators':[40,43,46,50],
        'learning_rate':[0.005,0.008,0.01,0.03],
        'max_depth' : [1,2,3,4]
}

estimator = XGBClassifier(n_jobs=-1)

In [63]:
# define grid_search
xgb_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           cv=5,
                           verbose=0
                          )

# fit with (x_train, y_train)
xgb_grid_search.fit(X_train, y_train)

In [64]:
# 최적 하이퍼 파라미터 조합
xgb_grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50}

## 6. LGBMClassifier

In [65]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'n_estimators':[40,43,46,50],
        'learning_rate':[0.005,0.008,0.01,0.03],
        'max_depth' : [1,2,3,4]
}

estimator = LGBMClassifier(n_jobs=-1)

In [66]:
# define grid_search
lgbm_grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           cv=5,
                           verbose=0
                          )

# fit with (x_train, y_train)
lgbm_grid_search.fit(X_train, y_train)

In [67]:
# 최적 하이퍼 파라미터 조합
print(lr_grid_search.best_params_)
print(svc_grid_search.best_params_)
print(dt_grid_search.best_params_)
print(rf_grid_search.best_params_)
print(xgb_grid_search.best_params_)
print(lgbm_grid_search.best_params_)

{'C': 0.001, 'penalty': 'l2'}
{'C': 10, 'kernel': 'rbf'}
{'max_depth': 3, 'min_samples_leaf': 9}
{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 7, 'n_estimators': 20}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50}
{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 50}


# 하이퍼 파라미터 끝
---

In [68]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(C = 0.001, penalty = 'l2', random_state=0),
        SVC(C = 10, kernel = 'rbf', random_state=0),
        DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 9, random_state=0),
        RandomForestClassifier(random_state=0, criterion = 'entropy', max_depth = 8, max_features = 'auto', min_samples_leaf = 4,
                                min_samples_split = 7, n_estimators = 20),
        XGBClassifier(learning_rate = 0.01, max_depth = 2, n_estimators = 50, random_state=0),
        LGBMClassifier(learning_rate = 0.01, max_depth = 4, n_estimators = 50, random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [70]:
model_basic(X_train, y_train, X_test, y_test)

[[  1 128]
 [  2 145]]
[[66 63]
 [63 84]]
[[108  21]
 [127  20]]
[[54 75]
 [59 88]]
[[ 26 103]
 [ 37 110]]
[[ 38  91]
 [ 42 105]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,"LogisticRegression(C=0.001, random_state=0)",0.5211,0.502,0.529,0.5311,0.9864,0.6905,0.4971
1,"SVC(C=10, random_state=0)",0.6459,0.6449,0.5435,0.5714,0.5714,0.5714,0.5415
2,"DecisionTreeClassifier(max_depth=3, min_sample...",0.5569,0.5696,0.4638,0.4878,0.1361,0.2128,0.4866
3,"(DecisionTreeClassifier(criterion='entropy', m...",0.8222,0.8191,0.5145,0.5399,0.5986,0.5677,0.5086
4,"XGBClassifier(base_score=None, booster=None, c...",0.5881,0.5776,0.4928,0.5164,0.7483,0.6111,0.4749
5,"LGBMClassifier(learning_rate=0.01, max_depth=4...",0.6568,0.6498,0.5181,0.5357,0.7143,0.6122,0.5044


## 최적모델
* LGBM