In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

#### Path to the input CSV; it is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('./../data/df.csv')

# 다양한 분류를 사용하여 평가한 결과

In [3]:
# Compute, print and return accuracy, precision, recall, F1 and ROC AUC.
def get_clf_eval(y_test, y_pred, pred_proba=None):
    """Print and return the evaluation metrics for one set of predictions.

    Precision, recall and F1 are computed with scikit-learn's default
    arguments, i.e. for a binary target.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels of the evaluation set.
    y_pred : array-like
        Hard class predictions (the output of ``predict``).
    pred_proba : array-like, optional
        Scores for the positive class, e.g. ``clf.predict_proba(X)[:, 1]``.
        When supplied, ROC AUC is computed from these scores. When omitted,
        AUC falls back to ``y_pred`` so existing two-argument calls return
        exactly what they did before.

    Returns
    -------
    list
        ``[accuracy, precision, recall, F1, AUC]`` in that order.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC AUC measures how well scores rank positives above negatives.
    # Feeding it hard 0/1 labels collapses the curve to a single threshold,
    # so prefer real scores whenever the caller provides them.
    auc_scores = y_pred if pred_proba is None else pred_proba
    auc = roc_auc_score(y_test, auc_scores)

    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(f1))
    print('AUC: {:.4f}'.format(auc))

    return [accuracy, precision, recall, f1, auc]

# Feature matrix: the five columns used for the first experiment, as a NumPy array.
train = df.loc[:, ['testimony', 'minutes', 'beige', 'speech1', 'speech2']].to_numpy()
# Target vector, taken from the 'label' column.
label = df.loc[:, 'label'].to_numpy()

In [7]:
# Experiment 1: the five columns whose scores are labelled '비정형' (unstructured)
# in the summary table.
# The feature matrix is rebuilt here instead of being read from an earlier cell:
# later cells rebind `train` to other column sets, so re-running this cell after
# them would otherwise silently score the wrong features.
train = df[['testimony', 'minutes', 'beige', 'speech1', 'speech2']].to_numpy()

# One 80/20 hold-out split, shared by every model below; the seed is fixed so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

dt_clf = DecisionTreeClassifier(random_state=10)
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)
# Soft voting over LR and KNN. VotingClassifier fits clones of its estimators,
# so lr_clf and knn_clf still have to be fitted on their own in the loop.
vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('KNN', knn_clf)], voting='soft')
rf_clf = RandomForestClassifier(random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# Fit and score in the original order so the printed metric blocks keep their
# sequence: tree, voting, logistic, KNN, forest, gradient boosting, XGBoost.
scores = []
for model in (dt_clf, vo_clf, lr_clf, knn_clf, rf_clf, gbrt, xgb_wrapper):
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    scores.append(get_clf_eval(y_test, pred))

# Per-model metric lists, under the names the summary-table cell reads.
dt, vo, lr, knn, rf, gb, xgb = scores


정확도: 0.4000
정밀도: 0.0000
재현율: 0.0000
F1: 0.0000
AUC: 0.2857

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.6000
정밀도: 0.4000
재현율: 0.6667
F1: 0.5000
AUC: 0.6190

정확도: 0.6000
정밀도: 0.3333
재현율: 0.3333
F1: 0.3333
AUC: 0.5238

정확도: 0.5000
정밀도: 0.2500
재현율: 0.3333
F1: 0.2857
AUC: 0.4524

정확도: 0.6000
정밀도: 0.4000
재현율: 0.6667
F1: 0.5000
AUC: 0.6190

정확도: 0.5000
정밀도: 0.2500
재현율: 0.3333
F1: 0.2857
AUC: 0.4524




In [9]:
# Score table for the first experiment: one column per model, one row per metric,
# with an outer index level tagging the feature set.
unstruct = pd.DataFrame(
    {
        'DecisionTree': dt,
        'Voting': vo,
        'Logistic': lr,
        'KNN': knn,
        'RandomForest': rf,
        'GradientBoost': gb,
        'XGB': xgb,
    },
    index=pd.MultiIndex.from_arrays(
        [['비정형'] * 5, ['정확도', '정밀도', '재현율', 'F1', 'AUC']]
    ),
)
unstruct

Unnamed: 0,Unnamed: 1,DecisionTree,Voting,Logistic,KNN,RandomForest,GradientBoost,XGB
비정형,정확도,0.4,0.7,0.6,0.6,0.5,0.6,0.5
비정형,정밀도,0.0,0.5,0.4,0.333333,0.25,0.4,0.25
비정형,재현율,0.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333
비정형,F1,0.0,0.571429,0.5,0.333333,0.285714,0.5,0.285714
비정형,AUC,0.285714,0.690476,0.619048,0.52381,0.452381,0.619048,0.452381


In [10]:
# Experiment 2: the 'inf' and 'uem' columns only; these scores are labelled
# '정형' (structured) in the summary table.
# This rebinds the notebook-level `train`, so any cell that reads `train`
# sees these two columns from here on.
train = df[['inf', 'uem']].to_numpy()

# One 80/20 hold-out split, shared by every model below; the seed is fixed so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

dt_clf = DecisionTreeClassifier(random_state=10)
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)
# Soft voting over LR and KNN. VotingClassifier fits clones of its estimators,
# so lr_clf and knn_clf still have to be fitted on their own in the loop.
vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('KNN', knn_clf)], voting='soft')
rf_clf = RandomForestClassifier(random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# Fit and score in the original order so the printed metric blocks keep their
# sequence: tree, voting, logistic, KNN, forest, gradient boosting, XGBoost.
scores = []
for model in (dt_clf, vo_clf, lr_clf, knn_clf, rf_clf, gbrt, xgb_wrapper):
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    scores.append(get_clf_eval(y_test, pred))

# Per-model metric lists, under the names the summary-table cell reads.
dt, vo, lr, knn, rf, gb, xgb = scores


정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.8000
정밀도: 0.6000
재현율: 1.0000
F1: 0.7500
AUC: 0.8571

정확도: 0.8000
정밀도: 0.6000
재현율: 1.0000
F1: 0.7500
AUC: 0.8571

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.9000
정밀도: 1.0000
재현율: 0.6667
F1: 0.8000
AUC: 0.8333

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.6000
정밀도: 0.4000
재현율: 0.6667
F1: 0.5000
AUC: 0.6190




In [11]:
# Score table for the second experiment: one column per model, one row per metric,
# with an outer index level tagging the feature set.
struct = pd.DataFrame(
    {
        'DecisionTree': dt,
        'Voting': vo,
        'Logistic': lr,
        'KNN': knn,
        'RandomForest': rf,
        'GradientBoost': gb,
        'XGB': xgb,
    },
    index=pd.MultiIndex.from_arrays(
        [['정형'] * 5, ['정확도', '정밀도', '재현율', 'F1', 'AUC']]
    ),
)
struct

Unnamed: 0,Unnamed: 1,DecisionTree,Voting,Logistic,KNN,RandomForest,GradientBoost,XGB
정형,정확도,0.7,0.8,0.8,0.7,0.9,0.7,0.6
정형,정밀도,0.5,0.6,0.6,0.5,1.0,0.5,0.4
정형,재현율,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.666667
정형,F1,0.571429,0.75,0.75,0.571429,0.8,0.571429,0.5
정형,AUC,0.690476,0.857143,0.857143,0.690476,0.833333,0.690476,0.619048


In [12]:
# Experiment 3: all seven feature columns together; these scores are labelled
# '정형 + 비정형' (structured + unstructured) in the summary table.
# This rebinds the notebook-level `train`, so any cell that reads `train`
# sees these seven columns from here on.
train = df[['testimony', 'minutes', 'beige', 'speech1', 'speech2', 'inf', 'uem']].to_numpy()

# One 80/20 hold-out split, shared by every model below; the seed is fixed so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

dt_clf = DecisionTreeClassifier(random_state=10)
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)
# Soft voting over LR and KNN. VotingClassifier fits clones of its estimators,
# so lr_clf and knn_clf still have to be fitted on their own in the loop.
vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('KNN', knn_clf)], voting='soft')
rf_clf = RandomForestClassifier(random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# Fit and score in the original order so the printed metric blocks keep their
# sequence: tree, voting, logistic, KNN, forest, gradient boosting, XGBoost.
scores = []
for model in (dt_clf, vo_clf, lr_clf, knn_clf, rf_clf, gbrt, xgb_wrapper):
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    scores.append(get_clf_eval(y_test, pred))

# Per-model metric lists, under the names the summary-table cell reads.
dt, vo, lr, knn, rf, gb, xgb = scores


정확도: 0.8000
정밀도: 1.0000
재현율: 0.3333
F1: 0.5000
AUC: 0.6667

정확도: 0.8000
정밀도: 0.6000
재현율: 1.0000
F1: 0.7500
AUC: 0.8571

정확도: 0.8000
정밀도: 0.6000
재현율: 1.0000
F1: 0.7500
AUC: 0.8571

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905

정확도: 0.9000
정밀도: 1.0000
재현율: 0.6667
F1: 0.8000
AUC: 0.8333

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905




In [13]:
# Score table for the third experiment: one column per model, one row per metric,
# with an outer index level tagging the feature set.
multistruct = pd.DataFrame(
    {
        'DecisionTree': dt,
        'Voting': vo,
        'Logistic': lr,
        'KNN': knn,
        'RandomForest': rf,
        'GradientBoost': gb,
        'XGB': xgb,
    },
    index=pd.MultiIndex.from_arrays(
        [['정형 + 비정형'] * 5, ['정확도', '정밀도', '재현율', 'F1', 'AUC']]
    ),
)
multistruct

Unnamed: 0,Unnamed: 1,DecisionTree,Voting,Logistic,KNN,RandomForest,GradientBoost,XGB
정형 + 비정형,정확도,0.8,0.8,0.8,0.7,0.7,0.9,0.7
정형 + 비정형,정밀도,1.0,0.6,0.6,0.5,0.5,1.0,0.5
정형 + 비정형,재현율,0.333333,1.0,1.0,0.666667,0.666667,0.666667,0.666667
정형 + 비정형,F1,0.5,0.75,0.75,0.571429,0.571429,0.8,0.571429
정형 + 비정형,AUC,0.666667,0.857143,0.857143,0.690476,0.690476,0.833333,0.690476


In [14]:
# Stack the three per-feature-set score tables row-wise into one comparison table.
result = pd.concat(objs=[unstruct, struct, multistruct], axis=0)
result

Unnamed: 0,Unnamed: 1,DecisionTree,Voting,Logistic,KNN,RandomForest,GradientBoost,XGB
비정형,정확도,0.4,0.7,0.6,0.6,0.5,0.6,0.5
비정형,정밀도,0.0,0.5,0.4,0.333333,0.25,0.4,0.25
비정형,재현율,0.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333
비정형,F1,0.0,0.571429,0.5,0.333333,0.285714,0.5,0.285714
비정형,AUC,0.285714,0.690476,0.619048,0.52381,0.452381,0.619048,0.452381
정형,정확도,0.7,0.8,0.8,0.7,0.9,0.7,0.6
정형,정밀도,0.5,0.6,0.6,0.5,1.0,0.5,0.4
정형,재현율,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.666667
정형,F1,0.571429,0.75,0.75,0.571429,0.8,0.571429,0.5
정형,AUC,0.690476,0.857143,0.857143,0.690476,0.833333,0.690476,0.619048
