In [1]:
import os
from glob import glob
import warnings
import random
warnings.filterwarnings('ignore')

import import_ipynb
import abuse_detecting_preprocessing as pp
import abuse_detecting_databuild as b

import pandas as pd
import numpy as np

import fasttext

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dense, Dropout, BatchNormalization, GRU
from tensorflow.keras.layers import LeakyReLU, Input, Bidirectional, LSTM, GlobalMaxPooling1D, concatenate, ReLU
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import callbacks
from scikeras.wrappers import KerasClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns


importing Jupyter notebook from abuse_detecting_preprocessing.ipynb
['ㅇㅡㄴㄹㅡㄹㅈㅏㄹㅇㅣ-ㄱㅣㄴㄷㅏ-ㅎㅏ-', 'ㄴㅡㄴㄹㅗㄹㅇㅢ-ㅅㅣㄴㅇㅣ-ㄷㅏ-'] 

['t1ㅇㅡㄴlckㄹㅡㄹㅈㅏㄹㅇㅣ-ㄱㅣㄴㄷㅏ-ㅎㅏ-', 'fakerㄴㅡㄴㄹㅗㄹㅇㅢ-ㅅㅣㄴㅇㅣ-ㄷㅏ-'] 

importing Jupyter notebook from abuse_detecting_databuild.ipynb




In [2]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
SEED = 30
seed_everything(SEED)

In [3]:
def print_score(label, pred):
    print('Precision: {:.5f}'.format(precision_score(label, pred)))
    print('Recall: {:.5f}'.format(recall_score(label, pred)))
    print('F1 Score: {:.5f}'.format(f1_score(label, pred)))
    print('Accuracy : {:.5f}'.format(np.mean(label == pred)))
    print('ROC AUC Score: {:.5f}\n'.format(roc_auc_score(label, pred)))
    

In [4]:

def _1DCNN(dropout_rate, input_shape):
    
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size = 6, input_shape = input_shape))
    model.add(LeakyReLU(alpha=0.2))
    # input_shape = (25, 100)
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout_rate))
    
    model.add(Conv1D(filters=128, kernel_size = 3))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout_rate))
    
    model.add(Conv1D(filters=256, kernel_size = 1))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization())
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(dropout_rate))  
    
    model.add(Dense(128))
    model.add(ReLU())
    model.add(Dense(1, activation ='sigmoid'))
    
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'Adam',
                  metrics = ['accuracy'])
    
    return model


In [5]:

def combined_pooling_lstm_model(dropout_rate, input_shape):
    
    inputs = Input(shape = input_shape)
    
    # Bidirectional LSTM layer
    x = Bidirectional(LSTM(64, return_sequences = True))(inputs)
    x = Bidirectional(LSTM(64, return_sequences = True))(x)
    
    # Average pool과 Max pool 생성
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    
    # 두 pool을 합친다.
    combined_pool = concatenate([avg_pool, max_pool])
    combined_pool = BatchNormalization()(combined_pool)
    
    # Dropout
    combined_pool = Dropout(dropout_rate)(combined_pool)
    
    # Output
    outputs = Dense(128)(combined_pool)
    outputs = ReLU()(outputs)
    
    outputs = Dense(1, activation='sigmoid')(outputs)
    
    model = Model(inputs = inputs, outputs = outputs)
    model.compile(loss = "binary_crossentropy",
                  optimizer = 'Adam',
                  metrics = ['accuracy'])
        
    return model


In [6]:

def GRU_model(dropout_rate, input_shape):
    
    model = Sequential()

    model.add(GRU(128, return_sequences = True, input_shape = input_shape))
    model.add(LeakyReLU(alpha = 0.2))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    
    model.add(GRU(64, return_sequences = True))
    model.add(LeakyReLU(alpha = 0.2))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    
    model.add(GRU(32, return_sequences = False))
    model.add(LeakyReLU(alpha = 0.2))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    
    # output
    model.add(Dense(32))
    model.add(ReLU())
    model.add(Dense(1, activation = 'sigmoid'))
    
    # Model compile
    model.compile(loss = 'binary_crossentropy',
                  optimizer ='Adam',
                  metrics = ['accuracy'])
    
    return model


In [7]:
def build_train(routes, input_,):
    fasttext_model = fasttext.load_model('fasttext_model.bin')
    
    df = pd.DataFrame()
    for r in routes:
        d = pd.read_csv(r)
        df = pd.concat([df,d], ignore_index = True)

    N = 5
    # df = df[['문장','악플/욕설']]
    # df = df.rename(columns = {'문장' : 'message', '악플/욕설' : 'label'})
    x_train = b.making_x_train(df['message'], input_, fasttext_model, N)
    y_train = np.array(df['label'])
    
    y_train = y_train.astype(np.float32)
    y_train = y_train.reshape(-1,1)
    
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, stratify = y_train, random_state = SEED,
                                                       train_size = 0.8, test_size = 0.2)
    print('x_train :', x_train.shape, 'y_train :', y_train.shape)
    print('x_test :', x_test.shape, 'y_test :', y_test.shape)

    return x_train, y_train, x_test, y_test

In [8]:
def train(model, train_set, k, epochs):
    x_train, y_train = train_set
    # train
    kf = KFold(n_splits = k, shuffle=True)

    for idx, (train_index, val_index) in enumerate(kf.split(x_train)):
        print(f'\n--{idx + 1}/{k} fold--\n')
        
        x_train_fold, x_val_fold = x_train[train_index], x_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        
        model.fit(x_train_fold, y_train_fold, epochs = epochs,
                  validation_data = (x_val_fold, y_val_fold), callbacks = [early_stopping])
        
    print(f'\n--{model}--\n')
    y_pred = np.round(model.predict(x_test))
    print_score(y_pred, y_test)
    
    return model

In [9]:
def my_GridSearch(model, train_set, param_grid, cv, input_shape, verbose = 0, n_jobs = 5):
    train, y = train_set

    classifier = KerasClassifier(build_fn = model, input_shape = input_shape, dropout_rate = 0.0)
    # GridSearchCV 모델로 초기화
    grid_model = GridSearchCV(estimator = classifier,
                           param_grid = param_grid,
                           scoring = 'roc_auc',
                           cv = cv)
    
    # 모델 fitting
    grid_model.fit(train, y)
    
    # 최적의 하이퍼파라미터 및 점수 출력
    print("Best parameters found: ", grid_model.best_params_)
    print("Best ROC AUC score: ", grid_model.best_score_)
    
    # 결과값 저장
    results = pd.DataFrame(grid_model.cv_results_)
    results = results.sort_values(by='mean_test_score', ascending=False)
    
    return results


In [10]:
def ensemble(models, test_set):
    x_test, y_test = test_set
    pred = np.column_stack([m.predict(x_test) for m in models])
    ensembled_pred = np.round(np.mean(pred, axis = 1))

    for p in pred.transpose():
        print_score(y_test, np.round(p))
    print('--ensembeld_model--')
    print_score(y_test, ensembled_pred)
    

In [27]:
K = 5
_input = 40 # 단어 길이
input_shape = (_input, 100) # fasttext 모델이 100개의 벡터로 반환함.
dropout_rate = 0.0
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 5) 

data_routes = glob('chatting_datas/labeled_datas/*.csv')
# data_routes = glob('/Users/kim-yongjun/Documents/chatting_data/smile_gate_abuse_data/*')
x_train, y_train, x_test, y_test = build_train(data_routes, _input)
train_set = (x_train, y_train)
test_set = (x_test, y_test)



x_train : (3352, 40, 100) y_train : (3352, 1)
x_test : (839, 40, 100) y_test : (839, 1)


In [28]:
cv = 5 # grid_search의 cv

param_grid = {'epochs': [6, 8, 10],
              'dropout_rate': [0.2, 0.3, 0.4]
             }
# 총 90 + 120 + 150 = 360회 수행.

In [None]:
_1DCNN_grid = my_GridSearch(_1DCNN, train_set, param_grid, cv, input_shape)


In [30]:
_1DCNN_grid[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
3,1,"{'dropout_rate': 0.3, 'epochs': 6}",0.80582
6,2,"{'dropout_rate': 0.4, 'epochs': 6}",0.80168
8,3,"{'dropout_rate': 0.4, 'epochs': 10}",0.799178
7,4,"{'dropout_rate': 0.4, 'epochs': 8}",0.797957
4,5,"{'dropout_rate': 0.3, 'epochs': 8}",0.787698
0,6,"{'dropout_rate': 0.2, 'epochs': 6}",0.785576
1,7,"{'dropout_rate': 0.2, 'epochs': 8}",0.778409
2,8,"{'dropout_rate': 0.2, 'epochs': 10}",0.751344
5,9,"{'dropout_rate': 0.3, 'epochs': 10}",0.743542


In [None]:
GRU_grid = my_GridSearch(GRU_model, train_set, param_grid, cv, input_shape)


In [32]:
GRU_grid[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
5,1,"{'dropout_rate': 0.3, 'epochs': 10}",0.835426
4,2,"{'dropout_rate': 0.3, 'epochs': 8}",0.810124
8,3,"{'dropout_rate': 0.4, 'epochs': 10}",0.807032
7,4,"{'dropout_rate': 0.4, 'epochs': 8}",0.80173
3,5,"{'dropout_rate': 0.3, 'epochs': 6}",0.786926
0,6,"{'dropout_rate': 0.2, 'epochs': 6}",0.767133
2,7,"{'dropout_rate': 0.2, 'epochs': 10}",0.76708
1,8,"{'dropout_rate': 0.2, 'epochs': 8}",0.766626
6,9,"{'dropout_rate': 0.4, 'epochs': 6}",0.724909


In [None]:
lstm_grid = my_GridSearch(combined_pooling_lstm_model, train_set, param_grid, cv, input_shape)


In [34]:
# lstm_grid[['rank_test_score', 'params']]
lstm_grid[['rank_test_score', 'params', 'mean_test_score']].reset_index(drop = True)

Unnamed: 0,rank_test_score,params,mean_test_score
0,1,"{'dropout_rate': 0.4, 'epochs': 6}",0.826137
1,2,"{'dropout_rate': 0.2, 'epochs': 10}",0.817821
2,3,"{'dropout_rate': 0.3, 'epochs': 10}",0.817462
3,4,"{'dropout_rate': 0.3, 'epochs': 6}",0.812698
4,5,"{'dropout_rate': 0.4, 'epochs': 10}",0.807726
5,6,"{'dropout_rate': 0.3, 'epochs': 8}",0.807505
6,7,"{'dropout_rate': 0.2, 'epochs': 6}",0.80259
7,8,"{'dropout_rate': 0.4, 'epochs': 8}",0.797467
8,9,"{'dropout_rate': 0.2, 'epochs': 8}",0.793414


In [35]:
_1DCNN_best = _1DCNN_grid[lstm_grid['rank_test_score'] == 1]['params'].item()
GRU_best = GRU_grid[lstm_grid['rank_test_score'] == 1]['params'].item()
BiLSTM_best = lstm_grid[lstm_grid['rank_test_score'] == 1]['params'].item()

_1DCNN_model = _1DCNN(_1DCNN_best['dropout_rate'], input_shape)
_GRU_model =  GRU_model(GRU_best['dropout_rate'], input_shape)
_BiLSTM_model = combined_pooling_lstm_model(BiLSTM_best['dropout_rate'], input_shape)


In [36]:
_1DCNN_model = train(_1DCNN_model, train_set, K, _1DCNN_best['epochs'])
_GRU_model = train(_GRU_model, train_set, K, GRU_best['epochs'])
_BiLSTM_model = train(_BiLSTM_model, train_set, K, BiLSTM_best['epochs'])


--1/5 fold--

Epoch 1/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7230 - loss: 0.5377 - val_accuracy: 0.8584 - val_loss: 0.3765
Epoch 2/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8719 - loss: 0.3334 - val_accuracy: 0.8689 - val_loss: 0.4673
Epoch 3/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8925 - loss: 0.2943 - val_accuracy: 0.8793 - val_loss: 0.5030
Epoch 4/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9059 - loss: 0.2672 - val_accuracy: 0.8763 - val_loss: 0.5383
Epoch 5/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9070 - loss: 0.2409 - val_accuracy: 0.8838 - val_loss: 0.4348
Epoch 6/6
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9248 - loss: 0.2027 - val_accuracy: 0.8808 - val_loss: 0.5658

--2/5 fold--

Epoch 1/6
[1m84

In [38]:
models = [_1DCNN_model, _GRU_model, _BiLSTM_model]
pr = ensemble(models, test_set)
 

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Precision: 0.89409
Recall: 1.00000
F1 Score: 0.94408
Accuracy : 0.84180
ROC AUC Score: 0.61947

Precision: 0.92468
Recall: 0.98072
F1 Score: 0.95187
Accuracy : 0.80523
ROC AUC Score: 0.73372

Precision: 0.93127
Recall: 0.95179
F1 Score: 0.94142
Accuracy : 0.78084
ROC AUC Score: 0.75023

--ensembeld_model--
Precision: 0.91487
Recall: 0.99174
F1 Score: 0.95175
Accuracy : 0.82003
ROC AUC Score: 0.69941

