In [1]:
import numpy as np
import pandas as pd
                                                            from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm.notebook import tqdm
import os
import re
import pickle
from konlpy.tag import Mecab

In [2]:

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^가-힣0-9]', ' ', text)
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    # 한국어 불용어 리스트 - 년, 또, 그런, 좀, 잘, 개, 아니, 씨, 안, 다시, 못하, 문제, 사람, 때문
    stopwords = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '주', 
        '등', '같', '우리', '때', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하', 
        '그것', '두', '말하', '알', '그러나', '받', '일', '더', '사회', 
        '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '시키', '만들', '지금', '생각하', 
        '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '어떤', '내', '경우',
        '명', '생각', '시간', '그녀', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자',
        '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '원', '통하', '소리', '놓'
    ]
    
    # 불용어 제거
    text = ' '.join(word for word in text.split() if word not in stopwords)
    
    return text

In [3]:
# Define the conversation column names for each file based on the earlier inspection
conversation_columns = {
    "./data/train_sr_cleaned.csv": "conversation_sr_cleaned",
    "./data/train_augmented_wv_.csv": "conversation",
    "./data/LLaMa2_Augmentation_trian.csv": "conversation",
    "./data/train.csv": "conversation",
    "./data/dominant_tone_data_transformers.csv": "conversation",
    "./data/dominant_tone_data_transformers.csv": "conversation"
}

# Load and preprocess the data from each file, then concatenate them
all_dataframes = []

for file_path, conv_column in conversation_columns.items():
    df_temp = pd.read_csv(file_path)
    df_temp = df_temp[['class', conv_column]]
    df_temp.columns = ['class', 'conversation']  # Renaming columns for uniformity
    df_temp['conversation'] = df_temp['conversation'].apply(clean_text)
    all_dataframes.append(df_temp)

# Concatenate all the dataframes
merged_data = pd.concat(all_dataframes, ignore_index=True)

test = pd.read_json('./data/test.json').transpose()

# train 데이터의 텍스트 열 정규화
merged_data['conversation'] = merged_data['conversation'].apply(clean_text)
test['conversation'] = test['text'].apply(clean_text)

# 지정된 클래스를 숫자로 인코딩
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
}
merged_data['label_encoded'] = merged_data['class'].map(label_dict)

merged_data.drop_duplicates(subset=['conversation'], inplace=True)


# NaN 값을 가진 행 제거
merged_data.dropna(subset=['conversation', 'label_encoded'], inplace=True)

# 빈 문자열 값을 가진 행 제거
merged_data = merged_data[merged_data['conversation'] != ""]

merged_data.reset_index(drop=True, inplace=True)


In [4]:
# Mecab 토큰화
mecab = Mecab()
tokenizer = lambda text: mecab.morphs(text)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=6000)
all_data_tfidf = tfidf_vectorizer.fit_transform(merged_data['conversation'])

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

models_params = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100,200,250],
            'max_depth': [None, 1, 2],
            'min_samples_split': [2, 3, 4]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(solver='liblinear', max_iter=1500),
        'params': {
            'C': [50, 100, 150],
            'penalty': ['l1', 'l2']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [6, 7, 8]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100, 500],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 4, 8, 9, 12],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    }
}


In [6]:
from sklearn.metrics import f1_score

results = {}

X = all_data_tfidf  
y = merged_data['label_encoded'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
test_tfidf = tfidf_vectorizer.transform(test['conversation'])
answer_df = pd.read_csv("./data/answer.csv")

total_models = len(models_params)
for idx, (model_name, mp) in enumerate(models_params.items(), 1):
    print(f"\nProcessing Model {idx}/{total_models}: {model_name}")
    
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, verbose=2, scoring='f1_macro')
    clf.fit(X_train, y_train)
    
    best_model = clf.best_estimator_
    y_pred = best_model.predict(X_val)
    
    val_accuracy = accuracy_score(y_val, y_pred)
    val_f1_score = f1_score(y_val, y_pred, average='macro')  # Calculate F1 score
    
    class_report = classification_report(y_val, y_pred)
    
    print(f"\nModel: {model_name} with {clf.best_params_}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation F1 Score: {val_f1_score:.4f}")  # Print F1 Score
    
    results[(model_name, 'validation', str(clf.best_params_))] = {
        'accuracy': val_accuracy,
        'f1_score': val_f1_score,
        'report': class_report,
        'best_params': clf.best_params_
    }
    
    # test.csv에 대한 예측 수행
    y_pred_test = best_model.predict(test_tfidf)
    
    # 예측 결과와 answer.csv의 정답을 비교하여 정확도 계산
    test_accuracy = accuracy_score(answer_df['class'], y_pred_test)
    
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    results[(model_name, 'test', str(clf.best_params_))] = {
        'accuracy': test_accuracy,
        'best_params': clf.best_params_
    }
    
    # Save the model if accuracy is above threshold
    if test_accuracy >= 0.85:
        params_str = "_".join([f"{k}={v}" for k, v in clf.best_params_.items()])
        model_save_path = f'{model_name}_{params_str}_{test_accuracy}.pkl'
        with open(model_save_path, 'wb') as f:
            pickle.dump(best_model, f)



Processing Model 1/5: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=  17.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=  17.0s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=  16.9s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=  17.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=  16.8s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  34.0s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  34.0s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  33.9s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  34.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  33.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=250; total ti

[CV] END .max_depth=2, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END .max_depth=2, min_samples_split=2, n_estimators=250; total time=   1.4s
[CV] END .max_depth=2, min_samples_split=2, n_estimators=250; total time=   1.4s
[CV] END .max_depth=2, min_samples_split=2, n_estimators=250; total time=   1.4s
[CV] END .max_depth=2, min_samples_split=2, n_estimators=250; total time=   1.4s
[CV] END .max_depth=2, min_samples_split=2, n_estimators=250; total time=   1.3s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=100; total time=   0.6s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=100; total time=   0.6s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=100; total time=   0.6s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=100; total time=   0.6s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=100; total time=   0.6s
[CV] END .max_depth=2, min_samples_split=3, n_estimators=200; total time=   1.1s
[CV] END .max_depth=2, min_s

[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=150; total time= 8.1min
[CV] END ...learning_rate=0.01, max_depth=8, n_estimators=50; total time= 3.2min
[CV] END ...learning_rate=0.01, max_depth=8, n_estimators=50; total time= 3.1min
[CV] END ...learning_rate=0.01, max_depth=8, n_estimators=50; total time= 3.1min
[CV] END ...learning_rate=0.01, max_depth=8, n_estimators=50; total time= 3.1min
[CV] END ...learning_rate=0.01, max_depth=8, n_estimators=50; total time= 3.1min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=100; total time= 6.3min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=100; total time= 6.2min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=100; total time= 6.3min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=100; total time= 6.4min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=100; total time= 6.3min
[CV] END ..learning_rate=0.01, max_depth=8, n_estimators=150; total time= 9.6min
[CV] END ..learning_rate=0.0

[CV] END ...learning_rate=0.1, max_depth=8, n_estimators=150; total time= 8.3min
[CV] END ...learning_rate=0.1, max_depth=8, n_estimators=150; total time= 8.3min
[CV] END ...learning_rate=0.1, max_depth=8, n_estimators=150; total time= 8.3min
[CV] END ...learning_rate=0.1, max_depth=8, n_estimators=150; total time= 8.3min

Model: GradientBoosting with {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 150}
Validation Accuracy: 0.9164
Validation F1 Score: 0.9169
Test Accuracy: 0.8400

Processing Model 4/5: SVM
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time= 3.8min
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time= 3.8min
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time= 3.8min
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time= 3.8min
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time= 3.8min
[CV] END .....

[CV] END ...................C=100, gamma=auto, kernel=linear; total time=  59.0s
[CV] END ......................C=100, gamma=auto, kernel=rbf; total time= 3.2min
[CV] END ......................C=100, gamma=auto, kernel=rbf; total time= 3.2min
[CV] END ......................C=100, gamma=auto, kernel=rbf; total time= 3.2min
[CV] END ......................C=100, gamma=auto, kernel=rbf; total time= 3.2min
[CV] END ......................C=100, gamma=auto, kernel=rbf; total time= 3.2min
[CV] END ..................C=500, gamma=scale, kernel=linear; total time=  58.9s
[CV] END ..................C=500, gamma=scale, kernel=linear; total time=  58.4s
[CV] END ..................C=500, gamma=scale, kernel=linear; total time=  58.4s
[CV] END ..................C=500, gamma=scale, kernel=linear; total time=  58.3s
[CV] END ..................C=500, gamma=scale, kernel=linear; total time=  58.1s
[CV] END .....................C=500, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END ...................

[CV] END ...metric=manhattan, n_neighbors=8, weights=uniform; total time=  19.0s
[CV] END ...metric=manhattan, n_neighbors=8, weights=uniform; total time=  19.6s
[CV] END ..metric=manhattan, n_neighbors=8, weights=distance; total time=  19.3s
[CV] END ..metric=manhattan, n_neighbors=8, weights=distance; total time=  19.4s
[CV] END ..metric=manhattan, n_neighbors=8, weights=distance; total time=  19.7s
[CV] END ..metric=manhattan, n_neighbors=8, weights=distance; total time=  19.3s
[CV] END ..metric=manhattan, n_neighbors=8, weights=distance; total time=  19.5s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=  19.6s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=  19.6s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=  19.8s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=  19.7s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=  19.7s
[CV] END ..metric=manhattan,

results = {}

X = all_data_tfidf  
y = merged_data['label_encoded'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
test_tfidf = tfidf_vectorizer.transform(test['conversation'])
answer_df = pd.read_csv("./data/answer.csv")

total_models = len(models_params)
for idx, (model_name, mp) in enumerate(models_params.items(), 1):
    print(f"\nProcessing Model {idx}/{total_models}: {model_name}")
    
    # scoring parameter to 'f1_macro'
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, verbose=2, scoring='f1_macro')
    clf.fit(X_train, y_train)
    
    best_model = clf.best_estimator_
    y_pred = best_model.predict(X_val)
    
    val_accuracy = accuracy_score(y_val, y_pred)
    class_report = classification_report(y_val, y_pred)
    
    print(f"\nModel: {model_name} with {clf.best_params_}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    results[(model_name, 'validation', str(clf.best_params_))] = {
        'accuracy': val_accuracy,
        'report': class_report,
        'best_params': clf.best_params_
    }
    
    # test.csv에 대한 예측 수행
    y_pred_test = best_model.predict(test_tfidf)
    
    # 예측 결과와 answer.csv의 정답을 비교하여 정확도 계산
    test_accuracy = accuracy_score(answer_df['class'], y_pred_test)
    
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    results[(model_name, 'test', str(clf.best_params_))] = {
        'accuracy': test_accuracy,
        'best_params': clf.best_params_
    }
    
    # Save the model if accuracy is above threshold
    if test_accuracy >= 0.85:
        params_str = "_".join([f"{k}={v}" for k, v in clf.best_params_.items()])
        model_save_path = f'{model_name}_{params_str}.pkl'
        with open(model_save_path, 'wb') as f:
            pickle.dump(best_model, f)
