In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import csv
import optuna
import numpy as np

In [2]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])

import random

documents = documents[:77] + random.sample(documents[-142:-15], 62) + documents[-15:]

In [3]:
# 전문가 평점 3점 이상인 상위 175개는 label에 1, 나머지 하위는 label에 0 값 부여
# 분류 예측용 label
labels = []
for i in range(154):
    if i < 77:
        labels.append(1)
    else:
        labels.append(0)

In [4]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [5]:
# Tagged Corpus 생성.
# 모델 학습에 필요한 형태로 변환.
my_tagged_corpus = [TaggedDocument(words=my_words, tags=[i]) for i, my_words in enumerate(documents_corpus)]

In [9]:
def objective(trial):
    # 하이퍼파라미터 샘플링
    vector_size = trial.suggest_int('vector_size', 20, 100)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 5)
    epochs = trial.suggest_int('epochs', 10, 50)

    # Doc2Vec 모델 생성 및 학습
    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# 레이블별로 코사인 유사도 평가
    label_0_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 0]
    label_1_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 1]

    # 레이블 0 문서들 간의 유사도 평균
    similarity_scores_0 = []
    for i in range(len(label_0_vectors)):
        for j in range(i + 1, len(label_0_vectors)):
            similarity_scores_0.append(cosine_similarity([label_0_vectors[i]], [label_0_vectors[j]])[0][0])
    mean_similarity_0 = np.mean(similarity_scores_0) if similarity_scores_0 else 0

    # 레이블 1 문서들 간의 유사도 평균
    similarity_scores_1 = []
    for i in range(len(label_1_vectors)):
        for j in range(i + 1, len(label_1_vectors)):
            similarity_scores_1.append(cosine_similarity([label_1_vectors[i]], [label_1_vectors[j]])[0][0])
    mean_similarity_1 = np.mean(similarity_scores_1) if similarity_scores_1 else 0

    # 두 유사도의 평균을 최종 점수로 사용
    final_similarity_score = (mean_similarity_0 + mean_similarity_1) / 2
    
    return final_similarity_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(f'Best parameters: {study.best_params}')
print(f'Best similarity score: {study.best_value}')

[I 2024-06-27 19:40:41,071] A new study created in memory with name: no-name-824f3068-c2ab-4bf7-85f1-f192ce57c464
[I 2024-06-27 19:40:44,557] Trial 0 finished with value: 0.3502827286720276 and parameters: {'vector_size': 74, 'window': 10, 'min_count': 5, 'epochs': 24}. Best is trial 0 with value: 0.3502827286720276.
[I 2024-06-27 19:40:48,391] Trial 1 finished with value: 0.3355942964553833 and parameters: {'vector_size': 21, 'window': 10, 'min_count': 2, 'epochs': 29}. Best is trial 0 with value: 0.3502827286720276.
[I 2024-06-27 19:40:52,941] Trial 2 finished with value: 0.20533595979213715 and parameters: {'vector_size': 57, 'window': 9, 'min_count': 5, 'epochs': 43}. Best is trial 0 with value: 0.3502827286720276.
[I 2024-06-27 19:40:57,511] Trial 3 finished with value: 0.2333109974861145 and parameters: {'vector_size': 25, 'window': 9, 'min_count': 4, 'epochs': 44}. Best is trial 0 with value: 0.3502827286720276.
[I 2024-06-27 19:41:01,658] Trial 4 finished with value: 0.24389478

Best parameters: {'vector_size': 99, 'window': 3, 'min_count': 1, 'epochs': 10}
Best similarity score: 0.9915133118629456


In [30]:
best_params = {'vector_size': 99, 'window': 3, 'min_count': 1, 'epochs': 10}

best_vector_size = best_params['vector_size']
best_window = best_params['window']
best_min_count = best_params['min_count']
best_epochs = best_params['epochs']

best_model = Doc2Vec(vector_size=best_vector_size, window=best_window, min_count=best_min_count, epochs=best_epochs)
best_model.build_vocab(my_tagged_corpus)
best_model.train(my_tagged_corpus, total_examples=best_model.corpus_count, epochs=best_model.epochs)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(documents_corpus, labels, test_size=0.3, random_state=42)

In [32]:
from sklearn.metrics import f1_score
# 문서 벡터 생성
train_vectors = [best_model.infer_vector(doc) for doc in X_train]
test_vectors = [best_model.infer_vector(doc) for doc in X_test]

def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', ['LogisticRegression', 'SVC', 'RandomForest', 'XGBoost', 'LightGBM'])
    
    if classifier_name == 'LogisticRegression':
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'])
        classifier_obj = LogisticRegression(C=C, solver=solver, max_iter=1000, random_state=42)
    elif classifier_name == 'SVC':
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        classifier_obj = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)
    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 1000)
        max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        classifier_obj = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=42)
    elif classifier_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
        subsample = trial.suggest_float('subsample', 0.5, 1.0)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        classifier_obj = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, subsample=subsample, colsample_bytree=colsample_bytree, random_state=42)
    elif classifier_name == 'LightGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
        num_leaves = trial.suggest_int('num_leaves', 20, 3000, log=True)
        min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
        classifier_obj = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, num_leaves=num_leaves, min_child_samples=min_child_samples, random_state=42)

    classifier_obj.fit(train_vectors, y_train)
    y_pred = classifier_obj.predict(test_vectors)
    score = f1_score(y_test, y_pred)
    
    return score

# Optuna로 최적의 하이퍼파라미터 찾기
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2000, n_jobs=-1)

# 최적의 하이퍼파라미터 출력
best_trial = study.best_trial
print(f'Best trial: {best_trial.value}')
print(f'Best params: {best_trial.params}')

# 최적의 하이퍼파라미터를 사용하여 최종 모델 학습
classifier_name = best_trial.params['classifier']

if classifier_name == 'LogisticRegression':
    best_model = LogisticRegression(C=best_trial.params['C'], solver=best_trial.params['solver'], max_iter=1000, random_state=42)
elif classifier_name == 'SVC':
    best_model = SVC(C=best_trial.params['C'], kernel=best_trial.params['kernel'], gamma=best_trial.params['gamma'], random_state=42)
elif classifier_name == 'RandomForest':
    best_model = RandomForestClassifier(n_estimators=best_trial.params['n_estimators'], max_depth=best_trial.params['max_depth'], min_samples_split=best_trial.params['min_samples_split'], min_samples_leaf=best_trial.params['min_samples_leaf'], random_state=42)
elif classifier_name == 'XGBoost':
    best_model = XGBClassifier(n_estimators=best_trial.params['n_estimators'], learning_rate=best_trial.params['learning_rate'], max_depth=best_trial.params['max_depth'], subsample=best_trial.params['subsample'], colsample_bytree=best_trial.params['colsample_bytree'], random_state=42)
elif classifier_name == 'LightGBM':
    best_model = LGBMClassifier(n_estimators=best_trial.params['n_estimators'], learning_rate=best_trial.params['learning_rate'], max_depth=best_trial.params['max_depth'], num_leaves=best_trial.params['num_leaves'], min_child_samples=best_trial.params['min_child_samples'], random_state=42)

best_model.fit(train_vectors, y_train)
y_pred = best_model.predict(test_vectors)
score = f1_score(y_test, y_pred)

print(f'Validation f1_score with the best model: {score}')

[I 2024-06-27 20:07:22,069] A new study created in memory with name: no-name-9fd4a4cd-b9af-4ed7-b63a-4c9ebc82bfd0
[I 2024-06-27 20:07:22,100] Trial 1 finished with value: 0.6376811594202898 and parameters: {'classifier': 'SVC', 'C': 0.005958728140017298, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1 with value: 0.6376811594202898.
[I 2024-06-27 20:07:22,332] Trial 5 finished with value: 0.46153846153846156 and parameters: {'classifier': 'SVC', 'C': 2.1616717576024977, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 1 with value: 0.6376811594202898.
[I 2024-06-27 20:07:22,335] Trial 4 finished with value: 0.6376811594202898 and parameters: {'classifier': 'LogisticRegression', 'C': 0.002941945513987425, 'solver': 'liblinear'}. Best is trial 1 with value: 0.6376811594202898.
[I 2024-06-27 20:07:22,449] Trial 3 finished with value: 0.6222222222222222 and parameters: {'classifier': 'LogisticRegression', 'C': 0.41462203166233025, 'solver': 'sag'}. Best is trial 1 with value: 0.637

Best trial: 0.8
Best params: {'classifier': 'XGBoost', 'n_estimators': 894, 'learning_rate': 0.29345153265292323, 'max_depth': 8, 'subsample': 0.6860346003798784, 'colsample_bytree': 0.8902776358145003}
Validation f1_score with the best model: 0.8


In [33]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])

# 사용하지 않은 2점짜리 자소서 리스트
documents = documents[-175:-77]

In [34]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [36]:
best_params = {'vector_size': 99, 'window': 3, 'min_count': 1, 'epochs': 10}

best_vector_size = best_params['vector_size']
best_window = best_params['window']
best_min_count = best_params['min_count']
best_epochs = best_params['epochs']

best_d2vmodel = Doc2Vec(vector_size=best_vector_size, window=best_window, min_count=best_min_count, epochs=best_epochs)
best_d2vmodel.build_vocab(my_tagged_corpus)
best_d2vmodel.train(my_tagged_corpus, total_examples=best_d2vmodel.corpus_count, epochs=best_d2vmodel.epochs)
d2v_vec = [best_d2vmodel.infer_vector(doc) for doc in documents_corpus]

In [40]:
print(1-best_model.predict(d2v_vec).mean())

0.7244897959183674


In [39]:
import pickle

with open('d2v_model.pkl', 'wb') as f:
    pickle.dump(best_d2vmodel, f)

with open('clf_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)