In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import csv
import random
import optuna

In [2]:
# Set the floating-point display precision used for cell output.
%precision %.3f

'%.3f'

In [3]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# Load the essays: the first column of every CSV row is taken as one essay text.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded inside quoted fields are parsed as part of the
# field instead of being translated by the text layer.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the encoding the file was written with.
with open(file_path, 'r', newline='') as f:
    # csv.reader yields an empty list for a blank line; indexing row[0] on it
    # would raise IndexError, so such rows are skipped.
    documents = [row[0] for row in csv.reader(f) if row]
        
# documents_5 = documents[:22]
# documents_4 = documents[22:77]
# documents_3 = documents[77:175]
# documents_2 = documents[175:302]
# documents_1 = documents[302:]

# documents = random.sample(documents_5, 15) + random.sample(documents_4, 15) + random.sample(documents_3, 15) + random.sample(documents_2, 15) + documents_1

In [4]:
# Regression targets: one expert score (1-5) per essay, assigned purely by row
# position in the corpus.
#   rows [0, 22)    -> 5
#   rows [22, 77)   -> 4
#   rows [77, 175)  -> 3
#   rows [175, 302) -> 2
#   rows [302, 317) -> 1
# NOTE(review): presumably the CSV rows are ordered by score in exactly these
# counts — confirm that len(labels) matches the number of loaded documents.
labels = [5] * 22 + [4] * 55 + [3] * 98 + [2] * 127 + [1] * 15

In [5]:
# Load the English stop-word list once; a set makes membership tests cheap.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenise ``text`` and drop non-alphabetic tokens and English stop words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that consist only of
        alphabetic characters and are not stop words. Surviving tokens keep
        their original casing.
    """
    tokens = word_tokenize(text)
    # The NLTK stop-word list is lowercase, so the comparison is done on the
    # lowercased token; a case-sensitive test lets capitalised stop words
    # such as "The" or "I" slip through.
    return [word for word in tokens if word.isalpha() and word.lower() not in stop_words]

# Run every essay through preprocess(), keeping the original document order.
documents_corpus = list(map(preprocess, documents))

In [6]:
# Build the tagged corpus in the form Doc2Vec trains on: each token list is
# wrapped in a TaggedDocument whose single tag is the document's position in
# documents_corpus.
my_tagged_corpus = []
for doc_index, doc_tokens in enumerate(documents_corpus):
    my_tagged_corpus.append(TaggedDocument(words=doc_tokens, tags=[doc_index]))

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def _mean_pairwise_cosine(vectors):
    """Return the mean cosine similarity over all unordered pairs of ``vectors``.

    Parameters
    ----------
    vectors : sequence of 1-D array-likes of equal length
        The document vectors of one label group.

    Returns
    -------
    float
        Mean similarity over every pair (i, j) with i < j, or ``0.0`` when
        fewer than two vectors are given (there is no pair to compare).
    """
    n_vectors = len(vectors)
    if n_vectors < 2:
        return 0.0
    # A single call produces the full n x n similarity matrix; the strict
    # upper triangle holds each unordered pair exactly once. This replaces
    # one cosine_similarity call per pair.
    similarity_matrix = cosine_similarity(np.asarray(vectors))
    pair_rows, pair_cols = np.triu_indices(n_vectors, k=1)
    return float(np.mean(similarity_matrix[pair_rows, pair_cols], dtype=np.float64))


def objective(trial):
    """Optuna objective: average within-label cosine similarity of Doc2Vec vectors.

    Samples ``vector_size``, ``window``, ``min_count`` and ``epochs`` from
    ``trial``, trains a Doc2Vec model on ``my_tagged_corpus``, and scores it
    by the mean pairwise cosine similarity of the document vectors inside
    each score label (1-5), averaged over the five labels.

    Parameters
    ----------
    trial : optuna trial object
        Used only through ``trial.suggest_int``.

    Returns
    -------
    float
        The average of the five per-label mean similarities. A label with
        fewer than two documents contributes 0.

    Notes
    -----
    NOTE(review): the score only looks at pairs *within* a label. Nothing
    penalises documents of different labels for being similar too, so a model
    whose vectors are all nearly parallel would score highly — consider
    adding a between-label term.
    """
    # Sample the hyperparameters.
    vector_size = trial.suggest_int('vector_size', 20, 100)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 5)
    epochs = trial.suggest_int('epochs', 10, 50)

    # Build and train the Doc2Vec model.
    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Mean within-group similarity for each expert score label.
    score_labels = (1, 2, 3, 4, 5)
    per_label_means = []
    for score in score_labels:
        # Document i is looked up in model.dv by its position in `labels`.
        group_vectors = [model.dv[i] for i, label in enumerate(labels) if label == score]
        per_label_means.append(_mean_pairwise_cosine(group_vectors))

    # The final score is the plain average over the five labels.
    return sum(per_label_means) / len(score_labels)

# Search for the Doc2Vec hyperparameters that maximise the objective:
# 100 trials, executed in parallel (n_jobs=-1).
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_jobs=-1, n_trials=100)

# Report the winning configuration and its score, one per line.
print(
    f'Best parameters: {study.best_params}',
    f'Best similarity score: {study.best_value}',
    sep='\n',
)

[I 2024-06-27 21:44:57,432] A new study created in memory with name: no-name-6f87696f-86a2-4986-8698-73a6ede4b4f9
[I 2024-06-27 21:45:21,585] Trial 6 finished with value: 0.6756621360778808 and parameters: {'vector_size': 43, 'window': 7, 'min_count': 3, 'epochs': 11}. Best is trial 6 with value: 0.6756621360778808.
[I 2024-06-27 21:45:27,083] Trial 1 finished with value: 0.39259097576141355 and parameters: {'vector_size': 29, 'window': 4, 'min_count': 3, 'epochs': 22}. Best is trial 6 with value: 0.6756621360778808.
[I 2024-06-27 21:45:46,268] Trial 4 finished with value: 0.32061400413513186 and parameters: {'vector_size': 42, 'window': 4, 'min_count': 2, 'epochs': 29}. Best is trial 6 with value: 0.6756621360778808.
[I 2024-06-27 21:45:51,124] Trial 5 finished with value: 0.41011853218078614 and parameters: {'vector_size': 45, 'window': 10, 'min_count': 3, 'epochs': 18}. Best is trial 6 with value: 0.6756621360778808.
[I 2024-06-27 21:45:55,631] Trial 7 finished with value: 0.4002655

In [None]:
import pickle

# Pull the winning hyperparameters out of the finished study.
best_params = study.best_params
best_vector_size, best_window, best_min_count, best_epochs = (
    best_params[param_name]
    for param_name in ('vector_size', 'window', 'min_count', 'epochs')
)

# Train a fresh Doc2Vec model on the tagged corpus with those settings.
best_d2vmodel = Doc2Vec(
    vector_size=best_vector_size,
    window=best_window,
    min_count=best_min_count,
    epochs=best_epochs,
)
best_d2vmodel.build_vocab(my_tagged_corpus)
best_d2vmodel.train(
    my_tagged_corpus,
    total_examples=best_d2vmodel.corpus_count,
    epochs=best_d2vmodel.epochs,
)

# Persist the trained model to disk as a pickle file.
with open('d2v_model_rg.pkl', 'wb') as model_file:
    pickle.dump(best_d2vmodel, model_file)

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score

# # 데이터 준비
# X = [best_d2vmodel.dv[i] for i in range(len(documents_corpus))]
# y = labels  # 레이블이 회귀 타겟 값이라고 가정

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# def objective(trial):
#     # 모델 선택
#     regressor_name = trial.suggest_categorical('regressor', ['LinearRegression', 'SVR', 'RandomForest', 'XGBoost', 'LightGBM'])
    
#     if regressor_name == 'LinearRegression':
#         model = LinearRegression()

#     elif regressor_name == 'SVR':
#         svr_c = trial.suggest_float('svr_c', 1e-3, 1e3, log=True)
#         svr_epsilon = trial.suggest_float('svr_epsilon', 1e-3, 1.0, log=True)
#         svr_kernel = trial.suggest_categorical('svr_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
#         model = SVR(C=svr_c, epsilon=svr_epsilon, kernel=svr_kernel)

#     elif regressor_name == 'RandomForest':
#         rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 500)
#         rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
#         rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
#         rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 4)
#         rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2'])
#         model = RandomForestRegressor(n_estimators=rf_n_estimators, max_depth=rf_max_depth,
#                                       min_samples_split=rf_min_samples_split, min_samples_leaf=rf_min_samples_leaf,
#                                       max_features=rf_max_features, random_state=42)

#     elif regressor_name == 'XGBoost':
#         xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 50, 500)
#         xgb_max_depth = trial.suggest_int('xgb_max_depth', 2, 32)
#         xgb_learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.3, log=True)
#         xgb_subsample = trial.suggest_float('xgb_subsample', 0.5, 1.0)
#         xgb_colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)
#         xgb_gamma = trial.suggest_float('xgb_gamma', 0, 5)
#         model = XGBRegressor(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, learning_rate=xgb_learning_rate,
#                              subsample=xgb_subsample, colsample_bytree=xgb_colsample_bytree, gamma=xgb_gamma, random_state=42)

#     elif regressor_name == 'LightGBM':
#         lgb_n_estimators = trial.suggest_int('lgb_n_estimators', 50, 500)
#         lgb_max_depth = trial.suggest_int('lgb_max_depth', 2, 32)
#         lgb_learning_rate = trial.suggest_float('lgb_learning_rate', 0.01, 0.3, log=True)
#         lgb_num_leaves = trial.suggest_int('lgb_num_leaves', 20, 150)
#         lgb_min_child_samples = trial.suggest_int('lgb_min_child_samples', 5, 100)
#         lgb_subsample = trial.suggest_float('lgb_subsample', 0.5, 1.0)
#         lgb_colsample_bytree = trial.suggest_float('lgb_colsample_bytree', 0.5, 1.0)
#         model = LGBMRegressor(n_estimators=lgb_n_estimators, max_depth=lgb_max_depth, learning_rate=lgb_learning_rate,
#                               num_leaves=lgb_num_leaves, min_child_samples=lgb_min_child_samples,
#                               subsample=lgb_subsample, colsample_bytree=lgb_colsample_bytree, random_state=42)

#     # 교차 검증으로 모델 평가
#     score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
#     r2 = score.mean()
#     return r2

# # Optuna 최적화
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=1000, n_jobs=-1)

# # 최적의 하이퍼파라미터로 모델 학습
# best_params = study.best_params
# regressor_name = best_params.pop('regressor')

# if regressor_name == 'LinearRegression':
#     model = LinearRegression()

# elif regressor_name == 'SVR':
#     model = SVR(C=best_params['svr_c'], epsilon=best_params['svr_epsilon'], kernel=best_params['svr_kernel'])

# elif regressor_name == 'RandomForest':
#     model = RandomForestRegressor(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'],
#                                   min_samples_split=best_params['rf_min_samples_split'], min_samples_leaf=best_params['rf_min_samples_leaf'],
#                                   max_features=best_params['rf_max_features'], random_state=42)

# elif regressor_name == 'XGBoost':
#     model = XGBRegressor(n_estimators=best_params['xgb_n_estimators'], max_depth=best_params['xgb_max_depth'], learning_rate=best_params['xgb_learning_rate'],
#                          subsample=best_params['xgb_subsample'], colsample_bytree=best_params['xgb_colsample_bytree'], gamma=best_params['xgb_gamma'], random_state=42)

# elif regressor_name == 'LightGBM':
#     model = LGBMRegressor(n_estimators=best_params['lgb_n_estimators'], max_depth=best_params['lgb_max_depth'], learning_rate=best_params['lgb_learning_rate'])

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# r2 = r2_score(y_test, y_pred)

# print(f'Best model: {regressor_name}')
# print(f'Best parameters: {best_params}')
# print(f'Test R2: {r2}')

[I 2024-06-27 21:35:23,590] A new study created in memory with name: no-name-ef1d3993-7dea-4084-b5eb-047c39bfe219
[I 2024-06-27 21:35:23,756] Trial 3 finished with value: -0.8069068825379482 and parameters: {'regressor': 'LinearRegression'}. Best is trial 3 with value: -0.8069068825379482.
[I 2024-06-27 21:35:23,772] Trial 2 finished with value: 0.1046299394068239 and parameters: {'regressor': 'SVR', 'svr_c': 45.36127953443427, 'svr_epsilon': 0.43265978720338516, 'svr_kernel': 'linear'}. Best is trial 2 with value: 0.1046299394068239.
[I 2024-06-27 21:35:23,798] Trial 5 finished with value: -0.8069068825379482 and parameters: {'regressor': 'LinearRegression'}. Best is trial 2 with value: 0.1046299394068239.
[I 2024-06-27 21:35:23,855] Trial 8 finished with value: -0.8069068825379482 and parameters: {'regressor': 'LinearRegression'}. Best is trial 2 with value: 0.1046299394068239.
[I 2024-06-27 21:35:25,643] Trial 1 finished with value: -0.0757878448674874 and parameters: {'regressor': 

KeyboardInterrupt: 