In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import csv
import random
import optuna

In [2]:
# 셀에서 출력 정확도 설정.
%precision %.3f

'%.3f'

In [3]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])
        
documents_5 = documents[:22]
documents_4 = documents[22:77]
documents_3 = documents[77:175]
documents_2 = documents[175:302]
documents_1 = documents[302:]

documents = documents_5 + documents_4 + documents_2 + documents_1

In [13]:
# 전문가 평점 1~5점 각 15개씩 랜덤으로 가져와서 점수 라벨링
# 회귀 예측용 label
labels = []
for i in range(219):
    if i < 22:
        labels.append(5)
    elif i < 77:
        labels.append(4)
    elif i < 204:
        labels.append(2)        
    else:
        labels.append(1)

In [5]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [6]:
# Tagged Corpus 생성.
# 모델 학습에 필요한 형태로 변환.
my_tagged_corpus = [TaggedDocument(words=my_words, tags=[i]) for i, my_words in enumerate(documents_corpus)]

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def objective(trial):
    # 하이퍼파라미터 샘플링
    vector_size = trial.suggest_int('vector_size', 20, 300)
    window = trial.suggest_int('window', 1, 15)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 2, 15)

    # Doc2Vec 모델 생성 및 학습
    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, negative=negative)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # 레이블별로 코사인 유사도 평가
    label_1_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 1]
    label_2_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 2]
    label_4_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 4]
    label_5_vectors = [model.dv[i] for i, label in enumerate(labels) if label == 5]

    # 레이블 1 문서들 간의 유사도 평균
    similarity_scores_1 = []
    for i in range(len(label_1_vectors)):
        for j in range(i + 1, len(label_1_vectors)):
            similarity_scores_1.append(cosine_similarity([label_1_vectors[i]], [label_1_vectors[j]])[0][0])
    mean_similarity_1 = np.mean(similarity_scores_1) if similarity_scores_1 else 0

    # 레이블 2 문서들 간의 유사도 평균
    similarity_scores_2 = []
    for i in range(len(label_2_vectors)):
        for j in range(i + 1, len(label_2_vectors)):
            similarity_scores_2.append(cosine_similarity([label_2_vectors[i]], [label_2_vectors[j]])[0][0])
    mean_similarity_2 = np.mean(similarity_scores_2) if similarity_scores_2 else 0

    # 레이블 4 문서들 간의 유사도 평균
    similarity_scores_4 = []
    for i in range(len(label_4_vectors)):
        for j in range(i + 1, len(label_4_vectors)):
            similarity_scores_4.append(cosine_similarity([label_4_vectors[i]], [label_4_vectors[j]])[0][0])
    mean_similarity_4 = np.mean(similarity_scores_4) if similarity_scores_4 else 0

    # 레이블 5 문서들 간의 유사도 평균
    similarity_scores_5 = []
    for i in range(len(label_5_vectors)):
        for j in range(i + 1, len(label_5_vectors)):
            similarity_scores_5.append(cosine_similarity([label_5_vectors[i]], [label_5_vectors[j]])[0][0])
    mean_similarity_5 = np.mean(similarity_scores_5) if similarity_scores_5 else 0

    # 두 유사도의 평균을 최종 점수로 사용
    final_similarity_score = (mean_similarity_1 + mean_similarity_2 + mean_similarity_4 + mean_similarity_5) / 4
    
    return final_similarity_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

print(f'Best parameters: {study.best_params}')
print(f'Best similarity score: {study.best_value}')

[I 2024-06-28 15:12:25,763] A new study created in memory with name: no-name-66296168-326c-43f5-8780-9b0bd7de71cc
[I 2024-06-28 15:12:43,891] Trial 2 finished with value: 0.6140442490577698 and parameters: {'vector_size': 143, 'window': 8, 'min_count': 4, 'epochs': 15, 'negative': 9}. Best is trial 2 with value: 0.6140442490577698.
[I 2024-06-28 15:12:44,268] Trial 1 finished with value: 0.8056887984275818 and parameters: {'vector_size': 68, 'window': 13, 'min_count': 4, 'epochs': 10, 'negative': 11}. Best is trial 1 with value: 0.8056887984275818.
[I 2024-06-28 15:13:03,704] Trial 5 finished with value: 0.3907237648963928 and parameters: {'vector_size': 287, 'window': 5, 'min_count': 9, 'epochs': 20, 'negative': 9}. Best is trial 1 with value: 0.8056887984275818.
[I 2024-06-28 15:13:08,198] Trial 0 finished with value: 0.35524892807006836 and parameters: {'vector_size': 29, 'window': 13, 'min_count': 4, 'epochs': 25, 'negative': 9}. Best is trial 1 with value: 0.8056887984275818.
[I 2

Best parameters: {'vector_size': 143, 'window': 2, 'min_count': 1, 'epochs': 10, 'negative': 3}
Best similarity score: 0.982001543045044


In [16]:
best_params = study.best_params

best_vector_size = best_params['vector_size']
best_window = best_params['window']
best_min_count = best_params['min_count']
best_epochs = best_params['epochs']
best_negative = best_params['negative']

best_d2vmodel = Doc2Vec(vector_size=best_vector_size, window=best_window, min_count=best_min_count, epochs=best_epochs, negative=best_negative)
best_d2vmodel.build_vocab(my_tagged_corpus)
best_d2vmodel.train(my_tagged_corpus, total_examples=best_d2vmodel.corpus_count, epochs=best_d2vmodel.epochs)

import pickle

with open('d2v_model_rg.pkl', 'wb') as f:
    pickle.dump(best_d2vmodel, f)

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 데이터 준비
X = [best_d2vmodel.dv[i] for i in range(len(documents_corpus))]
y = labels  # 레이블이 회귀 타겟 값이라고 가정

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def objective(trial):
    # 모델 선택
    regressor_name = trial.suggest_categorical('regressor', ['LinearRegression', 'SVR', 'RandomForest', 'XGBoost', 'LightGBM'])
    
    if regressor_name == 'LinearRegression':
        model = LinearRegression()

    elif regressor_name == 'SVR':
        svr_c = trial.suggest_float('svr_c', 1e-3, 1e3, log=True)
        svr_epsilon = trial.suggest_float('svr_epsilon', 1e-3, 1.0, log=True)
        svr_kernel = trial.suggest_categorical('svr_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        model = SVR(C=svr_c, epsilon=svr_epsilon, kernel=svr_kernel)

    elif regressor_name == 'RandomForest':
        rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 1000)
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
        rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 4)
        rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2'])
        model = RandomForestRegressor(n_estimators=rf_n_estimators, max_depth=rf_max_depth,
                                      min_samples_split=rf_min_samples_split, min_samples_leaf=rf_min_samples_leaf,
                                      max_features=rf_max_features, random_state=42)

    elif regressor_name == 'XGBoost':
        xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 50, 1000)
        xgb_max_depth = trial.suggest_int('xgb_max_depth', 2, 32)
        xgb_learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.3, log=True)
        xgb_subsample = trial.suggest_float('xgb_subsample', 0.5, 1.0)
        xgb_colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)
        xgb_gamma = trial.suggest_float('xgb_gamma', 0, 5)
        model = XGBRegressor(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, learning_rate=xgb_learning_rate,
                             subsample=xgb_subsample, colsample_bytree=xgb_colsample_bytree, gamma=xgb_gamma, random_state=42)

    elif regressor_name == 'LightGBM':
        lgb_n_estimators = trial.suggest_int('lgb_n_estimators', 50, 1000)
        lgb_max_depth = trial.suggest_int('lgb_max_depth', 2, 32)
        lgb_learning_rate = trial.suggest_float('lgb_learning_rate', 0.01, 0.3, log=True)
        lgb_num_leaves = trial.suggest_int('lgb_num_leaves', 20, 200)
        lgb_min_child_samples = trial.suggest_int('lgb_min_child_samples', 5, 100)
        lgb_subsample = trial.suggest_float('lgb_subsample', 0.5, 1.0)
        lgb_colsample_bytree = trial.suggest_float('lgb_colsample_bytree', 0.5, 1.0)
        model = LGBMRegressor(n_estimators=lgb_n_estimators, max_depth=lgb_max_depth, learning_rate=lgb_learning_rate,
                              num_leaves=lgb_num_leaves, min_child_samples=lgb_min_child_samples,
                              subsample=lgb_subsample, colsample_bytree=lgb_colsample_bytree, random_state=42)

    # 교차 검증으로 모델 평가
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

# Optuna 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2000, n_jobs=-1)

# 최적의 하이퍼파라미터로 모델 학습
best_params = study.best_params
regressor_name = best_params.pop('regressor')

if regressor_name == 'LinearRegression':
    model = LinearRegression()

elif regressor_name == 'SVR':
    model = SVR(C=best_params['svr_c'], epsilon=best_params['svr_epsilon'], kernel=best_params['svr_kernel'])

elif regressor_name == 'RandomForest':
    model = RandomForestRegressor(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'],
                                  min_samples_split=best_params['rf_min_samples_split'], min_samples_leaf=best_params['rf_min_samples_leaf'],
                                  max_features=best_params['rf_max_features'], random_state=42)

elif regressor_name == 'XGBoost':
    model = XGBRegressor(n_estimators=best_params['xgb_n_estimators'], max_depth=best_params['xgb_max_depth'], learning_rate=best_params['xgb_learning_rate'],
                         subsample=best_params['xgb_subsample'], colsample_bytree=best_params['xgb_colsample_bytree'], gamma=best_params['xgb_gamma'], random_state=42)

elif regressor_name == 'LightGBM':
    model = LGBMRegressor(n_estimators=best_params['lgb_n_estimators'], max_depth=best_params['lgb_max_depth'], learning_rate=best_params['lgb_learning_rate'])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f'Best model: {regressor_name}')
print(f'Best parameters: {best_params}')
print(f'Test R2: {r2}')

[I 2024-06-28 15:36:12,217] A new study created in memory with name: no-name-0f3c9df5-80a3-4ace-8a2d-3b9af29600e3
[I 2024-06-28 15:36:12,266] Trial 0 finished with value: -17.933268615615276 and parameters: {'regressor': 'LinearRegression'}. Best is trial 0 with value: -17.933268615615276.
[I 2024-06-28 15:36:12,307] Trial 3 finished with value: -0.2643061128390882 and parameters: {'regressor': 'SVR', 'svr_c': 0.004064273499243021, 'svr_epsilon': 0.0018728776852793719, 'svr_kernel': 'rbf'}. Best is trial 3 with value: -0.2643061128390882.
[I 2024-06-28 15:36:12,465] Trial 6 finished with value: 0.020901188603697052 and parameters: {'regressor': 'SVR', 'svr_c': 68.68162329305389, 'svr_epsilon': 0.02132847174887659, 'svr_kernel': 'rbf'}. Best is trial 6 with value: 0.020901188603697052.
[I 2024-06-28 15:36:12,480] Trial 4 finished with value: -17.933268615615276 and parameters: {'regressor': 'LinearRegression'}. Best is trial 6 with value: 0.020901188603697052.
[I 2024-06-28 15:36:13,564

Best model: XGBoost
Best parameters: {'xgb_n_estimators': 508, 'xgb_max_depth': 2, 'xgb_learning_rate': 0.10805790378871648, 'xgb_subsample': 0.5495086625510451, 'xgb_colsample_bytree': 0.9052680432182556, 'xgb_gamma': 3.1681067708128308}
Test R2: 0.2112566547530096


In [27]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))

1.068

In [29]:
max(y_pred)

4.052207

In [30]:
test_corpus = []
for document in documents_2:
    test_corpus.append(preprocess(document))

In [31]:
test_vecs = [best_d2vmodel.infer_vector(doc) for doc in test_corpus]
test_labels = [2] * 127
test_pred = model.predict(test_vecs)

In [32]:
np.sqrt(mean_squared_error(test_labels, test_pred))

1.260

In [25]:
test_pred

array([ 3.49882243,  2.95865403,  4.81826178,  5.47570554,  3.12649834,
        2.82951796,  1.12633932,  2.16937378,  2.85392286,  0.90458831,
        3.58989377,  2.52769194,  2.44673768,  2.41037714,  2.26576995,
        3.60179088,  1.56761193,  3.68536408,  3.33244227,  2.03742792,
        2.49183163,  0.46140895,  4.50100643,  2.80192159,  4.2372591 ,
        3.7851539 ,  3.50294167,  4.26752369,  2.86347876,  2.23340166,
        2.98798754,  4.62177135,  6.79594039,  1.48484422,  2.69265639,
        2.12499626,  0.71631325,  4.45642231,  2.46348803,  2.70955045,
        5.15851161,  2.40191273,  5.31181463,  3.57300054,  5.08155651,
        6.37427687,  2.95603977,  3.2077717 ,  2.25864259,  4.92050665,
        4.56900781,  2.80747694,  1.60242738,  2.04529542,  3.28722866,
        3.07775884,  4.78680559,  4.1353597 ,  7.23566108,  3.74431329,
        2.74542799,  2.3640184 ,  3.02647589,  4.0572998 ,  4.13392311,
        4.94671143,  3.28899328,  2.54326377,  2.30439211,  4.84