In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import csv
import optuna

In [2]:
# 셀에서 출력 정확도 설정.
%precision %.3f

'%.3f'

In [3]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])

documents = documents[:77] + documents[-77:]

In [4]:
# 전문가 평점 3점 이상인 상위 175개는 label에 1, 나머지 하위는 label에 0 값 부여
# 분류 예측용 label
labels = []
for i in range(154):
    if i < 77:
        labels.append(1)
    else:
        labels.append(0)

In [5]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(documents_corpus, labels, test_size=0.3, random_state=123)

In [7]:
# Tagged Corpus 생성.
# 모델 학습에 필요한 형태로 변환.
my_tagged_corpus = [TaggedDocument(words=my_words, tags=[i]) for i, my_words in enumerate(X_train)]

In [8]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 20, 60)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 3, 15)
    dm = trial.suggest_categorical('dm', [0, 1])

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, negative=negative, seed=123)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Randomforest 하이퍼파라미터 튜닝
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 25)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Randomforest 모델 생성
    model = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features,
                                    random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = f1_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-27 16:01:56,210] A new study created in memory with name: no-name-761ddc59-1b82-4a35-a4e0-6a8696070005
[I 2024-06-27 16:02:08,371] Trial 7 finished with value: 0.6190476190476191 and parameters: {'vector_size': 24, 'window': 4, 'min_count': 5, 'epochs': 30, 'negative': 5, 'dm': 0, 'n_estimators': 101, 'max_depth': 24, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 7 with value: 0.6190476190476191.
[I 2024-06-27 16:02:08,975] Trial 3 finished with value: 0.6956521739130435 and parameters: {'vector_size': 27, 'window': 3, 'min_count': 3, 'epochs': 25, 'negative': 10, 'dm': 0, 'n_estimators': 123, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 3 with value: 0.6956521739130435.
[I 2024-06-27 16:02:12,843] Trial 4 finished with value: 0.6521739130434783 and parameters: {'vector_size': 22, 'window': 5, 'min_count': 3, 'epochs': 30, 'negative': 13, 'dm': 0, 'n_estimators': 192, 'max_depth

Best hyperparameters:  {'vector_size': 36, 'window': 10, 'min_count': 1, 'epochs': 20, 'negative': 15, 'dm': 0, 'n_estimators': 146, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}


In [16]:
best_params = study.best_params

# Doc2Vec 모델 생성
model_rf = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], 
                negative=best_params['negative'], seed=123)

model_rf.build_vocab(my_tagged_corpus)
model_rf.train(my_tagged_corpus, total_examples=model_rf.corpus_count, epochs=model_rf.epochs)

# RandomForest 모델 생성
clf_rf = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                            max_depth=best_params['max_depth'],
                            min_samples_split=best_params['min_samples_split'],
                            min_samples_leaf=best_params['min_samples_leaf'],
                            max_features=best_params['max_features'],
                            random_state=123)

# Doc2Vec 벡터 생성
X_train_vec = [model_rf.infer_vector(doc) for doc in X_train]
X_test_vec = [model_rf.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
clf_rf.fit(X_train_vec, y_train)
y_pred = clf_rf.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.46      0.59        24
           1       0.62      0.91      0.74        23

    accuracy                           0.68        47
   macro avg       0.73      0.69      0.67        47
weighted avg       0.73      0.68      0.66        47



In [17]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 20, 60)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 3, 15)
    dm = trial.suggest_categorical('dm', [0, 1])

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, negative=negative, seed=123)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # SVM 하이퍼파라미터 튜닝
    C = trial.suggest_float('C', 0.01, 100.0, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    gamma = trial.suggest_float('gamma', 1e-4, 1.0, log=True) if kernel in ['rbf', 'poly'] else 'scale'

    # SVM 모델 생성
    model = SVC(
        C=C,
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        random_state=123
    )

    model.fit(X_train_vec, y_train)
    score = f1_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-27 16:11:07,802] A new study created in memory with name: no-name-261226ed-74a6-465a-8430-70dc371ef23f
[I 2024-06-27 16:11:15,865] Trial 5 finished with value: 0.6571428571428571 and parameters: {'vector_size': 49, 'window': 7, 'min_count': 4, 'epochs': 15, 'negative': 11, 'dm': 0, 'C': 0.03531586854050158, 'kernel': 'poly', 'degree': 4, 'gamma': 0.0002518541264046507}. Best is trial 5 with value: 0.6571428571428571.
[I 2024-06-27 16:11:15,911] Trial 2 finished with value: 0.08333333333333333 and parameters: {'vector_size': 52, 'window': 8, 'min_count': 1, 'epochs': 15, 'negative': 8, 'dm': 0, 'C': 61.313296907253665, 'kernel': 'linear'}. Best is trial 5 with value: 0.6571428571428571.
[I 2024-06-27 16:11:18,061] Trial 4 finished with value: 0.6086956521739131 and parameters: {'vector_size': 20, 'window': 5, 'min_count': 10, 'epochs': 20, 'negative': 15, 'dm': 0, 'C': 0.09938616616732486, 'kernel': 'linear'}. Best is trial 5 with value: 0.6571428571428571.
[I 2024-06-27 16:1

Best hyperparameters:  {'vector_size': 60, 'window': 5, 'min_count': 10, 'epochs': 30, 'negative': 4, 'dm': 1, 'C': 5.777691119396119, 'kernel': 'poly', 'degree': 3, 'gamma': 0.0018378591974425448}


In [9]:
best_params = {'vector_size': 60, 'window': 5, 'min_count': 10, 'epochs': 30, 'negative': 4, 'dm': 1, 'C': 5.777691119396119, 'kernel': 'poly', 'degree': 3, 'gamma': 0.0018378591974425448}

# Doc2Vec 모델 생성
model_svc = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], 
                negative=best_params['negative'], seed=123)
model_svc.build_vocab(my_tagged_corpus)
model_svc.train(my_tagged_corpus, total_examples=model_svc.corpus_count, epochs=model_svc.epochs)

# SVM 분류 모델 생성
clf_svc = SVC(C=best_params['C'], kernel=best_params['kernel'], gamma=best_params['gamma'], degree=best_params['degree'], random_state=123)#degree=best_params['degree'], gamma=best_params['gamma'], 

# Doc2Vec 벡터 생성
X_train_vec = [model_svc.infer_vector(doc) for doc in X_train]
X_test_vec = [model_svc.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
clf_svc.fit(X_train_vec, y_train)
y_pred = clf_svc.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.21      0.33        24
           1       0.54      0.96      0.69        23

    accuracy                           0.57        47
   macro avg       0.68      0.58      0.51        47
weighted avg       0.69      0.57      0.51        47



In [27]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 20, 60)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 3, 15)
    dm = trial.suggest_categorical('dm', [0, 1])

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, negative=negative, seed=123)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # XGBOOST 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 25)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    gamma = trial.suggest_float('gamma', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    
    # XGboost 모델 생성
    model = XGBClassifier(max_depth=max_depth,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = f1_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-27 16:19:19,084] A new study created in memory with name: no-name-4db9d49e-0f7f-4dae-84a8-65f3ac9c1e57
[I 2024-06-27 16:19:28,085] Trial 2 finished with value: 0.6153846153846154 and parameters: {'vector_size': 40, 'window': 4, 'min_count': 4, 'epochs': 10, 'negative': 10, 'dm': 0, 'max_depth': 25, 'n_estimators': 299, 'learning_rate': 0.1044244810697849, 'gamma': 0.5458948767291024, 'min_child_weight': 4}. Best is trial 2 with value: 0.6153846153846154.
[I 2024-06-27 16:19:29,571] Trial 5 finished with value: 0.6538461538461539 and parameters: {'vector_size': 47, 'window': 4, 'min_count': 5, 'epochs': 15, 'negative': 10, 'dm': 0, 'max_depth': 9, 'n_estimators': 168, 'learning_rate': 0.06354769271209745, 'gamma': 0.5008849416072466, 'min_child_weight': 5}. Best is trial 5 with value: 0.6538461538461539.
[I 2024-06-27 16:19:31,582] Trial 6 finished with value: 0.5789473684210527 and parameters: {'vector_size': 23, 'window': 5, 'min_count': 4, 'epochs': 20, 'negative': 3, 'dm'

Best hyperparameters:  {'vector_size': 26, 'window': 9, 'min_count': 6, 'epochs': 20, 'negative': 12, 'dm': 1, 'max_depth': 6, 'n_estimators': 51, 'learning_rate': 0.2063074043065741, 'gamma': 0.8699872667994754, 'min_child_weight': 4}


In [33]:
best_params = study.best_params

# Doc2Vec 모델 생성
model_xgb = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], 
                negative=best_params['negative'], seed=123)
model_xgb.build_vocab(my_tagged_corpus)
model_xgb.train(my_tagged_corpus, total_examples=model_xgb.corpus_count, epochs=model_xgb.epochs)
# XGBoost 모델 생성
clf_xgb = XGBClassifier(
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    min_child_weight=best_params['min_child_weight'],
    random_state=123
)

# Doc2Vec 벡터 생성
X_train_vec = [model_xgb.infer_vector(doc) for doc in X_train]
X_test_vec = [model_xgb.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
clf_xgb.fit(X_train_vec, y_train)
y_pred = clf_xgb.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.71      0.69        24
           1       0.68      0.65      0.67        23

    accuracy                           0.68        47
   macro avg       0.68      0.68      0.68        47
weighted avg       0.68      0.68      0.68        47



In [34]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 20, 60)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 3, 15)
    dm = trial.suggest_categorical('dm', [0, 1])

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, negative=negative, seed=123)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)    
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # LightGBM 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 25)
    num_leaves = trial.suggest_int('num_leaves', 10, 200)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 50)

    # LightGBM 모델 생성
    model = LGBMClassifier(
        max_depth=max_depth,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_samples=min_child_samples,
        random_state=123
    )
    
    model.fit(X_train_vec, y_train)
    score = f1_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-27 16:28:14,853] A new study created in memory with name: no-name-54ccf294-5fdd-4d2c-ae7c-2c9e269d87ed
[I 2024-06-27 16:28:23,267] Trial 5 finished with value: 0.6521739130434783 and parameters: {'vector_size': 31, 'window': 5, 'min_count': 9, 'epochs': 15, 'negative': 12, 'dm': 0, 'max_depth': 3, 'num_leaves': 115, 'learning_rate': 0.09730905491270994, 'n_estimators': 171, 'min_child_samples': 45}. Best is trial 5 with value: 0.6521739130434783.
[I 2024-06-27 16:28:23,507] Trial 7 finished with value: 0.5116279069767442 and parameters: {'vector_size': 32, 'window': 3, 'min_count': 4, 'epochs': 20, 'negative': 6, 'dm': 0, 'max_depth': 4, 'num_leaves': 144, 'learning_rate': 0.18622635886309008, 'n_estimators': 261, 'min_child_samples': 24}. Best is trial 5 with value: 0.6521739130434783.
[I 2024-06-27 16:28:25,158] Trial 6 finished with value: 0.4 and parameters: {'vector_size': 31, 'window': 6, 'min_count': 5, 'epochs': 20, 'negative': 8, 'dm': 0, 'max_depth': 22, 'num_leave

Best hyperparameters:  {'vector_size': 46, 'window': 5, 'min_count': 1, 'epochs': 30, 'negative': 9, 'dm': 0, 'max_depth': 6, 'num_leaves': 122, 'learning_rate': 0.15228562627795603, 'n_estimators': 64, 'min_child_samples': 44}


In [49]:
best_params = study.best_params

# Doc2Vec 모델 생성
model_lgbm = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], 
                negative=best_params['negative'], seed=123)
model_lgbm.build_vocab(my_tagged_corpus)
model_lgbm.train(my_tagged_corpus, total_examples=model_lgbm.corpus_count, epochs=model_lgbm.epochs)

# LightGBM 모델 생성
clf_lgbm = LGBMClassifier(max_depth=best_params['max_depth'], num_leaves=best_params['num_leaves'], 
                    learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'], 
                    min_child_samples=best_params['min_child_samples'], random_state=123)


# Doc2Vec 벡터 생성
X_train_vec = [model_lgbm.infer_vector(doc) for doc in X_train]
X_test_vec = [model_lgbm.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
clf_lgbm.fit(X_train_vec, y_train)
y_pred = clf_lgbm.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

[LightGBM] [Info] Number of positive: 54, number of negative: 53
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1720
[LightGBM] [Info] Number of data points in the train set: 107, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504673 -> initscore=0.018692
[LightGBM] [Info] Start training from score 0.018692
              precision    recall  f1-score   support

           0       0.79      0.62      0.70        24
           1       0.68      0.83      0.75        23

    accuracy                           0.72        47
   macro avg       0.73      0.73      0.72        47
weighted avg       0.74      0.72      0.72        47



In [50]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 20, 60)
    window = trial.suggest_int('window', 2, 10)
    min_count = trial.suggest_int('min_count', 1, 10)
    epochs = trial.suggest_int('epochs', 10, 30, step=5)
    negative = trial.suggest_int('negative', 3, 15)
    dm = trial.suggest_categorical('dm', [0, 1])

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, negative=negative, seed=123)
    model.build_vocab(my_tagged_corpus)
    model.train(my_tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Logistic Regression 하이퍼파라미터 튜닝
    C = trial.suggest_float('C', 0.01, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Logistic Regression 모델 생성
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        random_state=123
    )
    
    model.fit(X_train_vec, y_train)
    score = f1_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-27 16:37:59,280] A new study created in memory with name: no-name-c5d1790d-184f-4f81-8351-4c884ef46115
[I 2024-06-27 16:38:06,770] Trial 1 finished with value: 0.6538461538461539 and parameters: {'vector_size': 30, 'window': 5, 'min_count': 8, 'epochs': 15, 'negative': 11, 'dm': 0, 'C': 0.3104241839333145, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 1 with value: 0.6538461538461539.
[I 2024-06-27 16:38:07,755] Trial 4 finished with value: 0.7272727272727273 and parameters: {'vector_size': 27, 'window': 7, 'min_count': 1, 'epochs': 20, 'negative': 4, 'dm': 0, 'C': 1.3667023182306257, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 4 with value: 0.7272727272727273.
[I 2024-06-27 16:38:08,706] Trial 2 finished with value: 0.6521739130434783 and parameters: {'vector_size': 33, 'window': 4, 'min_count': 3, 'epochs': 10, 'negative': 9, 'dm': 1, 'C': 3.6045332812228748, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 4 with value: 0.7272727272727273.
[I 202

Best hyperparameters:  {'vector_size': 59, 'window': 9, 'min_count': 6, 'epochs': 30, 'negative': 8, 'dm': 0, 'C': 0.5841895195907552, 'penalty': 'l1', 'solver': 'liblinear'}


In [54]:
best_params = study.best_params

# Doc2Vec 모델 생성
model_lr = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], 
                negative=best_params['negative'], seed=123)
model_lr.build_vocab(my_tagged_corpus)
model_lr.train(my_tagged_corpus, total_examples=model_lr.corpus_count, epochs=model_lr.epochs)

# Logistic Regressor 모델 생성
clf_lr = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], solver=best_params['solver'], random_state=123)

# Doc2Vec 벡터 생성
X_train_vec = [model_lr.infer_vector(doc) for doc in X_train]
X_test_vec = [model_lr.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
clf_lr.fit(X_train_vec, y_train)
y_pred = clf_lr.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.67      0.71        24
           1       0.69      0.78      0.73        23

    accuracy                           0.72        47
   macro avg       0.73      0.72      0.72        47
weighted avg       0.73      0.72      0.72        47



In [55]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])

# 사용하지 않은 2점짜리 자소서 리스트
documents = documents[-175:-77]

In [56]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [58]:
lr_vec = [model_lr.infer_vector(doc) for doc in documents_corpus]
svc_vec = [model_svc.infer_vector(doc) for doc in documents_corpus]
rf_vec = [model_rf.infer_vector(doc) for doc in documents_corpus]
xgb_vec = [model_xgb.infer_vector(doc) for doc in documents_corpus]
lgbm_vec = [model_lgbm.infer_vector(doc) for doc in documents_corpus]

# 총 98개 자소서 예측
print(clf_lr.predict(lr_vec).sum())
print(clf_svc.predict(svc_vec).sum())
print(clf_rf.predict(rf_vec).sum())
print(clf_xgb.predict(xgb_vec).sum())
print(clf_lgbm.predict(lgbm_vec).sum())

68
98
87
46
72


In [23]:
import pickle

with open('d2v_model_rf_small.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

with open('d2v_model_svc_small.pkl', 'wb') as f:
    pickle.dump(model_svc, f)

with open('d2v_model_xgb_small.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)

with open('d2v_model_lgbm_small.pkl', 'wb') as f:
    pickle.dump(model_lgbm, f)

with open('d2v_model_lr_small.pkl', 'wb') as f:
    pickle.dump(model_lr, f)
    
with open('clf_model_rf_small.pkl', 'wb') as f:
    pickle.dump(clf_rf, f)

with open('clf_model_svc_small.pkl', 'wb') as f:
    pickle.dump(clf_svc, f)
    
with open('clf_model_xgb_small.pkl', 'wb') as f:
    pickle.dump(clf_xgb, f)

with open('clf_model_lgbm_small.pkl', 'wb') as f:
    pickle.dump(clf_lgbm, f)

with open('clf_model_lr_small.pkl', 'wb') as f:
    pickle.dump(clf_lr, f)