In [2]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import optuna
import csv

# Load the preprocessed essays from disk.
file_path = r'.\crawling_data\preprocessed_essay.csv'

# Turn the essays into a list: the first column of every CSV row is one essay.
# newline='' is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how the CSV was written.
with open(file_path, 'r', newline='') as f:
    # Blank lines are yielded by csv.reader as [] and are skipped here,
    # because row[0] on an empty row would raise IndexError.
    documents = [row[0] for row in csv.reader(f) if row]

# Expert score labels (1-5) used as the regression target.
# Labels are assigned purely by row position, as consecutive runs:
# 22 essays scored 5, then 55 scored 4, 98 scored 3, 127 scored 2 and
# 15 scored 1 (317 labels in total).
# NOTE(review): the original comment said 15 essays were randomly taken for
# each score, which does not match the run lengths below -- confirm.
labels = [
    score
    for score, run_length in ((5, 22), (4, 55), (3, 98), (2, 127), (1, 15))
    for _ in range(run_length)
]

# Preprocess the essays
from nltk.tokenize import word_tokenize

def preprocess(text):
    """Lower-case *text* and tokenize it with NLTK's ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

# Wrap every essay as a TaggedDocument whose single tag is its row index
# (as a string) and whose words are the preprocessed tokens.
tagged_data = [
    TaggedDocument(preprocess(text), [str(idx)])
    for idx, text in enumerate(documents)
]

# Hold out 20% of the essays (and their labels) for evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tagged_data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Modified build_doc2vec_model function (as before)
from gensim.models.doc2vec import Doc2Vec

def build_doc2vec_model(vector_size, window, min_count, epochs, corpus=None):
    """Build and train a PV-DM Doc2Vec model.

    Args:
        vector_size: Dimensionality of the document vectors.
        window: Context window size passed to ``Doc2Vec``.
        min_count: Minimum word frequency passed to ``Doc2Vec``.
        epochs: Number of training epochs.
        corpus: Re-iterable collection of ``TaggedDocument`` objects to train
            on (it is iterated once for the vocabulary and again for
            training). Defaults to the module-level ``tagged_data``, which
            preserves the previous behaviour.

    Returns:
        The trained ``Doc2Vec`` model.

    NOTE(review): the default corpus, ``tagged_data``, is built from every
    essay before the train/test split, so by default the embedding model is
    also fitted on the held-out essays. Pass ``corpus=X_train`` to keep test
    text out of Doc2Vec training.
    """
    if corpus is None:
        corpus = tagged_data

    model = Doc2Vec(vector_size=vector_size,
                    window=window,
                    min_count=min_count,
                    dm=1,  # Use PV-DM
                    epochs=epochs,
                    workers=4)

    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    return model


def train_and_evaluate(doc2vec_model, X_train, X_test, y_train, y_test, resampling=None, rf_params=None):
    """Fit a random-forest regressor on inferred document vectors and score it.

    Args:
        doc2vec_model: Trained model exposing ``infer_vector``.
        X_train: Training documents; each item must have a ``words`` attribute.
        X_test: Evaluation documents; each item must have a ``words`` attribute.
        y_train: Targets aligned with ``X_train``.
        y_test: Targets aligned with ``X_test``.
        resampling: ``'smote'``, ``'undersampling'``, ``'smotetomek'``, or
            ``None`` / ``'none'`` for no resampling. Resampling is applied to
            the training vectors only.
        rf_params: Keyword arguments for ``RandomForestRegressor``. ``None``
            means the estimator defaults. ``random_state`` defaults to 42
            unless supplied here.

    Returns:
        Tuple ``(rmse, mae, r2)`` computed on the evaluation documents.

    Raises:
        ValueError: If ``resampling`` is not one of the recognised names.
    """
    samplers = {
        'smote': SMOTE,
        'undersampling': RandomUnderSampler,
        'smotetomek': SMOTETomek,
    }
    no_resampling = resampling is None or resampling == 'none'
    # Fail fast on a misspelt method name instead of silently skipping
    # resampling, and do it before the expensive vector inference.
    if not no_resampling and resampling not in samplers:
        raise ValueError(
            f"Unknown resampling method {resampling!r}; expected None, 'none', "
            f"or one of {sorted(samplers)}"
        )

    # Copy so the caller's dict is never mutated; an omitted rf_params used to
    # crash on ``**None``.
    rf_kwargs = dict(rf_params) if rf_params else {}
    # Keep the fixed seed by default, but let the caller override it instead
    # of raising "multiple values for keyword argument".
    rf_kwargs.setdefault('random_state', 42)

    # Generate document vectors
    X_train_vecs = np.array([doc2vec_model.infer_vector(doc.words) for doc in X_train])
    X_test_vecs = np.array([doc2vec_model.infer_vector(doc.words) for doc in X_test])

    # Apply resampling if specified (training data only)
    if not no_resampling:
        sampler = samplers[resampling](random_state=42)
        X_train_vecs, y_train = sampler.fit_resample(X_train_vecs, y_train)

    # Train Random Forest Regressor with hyperparameters
    rf = RandomForestRegressor(**rf_kwargs)
    rf.fit(X_train_vecs, y_train)

    # Make predictions
    y_pred = rf.predict(X_test_vecs)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return np.sqrt(mse), mae, r2

def objective(trial):
    """Optuna objective: sample one configuration and return its R^2.

    The trial chooses Doc2Vec hyperparameters, random-forest hyperparameters
    and a resampling method, trains both models, and evaluates on the
    module-level train/test split. The returned value is the R^2 score, so
    the study should maximise it (RMSE and MAE are computed but not used).
    """
    # Doc2Vec search space
    doc2vec_params = {
        'vector_size': trial.suggest_int('vector_size', 20, 50),
        'window': trial.suggest_int('window', 3, 15),
        'min_count': trial.suggest_int('min_count', 1, 10),
        'epochs': trial.suggest_int('epochs', 10, 50, step=5),
    }

    # RandomForestRegressor search space
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 30, 500, step=10),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
    }

    # Resampling strategy; the string 'none' matches no resampler downstream
    resampling = trial.suggest_categorical(
        'resampling', ['none', 'smote', 'undersampling', 'smotetomek']
    )

    # Train the embedding model, then the regressor on top of it
    model = build_doc2vec_model(**doc2vec_params)
    _rmse, _mae, r2 = train_and_evaluate(
        model, X_train, X_test, y_train, y_test, resampling, rf_params
    )

    return r2

# Run the hyperparameter search: 200 trials, maximising the value returned
# by objective(); n_jobs=-1 lets Optuna run trials in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=200,
    n_jobs=-1,
)

# Report the best trial's objective value and the parameters it used.
print('Best trial:')
trial = study.best_trial
print(f'  r2: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Rebuild the Doc2Vec model from the best trial's parameters.
best_params = study.best_params
final_model = build_doc2vec_model(
    **{name: best_params[name]
       for name in ('vector_size', 'window', 'min_count', 'epochs')}
)

# Random-forest settings selected by the best trial.
final_rf_params = {
    name: best_params[name]
    for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
}

# The search encodes "no resampling" as the string 'none'; map it to None.
resampling_method = best_params['resampling']
if resampling_method == 'none':
    resampling_method = None

# Retrain with the selected configuration and score it on the held-out split.
rmse, mae, r2 = train_and_evaluate(
    final_model,
    X_train,
    X_test,
    y_train,
    y_test,
    resampling_method,
    final_rf_params,
)

print('Final Model Performance:')
print(f'  RMSE: {rmse}')
print(f'  MAE: {mae}')
print(f'  R2: {r2}')

[I 2024-06-27 15:32:08,451] A new study created in memory with name: no-name-c3d8853d-c82f-439a-9730-89268c1d3fdd
[I 2024-06-27 15:32:57,174] Trial 3 finished with value: 0.028576058695159445 and parameters: {'vector_size': 48, 'window': 4, 'min_count': 9, 'epochs': 30, 'n_estimators': 220, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 1, 'resampling': 'smotetomek'}. Best is trial 3 with value: 0.028576058695159445.
[I 2024-06-27 15:33:03,378] Trial 1 finished with value: -0.11215228261664234 and parameters: {'vector_size': 48, 'window': 8, 'min_count': 3, 'epochs': 30, 'n_estimators': 450, 'max_depth': 18, 'min_samples_split': 9, 'min_samples_leaf': 3, 'resampling': 'smote'}. Best is trial 3 with value: 0.028576058695159445.
[I 2024-06-27 15:33:07,663] Trial 2 finished with value: 0.16243969566111272 and parameters: {'vector_size': 30, 'window': 12, 'min_count': 1, 'epochs': 30, 'n_estimators': 30, 'max_depth': 21, 'min_samples_split': 7, 'min_samples_leaf': 2, 'resampl