In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
file_path = r'C:\\Users\\user\\Desktop\\hansol\\3rd_project\\NLP_new_\\NLP-project\\NLP project\\crawling_data\\(new)korean1-317.csv'

with open(file_path, 'r', encoding='utf-8') as file :
    raw_cv = csv.reader(file)
    c = list(raw_cv)

In [3]:
cover_letter = pd.DataFrame(data = c, columns=['text'])
cover_letter['score']=3
cover_letter.loc[:77, 'score'] = 4
cover_letter.loc[:21, 'score'] = 5
cover_letter.iloc[-142:, cover_letter.columns.get_loc('score')] = 2
cover_letter.iloc[-15:, cover_letter.columns.get_loc('score')] = 1

In [4]:
score_counts = cover_letter['score'].value_counts()

print(score_counts)

2    127
3     97
4     56
5     22
1     15
Name: score, dtype: int64


In [5]:
# score가 1, 2, 4, 5 행만 필터링
filtered_cover_letter = cover_letter[cover_letter['score'].isin([1, 2, 4, 5])]

# 텍스트와 라벨 분리
texts = filtered_cover_letter['text'].astype(str).tolist()
labels = filtered_cover_letter['score'].tolist()

In [6]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=123)

In [7]:
#################################################### Doc2Vec 모델 학습

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# 학습 데이터를 태그된 문서로 변환
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(X_train)]

# Doc2Vec 모델 설정 및 학습
doc2vec_model = Doc2Vec(
    vector_size=50,  # 벡터 차원 수
    window=5,         # 컨텍스트 윈도우 크기
    min_count=3,      # 최소 출현 빈도
    epochs=20,        # 학습 반복 횟수
    dm=1              # DM 모델 사용 (0이면 DBOW 모델 사용)
)   
 

# 모델 학습
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

####################################################### 벡터화 및 회귀 모델 학습

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 학습 데이터 벡터화
X_train_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
X_test_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]

# 회귀 모델 학습
regressor = RandomForestRegressor(n_estimators=100, random_state=123)
regressor.fit(X_train_vectors, y_train)

# 예측
y_pred = regressor.predict(X_test_vectors)

# 평가
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

Mean Squared Error: 0.8445454545454543
Root Mean Squared Error: 0.9189915421512074
R² Score: 0.4101587301587303


In [10]:
from sklearn.model_selection import ParameterGrid

# 파라미터 그리드 설정
param_grid = {
    'vector_size': [50, 100, 150],
    'window': [3, 5, 7],
    'min_count': [1, 2, 3],
    'epochs': [20, 40, 60],
    'dm': [0, 1]
}

best_score = float('-inf')
best_params = None

for params in ParameterGrid(param_grid):
    model = Doc2Vec(
        vector_size=params['vector_size'],
        window=params['window'],
        min_count=params['min_count'],
        workers=4,
        epochs=params['epochs'],
        dm=params['dm']
    )
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    # 벡터화
    X_train_vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
    X_test_vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]
    
    # 회귀 모델 학습 및 평가
    regressor = RandomForestRegressor(n_estimators=100, random_state=123)
    regressor.fit(X_train_vectors, y_train)
    y_pred = regressor.predict(X_test_vectors)
    
    r2 = r2_score(y_test, y_pred)
    
    if r2 > best_score:
        best_score = r2
        best_params = params

print(f"Best R² Score: {best_score}")
print(f"Best Parameters: {best_params}")


Best R² Score: 0.4672650793650793
Best Parameters: {'dm': 1, 'epochs': 20, 'min_count': 2, 'vector_size': 50, 'window': 3}


In [11]:
from sklearn.model_selection import GridSearchCV

# 파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=123), param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train_vectors, y_train)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")
print(f"Best Parameters: {grid_search.best_params_}")


Mean Squared Error: 1.2484067049804162
Root Mean Squared Error: 1.1173212183523662
R² Score: 0.12809690445812216
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
