In [1]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
file_path = r'C:\\Users\\user\\Desktop\\hansol\\3rd_project\\NLP_new_\\NLP-project\\NLP project\\crawling_data\\(new)korean1-317.csv'

with open(file_path, 'r', encoding='utf-8') as file :
    raw_cv = csv.reader(file)
    c = list(raw_cv)

for row in c:
    print(row)

In [3]:
cover_letter = pd.DataFrame(data = c, columns=['text'])
cover_letter['합격여부']='합격'
cover_letter.loc[178:, '합격여부'] = '불합격'

In [None]:
first_22 = cover_letter.iloc[:22] # 평점이 5점인 자소서
last_15 = cover_letter.iloc[-15:] # 평점이 1점인 자소서

star1n5 = pd.concat([first_22, last_15])

print(star1n5)

In [5]:
texts = star1n5['text'].astype(str).tolist()
labels = star1n5['합격여부'].tolist()

In [6]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.2, random_state=123)

In [8]:
# Doc2Vec 모델 학습
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import re

# nltk.download('punkt')

In [9]:
def nr_text(texts):
    n_min = 2
    corpus = []
    for a_line in texts :
        pre = re.sub(r'\W', ' ', a_line)
        pre = re.sub(r'_', ' ', pre)
        pre = re.sub(r'\d+','', pre) 
        pre = nltk.word_tokenize(pre)
        pre = [x for x in pre if len(x) >= n_min]
        corpus += pre
    return corpus

In [None]:
tkned_text = nr_text(texts)
tkned_text

In [11]:
# TaggedDocument 생성
tagged_data = [TaggedDocument(words=nouns, tags=[str(i)]) for i, nouns in enumerate(tkned_text)]

# Doc2Vec 모델 학습
doc2vec_model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=100)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [12]:
# 벡터화 및 분류 모델 학습
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 학습 데이터 벡터화
X_train_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
X_test_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]

# 분류 모델 학습
classifier = RandomForestClassifier(n_estimators=100, random_state=123)
classifier.fit(X_train_vectors, y_train)

# 예측
y_pred = classifier.predict(X_test_vectors)

# 평가
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

         불합격       1.00      0.67      0.80         3
          합격       0.83      1.00      0.91         5

    accuracy                           0.88         8
   macro avg       0.92      0.83      0.85         8
weighted avg       0.90      0.88      0.87         8



In [13]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=123)

param_grid = {
    'n_estimators': [100, 200, 300], # 트리 개수
    'max_depth': [10, 20, 30], # 트리 최대 깊이
    'min_samples_split': [2, 5, 10], # 노드 분할 시 필요한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4], # 리프 노드 최소 샘플 수
    'max_features': ['sqrt', 'log2'] #  트리 분할 할때 고려- 제곱근(sqrt)과 로그(log2) 값 사용
}

grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,  # 교차 검증 3번
                           n_jobs=-1,  # 가능한 모든 프로세서를 사용하여 병렬 처리를 수행
                           verbose=2)  # 진행 상황을 출력

grid_search.fit(X_train_vectors, y_train)

print("Best parameters found: ", grid_search.best_params_)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best parameters found:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         5

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [14]:
# Random Search를 이용하여 하이퍼파라미터를 튜닝
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# 랜덤 포레스트 분류기 초기화
rf = RandomForestClassifier(random_state=123)

# 하이퍼파라미터 분포 설정
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2']
}

# Random Search 설정
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, random_state=123, verbose=2)

# 학습 데이터로 Random Search 수행
random_search.fit(X_train_vectors, y_train)

# 최적의 하이퍼파라미터 출력
print("Best parameters found: ", random_search.best_params_)

# 최적의 모델로 테스트 데이터 예측
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

# 성능 평가
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'max_depth': 28, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 170}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         5

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

