In [8]:
import csv
import pandas as pd
import random as rd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize

In [3]:
file_path = r'C:\\Users\\user\\Desktop\\hansol\\3rd_project\\NLP_new_\\NLP-project\\NLP project\\crawling_data\\(new)korean1-317.csv'

with open(file_path, 'r', encoding='utf-8') as file :
    raw_cv = csv.reader(file)
    c = list(raw_cv)

In [4]:
cover_letter = pd.DataFrame(data = c, columns=['text'])
cover_letter['합격여부']='합격'
cover_letter.loc[178:, '합격여부'] = '불합격'

In [6]:
star1n2 = cover_letter.iloc[-76:] # 평점 2점까지( 갯수 균형 맞추기 )
star4n5 = cover_letter.iloc[:77] # 평점 4점까지
# star1n2 = cover_letter.iloc[-142:] # 평점 2점까지

In [14]:
pick_n = rd.sample(range(176,302), 62) # 2점 자소서 62개의 행 번호 추출
star1 = cover_letter.iloc[-15:]

star2 = []
for i, text in enumerate(cover_letter) :
    if i in pick_n:
        star2.append(text)

print(star2)

[]


In [None]:
star12n45 = pd.concat([star1n2,star4n5])

print(star12n45)

In [6]:
texts = star12n45['text'].astype(str).tolist() # 평점이 12 vs 45
labels = star12n45['합격여부'].tolist()

In [7]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.2, random_state=123)

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [10]:
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(X_train)]

In [11]:
# Doc2Vec 모델 설정 및 학습
doc2vec_model = Doc2Vec(
    vector_size=100,  # 벡터 차원 수
    window=5,         # 컨텍스트 윈도우 크기
    min_count=2,      # 최소 출현 빈도
    workers=4,        # 사용 스레드 수
    epochs=40,        # 학습 반복 횟수
    dm=1              # DM 모델 사용
)

In [12]:
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
X_test_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]

classifier = RandomForestClassifier(n_estimators=100, random_state=123)
classifier.fit(X_train_vectors, y_train)

y_pred = classifier.predict(X_test_vectors)

print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

         불합격       0.80      0.80      0.80        15
          합격       0.81      0.81      0.81        16

    accuracy                           0.81        31
   macro avg       0.81      0.81      0.81        31
weighted avg       0.81      0.81      0.81        31



In [14]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=123)

param_grid = {
    'n_estimators': [100, 200, 300], # 생성할 트리 개수
    'max_depth': [10, 20, 30], # 각 트리의 최대 깊이
    'min_samples_split': [2, 5, 10], # 내부 노드 분할 시 필요한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4], # 리프 노드 최소 샘플 수
    'max_features': ['sqrt', 'log2'] #  각 트리를 분할할 때 고려할 최대 특징 수입니다. 제곱근(sqrt)과 로그(log2) 값 사용
}

grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,  # 교차 검증 3번
                           n_jobs=-1,  # 가능한 모든 프로세서를 사용하여 병렬 처리를 수행
                           verbose=2)  # 진행 상황을 출력

grid_search.fit(X_train_vectors, y_train)

print("Best parameters found: ", grid_search.best_params_)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best parameters found:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.65      0.73      0.69        15
           1       0.71      0.62      0.67        16

    accuracy                           0.68        31
   macro avg       0.68      0.68      0.68        31
weighted avg       0.68      0.68      0.68        31



In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

rf = RandomForestClassifier(random_state=123)

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, random_state=123, verbose=2)

random_search.fit(X_train_vectors, y_train)

print("Best parameters found: ", random_search.best_params_)

best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 176}
              precision    recall  f1-score   support

           0       0.73      0.73      0.73        15
           1       0.75      0.75      0.75        16

    accuracy                           0.74        31
   macro avg       0.74      0.74      0.74        31
weighted avg       0.74      0.74      0.74        31



In [16]:
from sklearn.svm import SVC # SVM

svm_cf = SVC(kernel='linear', random_state=123)
svm_cf.fit(X_train_vectors, y_train)

y_pred_svm = svm_cf.predict(X_test_vectors)
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

         불합격       0.69      0.60      0.64        15
          합격       0.67      0.75      0.71        16

    accuracy                           0.68        31
   macro avg       0.68      0.68      0.67        31
weighted avg       0.68      0.68      0.68        31



In [17]:
from sklearn.ensemble import VotingClassifier # 앙상블

voting_classifier = VotingClassifier(estimators=[
    ('rf', classifier),
    ('svc', svm_cf)
], voting='hard')

voting_classifier.fit(X_train_vectors, y_train)
y_pred_voting = voting_classifier.predict(X_test_vectors)
print(classification_report(y_test, y_pred_voting, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

         불합격       0.71      0.80      0.75        15
          합격       0.79      0.69      0.73        16

    accuracy                           0.74        31
   macro avg       0.75      0.74      0.74        31
weighted avg       0.75      0.74      0.74        31

