In [3]:
# 필요한 패키지 불러오기
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import gensim.downloader as api
from gensim.models import Word2Vec, FastText
import torch
from transformers import BertTokenizer, BertModel

# Load the training and test tables.
train_df = pd.read_csv('train_for_NLP.csv')
test_df = pd.read_csv('test_cleaned.csv')

# Text feature for both splits; the label column is read from the training table only.
X_train, y_train = train_df['combined_str'], train_df['target']
X_test = test_df['combined_str']

# Hyperparameter grid handed to GridSearchCV for the logistic-regression models.
param_grid = dict(
    C=[0.01, 0.1, 1, 3],     # regularisation strength candidates
    penalty=['l1', 'l2'],    # regularisation type
    solver=['liblinear'],    # optimisation algorithm
)


In [5]:
# Train a Word2Vec model (skip-gram, sg=1) on the whitespace-tokenised training texts.
sentences = [document.split() for document in X_train]
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    sg=1,
    workers=4,
)

# Turn one document into a single Word2Vec-based vector.
def vectorize_w2v(text, model):
    """Average the vectors of the in-vocabulary words of ``text``.

    Args:
        text: Document string; it is tokenised with ``str.split()``.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        no word of the document is known (e.g. an empty document).
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[token] for token in text.split() if token in keyed_vectors]

    # Guard first: nothing to average, so fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every document as one fixed-size Word2Vec vector.
X_train_w2v = np.array([vectorize_w2v(document, w2v_model) for document in X_train])
X_test_w2v = np.array([vectorize_w2v(document, w2v_model) for document in X_test])

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train_w2v)
X_train_w2v_scaled = scaler.transform(X_train_w2v)
X_test_w2v_scaled = scaler.transform(X_test_w2v)

# 5-fold grid search over param_grid, scored by negative log loss.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    verbose=1,
)
grid_search.fit(X_train_w2v_scaled, y_train)

# Report the winning parameters; best_score_ is negated to print a plain log loss.
print("Word2Vec - Best Parameters:", grid_search.best_params_)
print("Word2Vec - Best CV Log Loss:", -grid_search.best_score_)

# Predict test probabilities (column 1 of predict_proba) and threshold them at 0.5.
best_model = grid_search.best_estimator_
y_test_pred_prob_w2v = best_model.predict_proba(X_test_w2v_scaled)[:, 1]
y_test_pred_w2v = (y_test_pred_prob_w2v >= 0.5).astype(int)

# Write the submission file from the sample template.
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_w2v
submission_df.to_csv("submit_Word2Vec_Logistic.csv", index=False)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Word2Vec - Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Word2Vec - Best CV Log Loss: 0.597086142403808


In [9]:
# Load the pretrained "glove-wiki-gigaword-100" embeddings through gensim's downloader API.
# NOTE(review): presumably fetches the model over the network when it is not cached locally — confirm.
glove_model = api.load("glove-wiki-gigaword-100")

# Turn one document into a single GloVe-based vector.
def vectorize_glove(text, model):
    """Average the embedding vectors of the in-vocabulary words of ``text``.

    Args:
        text: Document string; it is tokenised with ``str.split()``.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. If it exposes ``vector_size`` that value sizes
            the fallback vector.

    Returns:
        The element-wise mean of the vectors of all known words, or a zero
        vector when no word is known (e.g. an empty document).
    """
    word_vectors = [model[word] for word in text.split() if word in model]

    if word_vectors:
        return np.mean(word_vectors, axis=0)

    # The fallback length used to be hard-coded to 100; take it from the model
    # so embeddings of another dimensionality still yield rows of equal width.
    # 100 stays as the default for lookups that do not expose vector_size.
    return np.zeros(getattr(model, "vector_size", 100))

# Embed every document as one fixed-size GloVe vector.
X_train_glove = np.array([vectorize_glove(document, glove_model) for document in X_train])
X_test_glove = np.array([vectorize_glove(document, glove_model) for document in X_test])

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train_glove)
X_train_glove_scaled = scaler.transform(X_train_glove)
X_test_glove_scaled = scaler.transform(X_test_glove)

# 5-fold grid search over param_grid, scored by negative log loss.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    verbose=1,
)
grid_search.fit(X_train_glove_scaled, y_train)

# Report the winning parameters; best_score_ is negated to print a plain log loss.
print("GloVe - Best Parameters:", grid_search.best_params_)
print("GloVe - Best CV Log Loss:", -grid_search.best_score_)

# Predict test probabilities (column 1 of predict_proba) and threshold them at 0.5.
best_model = grid_search.best_estimator_
y_test_pred_prob_glove = best_model.predict_proba(X_test_glove_scaled)[:, 1]
y_test_pred_glove = (y_test_pred_prob_glove >= 0.5).astype(int)

# Write the submission file from the sample template.
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_glove
submission_df.to_csv("submit_GloVe_Logistic.csv", index=False)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
GloVe - Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
GloVe - Best CV Log Loss: 0.5493279476953994


In [7]:
# TF-IDF features (max_features=10000); the vocabulary is fitted on the training texts only.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Reduce the sparse TF-IDF matrix to 300 dense components.
# TruncatedSVD's default solver is randomized, so random_state is pinned to make
# the components, the CV score and the submission reproducible across runs.
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# 5-fold grid search over param_grid, scored by negative log loss.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='neg_log_loss', verbose=1)
grid_search.fit(X_train_svd, y_train)

# Report the winning parameters; best_score_ is negated to print a plain log loss.
print("TF-IDF + SVD - Best Parameters:", grid_search.best_params_)
print("TF-IDF + SVD - Best CV Log Loss:", -grid_search.best_score_)

# Predict test probabilities (column 1 of predict_proba) and threshold them at 0.5.
best_model = grid_search.best_estimator_
y_test_pred_prob_svd = best_model.predict_proba(X_test_svd)[:, 1]
y_test_pred_svd = (y_test_pred_prob_svd >= 0.5).astype(int)

# Write the submission file from the sample template.
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_svd
submission_df.to_csv("submit_TFIDF_SVD_Logistic.csv", index=False)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
TF-IDF + SVD - Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
TF-IDF + SVD - Best CV Log Loss: 0.6148522463448642


In [8]:
# Train a FastText model (sg=1) on the already tokenised `sentences`.
fasttext_model = FastText(
    sentences=sentences,
    vector_size=100,
    window=5,
    sg=1,
    workers=4,
)

# Turn one document into a single FastText-based vector.
def vectorize_fasttext(text, model):
    """Average the FastText vectors of the words of ``text``.

    Args:
        text: Document string; it is tokenised with ``str.split()``.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        there is nothing to average (e.g. an empty document).
    """
    word_vectors = [model.wv[word] for word in text.split() if word in model.wv]

    # np.mean([]) yields NaN (with a RuntimeWarning), never None, so the old
    # `vector is not None` check could not trigger the zero fallback. Test for
    # the empty case explicitly so every document maps to a finite vector of
    # the same length.
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Embed every document as one fixed-size FastText vector.
X_train_fasttext = np.array([vectorize_fasttext(document, fasttext_model) for document in X_train])
X_test_fasttext = np.array([vectorize_fasttext(document, fasttext_model) for document in X_test])

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train_fasttext)
X_train_fasttext_scaled = scaler.transform(X_train_fasttext)
X_test_fasttext_scaled = scaler.transform(X_test_fasttext)

# 5-fold grid search over param_grid, scored by negative log loss.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    verbose=1,
)
grid_search.fit(X_train_fasttext_scaled, y_train)

# Report the winning parameters; best_score_ is negated to print a plain log loss.
print("FastText - Best Parameters:", grid_search.best_params_)
print("FastText - Best CV Log Loss:", -grid_search.best_score_)

# Predict test probabilities (column 1 of predict_proba) and threshold them at 0.5.
best_model = grid_search.best_estimator_
y_test_pred_prob_fasttext = best_model.predict_proba(X_test_fasttext_scaled)[:, 1]
y_test_pred_fasttext = (y_test_pred_prob_fasttext >= 0.5).astype(int)

# Write the submission file from the sample template.
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_fasttext
submission_df.to_csv("submit_FastText_Logistic.csv", index=False)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
FastText - Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
FastText - Best CV Log Loss: 0.5966648038068806


In [10]:
# Load the pretrained 'bert-base-uncased' tokenizer and BERT model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Embed text with BERT as a fixed-size vector (mean pooling over real tokens).
def vectorize_bert(text, model, tokenizer):
    """Encode ``text`` with BERT and mean-pool the last hidden state.

    Pooling is weighted by the tokenizer's attention mask, so padding
    positions do not dilute the average. For a single string (no padding)
    this equals a plain mean over the sequence axis; for a list of strings
    each row is averaged over its own tokens only.

    Args:
        text: A string (or a list of strings) to embed.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with ``last_hidden_state``.
        tokenizer: Callable producing ``attention_mask`` along with the other
            model inputs as PyTorch tensors.

    Returns:
        A NumPy array with one pooled row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    # last_hidden_state is (batch, sequence, hidden) per the transformers docs.
    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden axis: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(dtype=hidden.dtype, device=hidden.device)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp avoids a division by zero should a row carry no real token at all.
    token_count = mask.sum(dim=1).clamp(min=1)
    return (token_sum / token_count).cpu().numpy()

# Embed every document with BERT, one vectorize_bert call per text.
X_train_bert = np.array([vectorize_bert(document, bert_model, tokenizer) for document in X_train])
X_test_bert = np.array([vectorize_bert(document, bert_model, tokenizer) for document in X_test])

# 5-fold grid search over param_grid, scored by negative log loss.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    verbose=1,
)
# np.vstack stacks the per-document arrays row-wise into the feature matrix.
grid_search.fit(np.vstack(X_train_bert), y_train)

# Report the winning parameters; best_score_ is negated to print a plain log loss.
print("BERT - Best Parameters:", grid_search.best_params_)
print("BERT - Best CV Log Loss:", -grid_search.best_score_)

# Predict test probabilities (column 1 of predict_proba) and threshold them at 0.5.
best_model = grid_search.best_estimator_
y_test_pred_prob_bert = best_model.predict_proba(np.vstack(X_test_bert))[:, 1]
y_test_pred_bert = (y_test_pred_prob_bert >= 0.5).astype(int)

# Write the submission file from the sample template.
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_bert
submission_df.to_csv("submit_BERT_Logistic.csv", index=False)




KeyboardInterrupt: 