In [1]:
# 필요한 패키지 불러오기
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# 데이터 로드
train_df = pd.read_csv('train_for_NLP.csv')
test_df = pd.read_csv('test_cleaned.csv')

# train 데이터에서 특성과 레이블 분리
X_train = train_df['combined_str']
y_train = train_df['target']

# test 데이터의 특성
X_test = test_df['combined_str']

# CountVectorizer를 사용하여 문서 벡터화
vectorizer = CountVectorizer(max_features=5000)  # 최대 5000개의 특징을 사용
X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_test_bow = vectorizer.transform(X_test).toarray()

# K-fold Cross Validation 설정
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [10, 20, -1],
    'num_leaves': [31, 64, 128]
}

# LightGBM 모델 설정
lgbm_model = LGBMClassifier(random_state=42)

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=kfold, scoring='accuracy', verbose=1)

# 모델 학습 및 최적 파라미터 탐색
grid_search.fit(X_train_bow, y_train)

# 최적의 파라미터 및 성능 확인
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_}")

# 최적의 모델로 테스트 데이터 예측
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test_bow)

# 제출 파일 생성
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred
submission_df.to_csv("submit_CountVectorizer_LightGBM.csv", index=False)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1797
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 665
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1757
[LightGBM] [Info] Number of data points in the train 