In [85]:
import pandas as pd
import re

from konlpy.tag import Okt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier

In [73]:
# Read the training and test CSVs from the sibling Data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

In [81]:
# Korean stop words; tokens equal to any of these are dropped from the vocabulary.
stopword = ['이','있','하','것','들','그','되','수','보','않','없','나','이나','을','를','은','는','가','에','에게','의','다','이다','하다']
okt = Okt()
# Character class matching anything that is NOT a Latin letter or a Hangul
# jamo/syllable. Kept for reference only: it is no longer handed to the
# vectorizer (see the note below).
tokenpattern = r"[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]"

# NOTE: `token_pattern` is deliberately not passed. scikit-learn only uses it
# when `tokenizer` is None; combined with a custom tokenizer it is ignored and
# a UserWarning is emitted, so passing it had no effect on the features.
vectorizer = TfidfVectorizer(
    min_df=3,              # ignore terms found in fewer than 3 documents
    stop_words=stopword,   # stop words removed from the token stream
    tokenizer=okt.morphs,  # tokenize each document with Okt's morphs()
)

In [88]:
# Learn the TF-IDF vocabulary on the training documents and vectorize them in
# a single pass (equivalent to fit() followed by transform() on the same data).
X = vectorizer.fit_transform(train.document)
X = X.toarray()  # convert the sparse TF-IDF matrix to a dense array
y = train.label

# The test documents are only transformed, reusing what was fitted on train.
X_test = vectorizer.transform(test.document)
X_test = X_test.toarray()

# shuffle=True without a seed yields different folds on every run; pin the
# seed so the cross-validation splits (and scores) are reproducible.
kf = StratifiedKFold(shuffle=True, random_state=42)



In [90]:
patience = 300  # early_stopping_rounds passed to CatBoost
verboses = 100  # CatBoost `verbose` setting (progress line every 100 iterations)

# Keep every fold's classifier and validation accuracy instead of discarding
# them when `model` is rebound on the next iteration.
fold_models = []
fold_scores = []

for train_index, val_index in kf.split(X, y):
    X_train_, X_val = X[train_index], X[val_index]
    # kf.split yields positional indices, so select labels by position;
    # plain y[...] is label-based and only works with a default RangeIndex.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    model = CatBoostClassifier(early_stopping_rounds=patience,
                               eval_metric='Accuracy',
                               learning_rate=0.5,
                               verbose=verboses)
    model.fit(X_train_, y_train,
              eval_set=[(X_val, y_val)])

    fold_models.append(model)
    fold_scores.append(accuracy_score(y_val, model.predict(X_val)))

# `model` still refers to the classifier trained on the last fold, so later
# cells that use it keep working unchanged.
print(f"fold accuracies: {fold_scores}")
print(f"mean validation accuracy: {sum(fold_scores) / len(fold_scores):.4f}")

0:	learn: 0.5837500	test: 0.5710000	best: 0.5710000 (0)	total: 36.4ms	remaining: 36.3s
100:	learn: 0.9580000	test: 0.8170000	best: 0.8280000 (58)	total: 2.57s	remaining: 22.9s
200:	learn: 0.9985000	test: 0.8170000	best: 0.8280000 (58)	total: 4.99s	remaining: 19.8s
300:	learn: 0.9992500	test: 0.8200000	best: 0.8280000 (58)	total: 7.6s	remaining: 17.6s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.828
bestIteration = 58

Shrink model to first 59 iterations.
0:	learn: 0.5852500	test: 0.5730000	best: 0.5730000 (0)	total: 23.3ms	remaining: 23.3s
100:	learn: 0.9595000	test: 0.7890000	best: 0.7910000 (99)	total: 2.58s	remaining: 22.9s
200:	learn: 0.9987500	test: 0.8060000	best: 0.8140000 (199)	total: 4.99s	remaining: 19.8s
300:	learn: 0.9992500	test: 0.8060000	best: 0.8150000 (203)	total: 7.61s	remaining: 17.7s
400:	learn: 0.9997500	test: 0.8140000	best: 0.8200000 (336)	total: 9.99s	remaining: 14.9s
500:	learn: 0.9997500	test: 0.8270000	best: 0.8270000 (489)	total: 12.7

0.822

In [91]:
# Predict labels for the vectorized test set.
# NOTE(review): `model` is rebound on every cross-validation fold, so this
# presumably uses only the classifier from the last fold — confirm intended.
pred = model.predict(X_test)
pred

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

In [94]:
# Load the submission template from the same relative Data directory used for
# train/test instead of a machine-specific absolute path.
# NOTE(review): assumes sample_submission.csv sits next to train.csv — confirm.
submission = pd.read_csv('../Data/sample_submission.csv')
submission['label'] = pred  # fill in the predicted labels

submission.to_csv('../Data/submission06TFIDF.csv', index=False)

# Show a preview as the cell's last expression (a bare name in the middle of
# a cell is not displayed).
submission.head()