# 뉴스 그룹 분류 경진대회

## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# nltk.download('all')  # Run this once if the NLTK data has not been downloaded yet.

In [4]:
# Random seed; passed as random_state to the StratifiedKFold splitter below.
SEED = 22

In [5]:
# Load the training set, the test set, and the sample submission template
# from the local ./Data directory.
df = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')
submission = pd.read_csv('./Data/sample_submission.csv')

## 텍스트 전처리

In [6]:
def remove_stopwords(text, stopwords_list):
    """Return *text* with every token that appears in *stopwords_list* removed.

    Args:
        text: Input string. It is split on runs of whitespace.
        stopwords_list: Iterable of stop words. Matching is exact and
            case-sensitive (no lowercasing or punctuation stripping is done).

    Returns:
        The remaining tokens joined by single spaces. The original spacing
        of *text* is therefore not preserved.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the whole stop-word list for every token.
    stopword_set = frozenset(stopwords_list)
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [7]:
stopwords_list = stopwords.words('english')  # use the English stop-word list provided by NLTK

In [8]:
# Add a stop-word-free copy of the raw text to both the train and test frames.
# The stop-word list is forwarded to remove_stopwords as an extra positional argument.
df['text_without_stopwords'] = df['text'].apply(remove_stopwords, args=(stopwords_list,))
test['text_without_stopwords'] = test['text'].apply(remove_stopwords, args=(stopwords_list,))

## 모델 생성

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import ComplementNB

In [10]:
def accuracy(true, pred):
    """Compute classification accuracy.

    Args:
        true: Array-like of ground-truth labels (list, tuple, NumPy array,
            or pandas Series).
        pred: Array-like of predicted labels with the same shape as *true*.

    Returns:
        Fraction of positions at which *true* and *pred* are equal.

    Raises:
        ValueError: If the inputs have different shapes or are empty.
    """
    # Convert to NumPy so the comparison is always element-wise and positional;
    # plain lists would otherwise compare as a single bool, and pandas objects
    # would be aligned by index label.
    true_arr = np.asarray(true)
    pred_arr = np.asarray(pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"shape mismatch: true has shape {true_arr.shape}, pred has shape {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    return np.mean(true_arr == pred_arr)

In [11]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)), with
# scikit-learn's built-in English stop-word list enabled.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [12]:
# Fit the vectorizer on the training text and reuse the fitted vectorizer
# to transform the test text.
# NOTE(review): the vectorizer is fit on all training rows before the
# cross-validation split, so validation folds contribute to the fitted
# vocabulary/IDF statistics; validation scores may be slightly optimistic.
train_tfidf = tfidf.fit_transform(df['text_without_stopwords'])
test_tfidf = tfidf.transform(test['text_without_stopwords'])

In [13]:
# Label column of the training frame; passed to OOF_predict as y.
target = df['target']

In [14]:
# Stratified k-fold splitter with k = 15, shuffling enabled and seeded with SEED.
k = 15
skfold = StratifiedKFold(k, shuffle=True, random_state=SEED)

In [15]:
CNB = ComplementNB(alpha=0.2)  # create the Complement Naive Bayes model

## 모델 학습 및 test 예측

In [16]:
def OOF_predict(model, X, y, test, SKfold):
    """Predict the test set by averaging class probabilities over CV folds.

    For every split produced by *SKfold*, *model* is re-fit on the training
    part, scored on the validation part, and used to predict class
    probabilities for *test*. The per-fold test probabilities are averaged
    and the most probable class label is returned for each test row.

    Args:
        model: Estimator exposing ``fit``, ``predict_proba`` and ``classes_``.
            It is re-fit in place on every fold.
        X: Training feature matrix supporting row selection with an integer
            index array (e.g. a NumPy array or SciPy sparse matrix).
        y: Array-like of training labels (pandas Series, NumPy array, list).
        test: Test feature matrix with a ``shape`` attribute.
        SKfold: Splitter exposing ``split(X, y)`` and ``n_splits``.

    Returns:
        A tuple ``(model_valid_score, model_predict)``: the list of per-fold
        validation accuracies, and a 1-D array with one predicted class label
        per test row.
    """
    # Work on a plain NumPy array so fold indices select rows by position.
    # Indexing a pandas Series with ``y[idx]`` is label-based and breaks (or
    # silently picks wrong rows) when the index is not the default 0..n-1.
    y_arr = np.asarray(y)
    classes = np.unique(y_arr)

    model_valid_score = []
    test_proba = np.zeros(shape=(test.shape[0], len(classes)))

    for train_idx, valid_idx in SKfold.split(X, y_arr):
        X_train, y_train = X[train_idx], y_arr[train_idx]
        X_valid, y_valid = X[valid_idx], y_arr[valid_idx]

        model.fit(X_train, y_train)
        fold_classes = model.classes_

        # argmax yields a column index into predict_proba's output; map it
        # through classes_ so the prediction is an actual class label.
        valid_pred = fold_classes[model.predict_proba(X_valid).argmax(-1)]
        model_valid_score.append(accuracy(y_valid, valid_pred))  # validation accuracy

        # Place each fold's probability columns at the position of the
        # corresponding label in the global class list before accumulating.
        columns = np.searchsorted(classes, fold_classes)
        test_proba[:, columns] += model.predict_proba(test) / SKfold.n_splits

    model_predict = classes[test_proba.argmax(-1)]
    return model_valid_score, model_predict

In [17]:
# Run the out-of-fold procedure with the CNB model: collect per-fold
# validation accuracies and the test-set predictions.
CNB_valid_acc, CNB_test_pred = OOF_predict(CNB, train_tfidf, target, test_tfidf, skfold)

In [18]:
# Mean validation accuracy across the folds.
np.mean(CNB_valid_acc)

0.7870654277971353

`-` 예측 데이터 제출

In [19]:
# Write the test predictions into the submission frame and preview the first rows.
submission['target'] = CNB_test_pred
submission.head()

Unnamed: 0,id,target
0,0,3
1,1,16
2,2,11
3,3,8
4,4,7


In [20]:
# Save the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('./Data/submission_CNB-OOF.csv', index=False)