# 뉴스 그룹 분류 경진대회

`-` 참고: https://dacon.io/competitions/official/235884/codeshare/4739?page=1&dtype=recent

`-` 참고: https://dacon.io/competitions/official/235864/codeshare/4246?page=1&dtype=recent

## 패키지 import및 데이터 전치리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
# nltk.download('all')

In [2]:
SEED = 22

In [3]:
os.chdir('C:\\Users\\Jaesu\\github_desktop\\Dacon-Basic\\뉴스-그룹-분류')

In [4]:
df = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')
submission = pd.read_csv('./Data/sample_submission.csv')

In [276]:
def clean_text(texts): 
    corpus = [] 
    for i in range(0, len(texts)): 
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>]', '', texts[i]) ## @%*=()/+ 와 같은 문장부호 제거
        review = re.sub(r'\d+','', review) ## 숫자 제거
        review = review.lower() ## 소문자 변환
        review = re.sub(r'\s+', ' ', review) ## extra space 제거
        review = re.sub(r'<[^>]+>','',review) ## Html tags 제거
        review = re.sub(r'\s+', ' ', review) ## spaces 제거
        review = re.sub(r"^\s+", '', review) ## space from start 제거
        review = re.sub(r'\s+$', '', review) ## space from the end 제거
        review = re.sub(r'_', ' ', review) ## space from the end 제거
        corpus.append(review) 

    return corpus

In [254]:
df['clean_text'] = clean_text(df['text'])
test['clean_text'] = clean_text(test['text'])

In [7]:
def remove_stopwords(df_text, stopwords_list):
    df_words = df_text.split()
    df_text_without_stopwords  = ' '.join([df_word for df_word in df_words if df_word not in stopwords_list])
    return df_text_without_stopwords

In [8]:
stopwords_list = stopwords.words('english') ## nltk에서 제공하는 불용어사전 이용

In [262]:
df['clean_text'] = df['clean_text'].apply(lambda text: remove_stopwords(text, stopwords_list))
test['clean_text'] = test['clean_text'].apply(lambda text: remove_stopwords(text, stopwords_list))

## 스태킹 (Stacking)

In [142]:
import rich 
from rich.table import Table

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

In [13]:
## accuracy 계산
def accuracy(true, pred):
    return sum(true == pred) / len(true)

In [14]:
def get_pipe(model, model_name: str) -> Pipeline:
    """TfidfVectorizer와 모델을 연결한 파이프라인을 반환하는 함수"""
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
    pipe = Pipeline([
        ('tfidf', tfidf),
        (model_name, model)
    ])
    return pipe

In [15]:
def get_kfold_accuarcy(model, k: int = 5) -> float:
    """모델을 입력받아 KFold 예측 후 accuracy score를 반환하는 함수"""
    skfold = StratifiedKFold(k, shuffle=True, random_state=SEED)
    acc_list = []
    for train_idx, valid_idx in skfold.split(df['clean_text'], df['target']):
        print('training......')
        train, valid = df.iloc[train_idx], df.iloc[valid_idx]
        model.fit(train['clean_text'], train['target'])
        pred = model.predict(valid['clean_text'])
        acc = accuracy(valid['target'], pred)
        acc_list.append(acc)

    return np.mean(acc_list)

In [25]:
models = [
    ('SGD', SGDClassifier(random_state=SEED, n_jobs=-1)),
    ('rfc', RandomForestClassifier(random_state=SEED, min_samples_split=10, n_jobs=-1)),
    ('logistic', LogisticRegression(random_state=SEED, max_iter=500)),
]

In [20]:
model_pipes = [(model_name, get_pipe(model, model_name)) for model_name, model in models]

`-` 부스팅 모델(xgboost, lgbm, catboost)을 쓰면 너무너무 느리다...(에러가 나는건 아닌데 무한 로딩 걸림)

`-` 생각해보니 학습할 파라미터 수가 너무 많다

In [21]:
table = Table(title='Model Comparison Table')
table.add_column('Model Name', justify='left', style='green')
table.add_column('Accuracy', justify='right')

for model_name, model in tqdm(model_pipes, leave=False):
    acc = get_kfold_accuarcy(model)
    table.add_row(model_name, f'{acc:0.4f}')

rich.print(table)

                                                                                                                       

In [24]:
from sklearn.ensemble import StackingClassifier

stack_models = [(model_name, get_pipe(model, model_name)) for model_name, model in models]
stacking = StackingClassifier(stack_models)
acc = get_kfold_accuarcy(stacking)
rich.print(acc)

training......


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training......


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training......


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training......


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training......


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
stacking.fit(df['clean_text'], df['target'])
pred = stacking.predict(test['clean_text'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
submission['target'] = pred
submission.head()

Unnamed: 0,id,target
0,0,3
1,1,16
2,2,11
3,3,8
4,4,7


In [28]:
submission.to_csv('./Data/submission2.csv', index=False)

`-` 점수는 0.6681109185

## 단일 모델

In [277]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [281]:
tfidf.fit(np.array(df['text']))

TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [282]:
train_tfidf = tfidf.transform(df['text'])
test_tfidf = tfidf.transform(test['text'])

In [283]:
target = df['target']

In [284]:
X_train, X_valid, y_train, y_valid = train_test_split(train_tfidf, target, train_size=0.8, test_size=0.2, random_state=SEED, stratify=target)

In [285]:
SGD = SGDClassifier(random_state=SEED, n_jobs=-1)
LSVC = LinearSVC(random_state=SEED, C=5)

In [286]:
LSVC.fit(X_train, y_train)
SGD.fit(X_train, y_train)

SGDClassifier(n_jobs=-1, random_state=22)

In [287]:
print(f'SGD의 valid accuracy: {accuracy(SGD.predict(X_valid), y_valid)}')
print(f'LSVC의 valid accuracy: {accuracy(LSVC.predict(X_valid), y_valid)}')

SGD의 valid accuracy: 0.7693557119653492
LSVC의 valid accuracy: 0.7639415268002165


`-` 전체 데이터로 다시 학습

In [288]:
LSVC.fit(train_tfidf, target)
SGD.fit(train_tfidf, target)

SGDClassifier(n_jobs=-1, random_state=22)

In [289]:
pred1 = SGD.predict(test_tfidf)
pred2 = LSVC.predict(test_tfidf)

In [299]:
submission['target'] = pred1
submission.head()

Unnamed: 0,id,target
0,0,3
1,1,16
2,2,11
3,3,8
4,4,13


In [297]:
submission.to_csv('./Data/submission_SGD.csv', index=False)

In [300]:
submission['target'] = pred2
submission.to_csv('./Data/submission_LSVC.csv', index=False)

`-` 점수는 0.7350519931

`-` 단일 모델이 위의 스태킹 모델보다 점수가 훨씬 잘 나왔다

`-` 텍스트를 숫자로 변환하는 방법이 다르고 스태킹에 성능이 낮은 모델이 섞여있어서 그런 것 같다

`-` 근데 텍스트를 숫자로 변환하는 방법의 차이가 영향이 더 큰 것 같다

## TF-IDF + Stacking

In [301]:
def get_pipe2(model, model_name: str) -> Pipeline:
    """TfidfVectorizer와 모델을 연결한 파이프라인을 반환하는 함수"""
    tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    pipe = Pipeline([
        ('tfidf', tfidf),
        (model_name, model)
    ])
    return pipe

In [302]:
models = [
    ('SGD', SGDClassifier(random_state=SEED, n_jobs=-1)),
    ('LSVC', LinearSVC(random_state=SEED, C=5.0)),
]

In [303]:
stack_models = [(model_name, get_pipe2(model, model_name)) for model_name, model in models]
stacking = StackingClassifier(stack_models)

In [304]:
stacking.fit(df['text'], target)
pred = stacking.predict(test['text'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [305]:
submission['target'] = pred
submission.head()

Unnamed: 0,id,target
0,0,3
1,1,16
2,2,11
3,3,8
4,4,13


In [309]:
submission.to_csv('./Data/submission_stacking.csv', index=False)