# 뉴스 그룹 분류 경진대회

`-` 참고: https://dacon.io/competitions/official/235884/codeshare/4739?page=1&dtype=recent

`-` 참고: https://dacon.io/competitions/official/235864/codeshare/4246?page=1&dtype=recent

## 패키지 import 및 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
# nltk.download('all')

In [2]:
# Random seed passed as random_state to the models and the CV splitter below.
SEED = 22

In [3]:
# NOTE(review): hard-coded, machine-specific working directory; the relative
# './Data/...' paths used below are resolved against it.
os.chdir('C:\\Users\\Jaesu\\github_desktop\\Dacon-Basic\\뉴스-그룹-분류')

In [4]:
# Load the training set, the test set and the sample submission file.
df = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')
submission = pd.read_csv('./Data/sample_submission.csv')

In [5]:
def clean_text(texts):
    """Normalize raw documents before vectorization.

    For each text, a fixed set of punctuation/symbol characters and every run
    of digits is deleted, the text is lower-cased, and whitespace is collapsed
    to single spaces and trimmed from both ends.

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series). Items are
            visited in iteration order rather than looked up by position, so
            a Series whose index is not 0..n-1 is handled as well.

    Returns:
        list[str]: The cleaned texts, in input order.
    """
    # Characters in this class are deleted outright; no space is inserted in
    # their place (this includes newlines and '_').
    punctuation = re.compile(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>]')
    digits = re.compile(r'\d+')
    whitespace = re.compile(r'\s+')
    html_tag = re.compile(r'<[^>]+>')

    corpus = []
    for text in texts:
        review = punctuation.sub('', text)  # strip punctuation such as @%*=()/+
        review = digits.sub('', review)  # strip digits
        review = review.lower()
        review = whitespace.sub(' ', review)  # collapse runs of whitespace
        # NOTE(review): the punctuation pass has already deleted every '>',
        # so this pattern cannot match here. Running it before the punctuation
        # pass would really strip tags, but would change the cleaned output.
        review = html_tag.sub('', review)
        # A separate pass replacing '_' with a space is not needed: '_' is
        # already deleted by the punctuation pass.
        corpus.append(review.strip())

    return corpus

In [6]:
# Store the cleaned version of the raw 'text' column for both splits.
df['clean_text'] = clean_text(df['text'])
test['clean_text'] = clean_text(test['text'])

In [7]:
def remove_stopwords(df_text, stopwords_list):
    """Return *df_text* with every stop word removed.

    Args:
        df_text: Text to filter; it is split on whitespace into tokens.
        stopwords_list: Collection of words to drop. Matching is exact and
            case-sensitive.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Build a set once so each token costs one hash lookup instead of a scan
    # of the whole stop-word list.
    if isinstance(stopwords_list, (set, frozenset)):
        stopword_set = stopwords_list
    else:
        stopword_set = set(stopwords_list)
    return ' '.join(word for word in df_text.split() if word not in stopword_set)

In [8]:
stopwords_list = stopwords.words('english') ## use the English stop-word list provided by nltk

In [9]:
# Drop stop words from the cleaned text of both splits; stopwords_list is
# forwarded to remove_stopwords as its second positional argument.
df['clean_text'] = df['clean_text'].apply(remove_stopwords, args=(stopwords_list,))
test['clean_text'] = test['clean_text'].apply(remove_stopwords, args=(stopwords_list,))

## 모델링

In [10]:
import rich 
from rich.table import Table

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

  from pandas import MultiIndex, Int64Index


In [11]:
def get_pipe(model, model_name: str) -> Pipeline:
    """Return a pipeline that chains a TfidfVectorizer with *model*.

    The vectorizer step is named ``'tfidf'`` and uses character n-grams of
    length 1 to 3; the estimator step is named *model_name*.
    """
    steps = [
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), analyzer='char')),
        (model_name, model),
    ]
    return Pipeline(steps)

In [12]:
# Candidate classifiers as (step name, estimator) pairs. SEED is passed as
# random_state to every entry except the k-NN model.
models = [
    # NOTE(review): XGBoost documents its multiclass objectives as
    # 'multi:softmax' / 'multi:softprob' — confirm that 'softmax' is accepted.
    ('xgb', XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=SEED,
                          max_depth=7, objective='softmax', num_class=20, use_label_encoder=False,
                          eval_metric='mlogloss')),
    # NOTE(review): CatBoost documents 'MultiClass' as its multiclass loss —
    # verify that objective='softmax' is accepted.
    ('cb', CatBoostClassifier(n_estimators=500, learning_rate=0.05, random_state=SEED, max_depth=7, objective='softmax')),
    ('lgbm', LGBMClassifier(n_estimators=500, learning_rate=0.05, random_state=SEED, max_depth=7, objective='softmax')),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('MLP', MLPClassifier(random_state=SEED)),
]

In [13]:
# model_pipes = [(model_name, get_pipe(model, model_name)) for model_name, model in models]

`-` 학습이 중간에 멈춘다...(에러가 나는건 아닌데 무한 로딩 걸림)

`-` 생각해보니 학습할 파라미터 수가 너무 많아서 GPU 없으면 못할 듯

In [14]:
from sklearn.ensemble import StackingClassifier

# stack_models = [(model_name, get_pipe(model, model_name)) for model_name, model in models]

# stacking = StackingClassifier(stack_models)
# acc = return_kfold_accuarcy(stacking)
# rich.print(acc)

In [15]:
# 5-fold stratified splitter; shuffling is seeded with SEED.
skfold = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)

In [21]:
# Standalone vectorizer with 1- to 3-grams. No analyzer is given here, so the
# library default applies (get_pipe, by contrast, sets analyzer='char').
tfidf = TfidfVectorizer(ngram_range=(1, 3))

In [22]:
# Fit the vectorizer on the cleaned training text only.
tfidf.fit(np.array(df['clean_text']))

TfidfVectorizer(ngram_range=(1, 3))

In [23]:
# Vectorize both splits with the vectorizer fitted on the training text.
train_tfidf = tfidf.transform(df['clean_text'])
test_tfidf = tfidf.transform(test['clean_text'])

In [24]:
# Display the summary of the training matrix (shape and stored elements).
train_tfidf

<9233x1641825 sparse matrix of type '<class 'numpy.float64'>'
	with 2324459 stored elements in Compressed Sparse Row format>

`-` 피처(특성) 수가 160만을 넘는다...

In [25]:
# Same settings as the 'lgbm' entry in models, but without an explicit objective.
lgbm = LGBMClassifier(n_estimators=500, learning_rate=0.05, random_state=SEED, max_depth=7)

In [26]:
# Same configuration as the 'xgb' entry in models.
# NOTE(review): XGBoost documents its multiclass objectives as
# 'multi:softmax' / 'multi:softprob' — confirm that 'softmax' is accepted.
xgb = XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=SEED,
                    max_depth=7, objective='softmax', num_class=20, use_label_encoder=False,
                    eval_metric='mlogloss')

In [27]:
#lgbm.fit(train_tfidf, df['target'])

In [28]:
#pred = xgb.predict(test_tfidf)

In [29]:
# submission['target'] = pred
# submission.head()

In [30]:
#submission.to_csv('./Data/submission2.csv', index=False)