In [1]:
# ! pip install -q pandas numpy tensorboard catboost scipy matplotlib scikit-learn xgboost nltk colorlog pathlib spacy seaborn optuna mlflow ipywidgets
# ! spacy download en_core_web_sm

In [2]:
from src.logger import setup_logger, ROOT_DIR

import logging

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from dataclasses import dataclass

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

setup_logger(
    level=logging.INFO,
    stdout_log=True,
    file_log=False
)



In [3]:
df_news = pd.read_json(ROOT_DIR / 'data/bbc_articles.json')

In [4]:
df_news.sample(3)

Unnamed: 0,article_id,title,category,tags,summary,text
974,https://www.bbc.com/news/videos/cvg6zrvdq7qo,Watch: Huge waves strike Peruvian coastline,videos,No Tags Available,Videos show waves reported to be up to four me...,Three-quarters of all ports in Peru have been ...
111,https://www.bbc.com/news/business-51444789,"Storms: What are my rights if my home, travel ...",business_51444789,No Tags Available,Your rights explained as residents and busines...,Storms and wintry weather often cause extensiv...
373,https://www.bbc.com/news/videos/ckgrrx377xpo,Homes and roads swamped by Brazil floodwaters,videos,No Tags Available,A state of emergency has been declared in sout...,A state of emergency has been declared in sout...


In [5]:
@dataclass
class PreprocessParams:
    spacy_model: str = "en_core_web_sm"
    remove_punct: bool = True
    custom_punct: str = r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
    remove_stopwords: bool = True
    lemmatize: bool = True
    stem: bool = False
    lowercase: bool = True
    min_token_length: int = 2
    verbose: bool = False

@dataclass
class TrainingParams:
    test_size: float = 0.2
    random_state: int = 42
    shuffle_split: bool = True

In [6]:
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, params: PreprocessParams):
        self.params = params

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        X = X.astype(str)
        if self.params.lowercase:
            X = X.str.lower()
        if self.params.remove_punct:
            punct_pattern = self.params.custom_punct + ']' if not self.params.custom_punct.endswith(']') else self.params.custom_punct
            X = X.str.replace(punct_pattern, ' ', regex=True)
        X = X.str.replace(r'\s+', ' ', regex=True).str.strip()
        return X

class SpacyTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, params: PreprocessParams):
        self.params = params
        self.nlp = spacy.load(params.spacy_model, disable=["parser", "ner"])

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        tokenized = X.apply(lambda text: [token.text for token in self.nlp(str(text))])
        return tokenized

class TokenProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, params: PreprocessParams):
        self.params = params
        self.stopwords = set(STOP_WORDS)
        self.nlp = spacy.load(params.spacy_model, disable=["parser", "ner"])
        self.stemmer = PorterStemmer() if params.stem else None

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        def _process_row(row):
            filtered = []
            for token in row:
                if len(token) < self.params.min_token_length:
                    continue
                if self.params.remove_stopwords and token.lower() in self.stopwords:
                    continue
                if self.params.lemmatize:
                    token = self.nlp(token)[0].lemma_
                elif self.params.stem:
                    token = self.stemmer.stem(token)
                filtered.append(token)
            return ' '.join(filtered)

        processed = X.apply(_process_row)
        return processed

In [7]:
def create_text_pipeline(params: PreprocessParams):
    return Pipeline([
        ('cleaner', TextCleaner(params)),
        ('tokenizer', SpacyTokenizer(params)),
        ('processor', TokenProcessor(params)),
        ('vectorizer', TfidfVectorizer())
    ], verbose=params.verbose)

def get_feature_pipeline(params: PreprocessParams):
    text_columns = ['text', 'title', 'summary']
    text_transformers = [
        (f'{col}_pipeline', create_text_pipeline(params), col)
        for col in text_columns
    ]

    transformer = ColumnTransformer(
        transformers=[
            *text_transformers,
            ('tags_preprocess', OneHotEncoder(sparse_output=False), ['tags']),
        ],
        remainder='drop',
        verbose=params.verbose,
        sparse_threshold=1,
    )

    pipe = Pipeline(steps=[('column_processor', transformer)], verbose=params.verbose)
    if params.verbose:
        display(pipe)
    return pipe

In [8]:
def prepare_data_splits(
            _df: pd.DataFrame,
            _preprocess_params: PreprocessParams, 
            _train_params: TrainingParams):
               
        le = LabelEncoder()
        _df['category'] = le.fit_transform(_df['category'].astype(str))

        X = _df.drop(['category'], axis=1)
        y = _df.pop('category')

        pipe_feature = get_feature_pipeline(_preprocess_params)
        X_transformed = pipe_feature.fit_transform(X)

        _X_train, _X_test, _y_train, _y_test = train_test_split(
                X_transformed.toarray(),
                y,
                test_size=_train_params.test_size, 
                random_state=_train_params.random_state,
                shuffle=_train_params.shuffle_split
        )

        return _X_train, _X_test, _y_train, _y_test, pipe_feature, le

In [9]:
preprocess_params = PreprocessParams(
        spacy_model="en_core_web_sm",
        remove_punct=True,
        custom_punct=r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',
        remove_stopwords=True,
        lemmatize=True,
        stem=False,
        lowercase=True,
        min_token_length=2,
        verbose=True
)

train_params = TrainingParams(
        test_size=0.2,
        random_state=42,
        shuffle_split=True
)

X_train, X_test, y_train, y_test, x_estim, y_estim = prepare_data_splits(
    df_news.copy(deep=True), 
    preprocess_params, 
    train_params
)

[Pipeline] ........... (step 1 of 4) Processing cleaner, total=   0.1s
[Pipeline] ......... (step 2 of 4) Processing tokenizer, total=  24.0s
[Pipeline] ......... (step 3 of 4) Processing processor, total= 5.7min
[Pipeline] ........ (step 4 of 4) Processing vectorizer, total=   0.1s
[ColumnTransformer] . (1 of 4) Processing text_pipeline, total= 6.1min
[Pipeline] ........... (step 1 of 4) Processing cleaner, total=   0.0s
[Pipeline] ......... (step 2 of 4) Processing tokenizer, total=   1.7s
[Pipeline] ......... (step 3 of 4) Processing processor, total=   6.5s
[Pipeline] ........ (step 4 of 4) Processing vectorizer, total=   0.0s
[ColumnTransformer]  (2 of 4) Processing title_pipeline, total=   8.2s
[Pipeline] ........... (step 1 of 4) Processing cleaner, total=   0.0s
[Pipeline] ......... (step 2 of 4) Processing tokenizer, total=   2.1s
[Pipeline] ......... (step 3 of 4) Processing processor, total=  10.6s
[Pipeline] ........ (step 4 of 4) Processing vectorizer, total=   0.0s
[Colum

In [11]:
from catboost import CatBoostClassifier

estim = CatBoostClassifier(task_type="GPU", devices='0')
estim.fit(X_train, y_train)

Learning rate set to 0.063811
0:	learn: 2.4510185	total: 4.15s	remaining: 1h 9m 5s
1:	learn: 2.0653385	total: 4.29s	remaining: 35m 40s
2:	learn: 1.8220999	total: 4.45s	remaining: 24m 37s
3:	learn: 1.6327605	total: 4.58s	remaining: 19m 1s
4:	learn: 1.4800560	total: 4.7s	remaining: 15m 34s
5:	learn: 1.3577042	total: 4.83s	remaining: 13m 20s
6:	learn: 1.2691382	total: 5.03s	remaining: 11m 53s
7:	learn: 1.1750961	total: 5.17s	remaining: 10m 40s
8:	learn: 1.0972338	total: 5.3s	remaining: 9m 43s
9:	learn: 1.0283072	total: 5.45s	remaining: 8m 59s
10:	learn: 0.9633508	total: 5.59s	remaining: 8m 22s
11:	learn: 0.9044351	total: 5.71s	remaining: 7m 50s
12:	learn: 0.8548807	total: 5.86s	remaining: 7m 25s
13:	learn: 0.8077302	total: 6s	remaining: 7m 2s
14:	learn: 0.7682417	total: 6.19s	remaining: 6m 46s
15:	learn: 0.7319812	total: 6.33s	remaining: 6m 29s
16:	learn: 0.6973888	total: 6.48s	remaining: 6m 14s
17:	learn: 0.6641441	total: 6.63s	remaining: 6m 1s
18:	learn: 0.6341115	total: 6.79s	remaining

<catboost.core.CatBoostClassifier at 0x7fea91e17b20>

In [15]:
X_train.shape

(920, 23474)

In [12]:
from sklearn.metrics import classification_report

y_pred = estim.predict(X_test)
pd.options.display.float_format = '{:,.2f}'.format
display(pd.DataFrame(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0, output_dict=True)))

Unnamed: 0,2,9,12,15,17,18,22,25,29,30,32,34,accuracy,macro avg,weighted avg
precision,0.96,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.0,0.93,0.24,0.91
recall,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.76,0.0,0.5,0.0,0.93,0.27,0.93
f1-score,0.98,0.86,0.0,0.0,0.0,0.0,0.0,0.0,0.87,0.0,0.29,0.0,0.93,0.25,0.92
support,195.0,6.0,1.0,3.0,1.0,1.0,1.0,1.0,17.0,1.0,2.0,1.0,0.93,230.0,230.0
