In [5]:
! pip install -q pandas numpy catboost scipy matplotlib scikit-learn xgboost nltk colorlog pathlib spacy seaborn optuna mlflow ipywidgets

In [8]:
# Fetch the small English spaCy model. Its name matches the spacy_model
# default that the preprocessing classes below pass to spacy.load().
! spacy download en_core_web_sm

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
from src.logger import setup_logger, ROOT_DIR
from pathlib import Path
from dataclasses import dataclass

import logging

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Configure the project logger: INFO level, stdout_log on, file_log off.
setup_logger(level=logging.INFO, stdout_log=True, file_log=False)



In [10]:
# Load the BBC articles JSON file stored under ROOT_DIR/data.
df_news = pd.read_json(ROOT_DIR / 'data/bbc_articles.json')

In [12]:
# Preview three random rows; random_state is fixed so that re-running the
# notebook displays the same rows.
df_news.sample(3, random_state=42)

Unnamed: 0,article_id,title,category,tags,summary,text
805,https://www.bbc.com/news/articles/c5yerz18wrpo,Views sought on Surrey countryside,articles,No Tags Available,Surrey County Council owns and manages over 10...,People have been asked to take part in a surve...
1087,https://www.bbc.com/news/videos/cn9375g2n8eo,Fly alongside a stunning starling murmuration,videos,No Tags Available,A pro-videographer waited weeks for the perfec...,A drone videographer has captured a starling m...
882,https://www.bbc.com/news/scotland/south_scotland,South Scotland,scotland,No Tags Available,"Get all the latest news, live updates and cont...",'Survival mode': Storm Éowyn left us without p...


In [None]:
@dataclass
class PreprocessParams:
    """Configuration switches shared by the text-preprocessing transformers."""

    # Name of the spaCy pipeline handed to spacy.load().
    spacy_model: str = "en_core_web_sm"
    # When True, every match of custom_punct is replaced by a space.
    remove_punct: bool = True
    # Regex character class covering ASCII punctuation. '-', '[', '\' and ']'
    # are escaped: an unescaped ']' after the backslash would close the class
    # early and turn the rest of the pattern into an alternation that never
    # matches ordinary punctuation.
    custom_punct: str = r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]'
    # When True, tokens found in the stop-word set are dropped.
    remove_stopwords: bool = True
    # When True, tokens are replaced by their lemma (checked before stem).
    lemmatize: bool = True
    # When True and lemmatize is False, tokens are stemmed instead.
    stem: bool = False
    # When True, text is lower-cased during cleaning.
    lowercase: bool = True
    # Tokens with fewer characters than this are dropped.
    min_token_length: int = 2

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer

class TextCleaner(BaseEstimator, TransformerMixin):
    """Transformer that normalises raw text before tokenisation.

    Depending on ``params`` it replaces punctuation matches with spaces,
    collapses whitespace runs, and lower-cases the text.
    """

    def __init__(self, params: PreprocessParams):
        self.params = params

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` cast to ``str`` and cleaned according to ``self.params``.

        ``X`` must provide ``astype``, ``apply`` and the ``.str`` accessor
        (e.g. a pandas Series). Whitespace is collapsed and trimmed only as
        part of the punctuation step, i.e. when ``remove_punct`` is set.
        """
        cfg = self.params
        cleaned = X.astype(str)

        if cfg.remove_punct:
            def _strip_punct(text):
                # Punctuation becomes a space first, then runs of whitespace
                # are squeezed to a single space and the ends are trimmed.
                spaced = re.sub(cfg.custom_punct, ' ', text)
                return re.sub(r'\s+', ' ', spaced).strip()

            cleaned = cleaned.apply(_strip_punct)

        if cfg.lowercase:
            cleaned = cleaned.str.lower()

        return cleaned

class SpacyTokenizer(BaseEstimator, TransformerMixin):
    """Transformer that splits each text into a list of token strings."""

    def __init__(self, params: PreprocessParams):
        self.params = params
        # The parser and NER components are excluded when loading the pipeline.
        self.nlp = spacy.load(params.spacy_model, disable=["parser", "ner"])

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Map every text in ``X`` to the list of its token texts.

        ``X`` must provide ``apply`` (e.g. a pandas Series of strings); the
        result keeps the same index.
        """
        # Only token.text is read, so run the tokenizer alone via make_doc
        # instead of the whole pipeline: the token texts are the same and the
        # remaining components are skipped.
        tokenize = self.nlp.make_doc
        return X.apply(lambda text: [token.text for token in tokenize(text)])

class TokenProcessor(BaseEstimator, TransformerMixin):
    """Transformer that filters and normalises lists of token strings.

    For each token list it drops short tokens and (optionally) stop words,
    then lemmatises or stems what remains, as selected by ``params``.
    """

    def __init__(self, params: PreprocessParams):
        self.params = params
        self.stopwords = set(STOP_WORDS)
        self.nlp = spacy.load(params.spacy_model, disable=["parser", "ner"])
        # A stemmer is only built when stemming was requested.
        self.stemmer = PorterStemmer() if params.stem else None

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def _lemmatize(self, token, cache):
        """Return the lemma of ``token``, memoised in ``cache``."""
        if token not in cache:
            doc = self.nlp(token)
            # Only the first token of the resulting doc is used. An empty doc
            # (e.g. from an empty string) falls back to the token itself
            # instead of raising IndexError.
            cache[token] = doc[0].lemma_ if len(doc) else token
        return cache[token]

    def transform(self, X):
        """Return a Series of processed token lists, one per element of ``X``.

        ``X`` is an iterable of token lists. When it is a pandas Series its
        index is carried over to the result, so the output stays aligned with
        the input.
        """
        cfg = self.params
        # Each distinct token string is sent through the spaCy pipeline once
        # per call rather than once per occurrence.
        lemma_cache = {}
        processed_tokens = []
        for tokens in X:
            filtered = []
            for token in tokens:
                # Length filtering
                if len(token) < cfg.min_token_length:
                    continue

                # Stopword removal (case-insensitive lookup)
                if cfg.remove_stopwords and token.lower() in self.stopwords:
                    continue

                # Lemmatization takes precedence over stemming
                if cfg.lemmatize:
                    token = self._lemmatize(token, lemma_cache)
                elif cfg.stem:
                    token = self.stemmer.stem(token)

                filtered.append(token)
            processed_tokens.append(filtered)

        index = X.index if isinstance(X, pd.Series) else None
        return pd.Series(processed_tokens, index=index)