In [1]:
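# One-time environment setup: uncomment and run on a fresh environment.
# `%pip` installs into the environment of the running kernel.
# (`pathlib` is part of the standard library and does not need to be installed.)
# %pip install -q pandas numpy catboost scipy matplotlib scikit-learn xgboost nltk colorlog spacy seaborn optuna mlflow ipywidgets
# ! spacy download en_core_web_sm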

In [2]:
from src.logger import setup_logger, ROOT_DIR
from pathlib import Path
from dataclasses import dataclass

import logging

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from src.ml_utils.config import PreprocessParams
from src.ml_utils.transformers import (
    TextCleaner, 
    SpacyTokenizer, 
    TokenProcessor
)

from sklearn.pipeline import Pipeline


# Configure the project logger: INFO level, stdout_log enabled, file_log disabled.
setup_logger(level=logging.INFO, stdout_log=True, file_log=False)



In [3]:
# Read the BBC articles JSON file (located under ROOT_DIR) into a DataFrame.
df_news = pd.read_json(path_or_buf=ROOT_DIR / 'data/bbc_articles.json')

In [4]:
# Display the loaded frame; the rendered output below is truncated to the first and last rows.
df_news

Unnamed: 0,article_id,title,category,tags,summary,text
0,https://www.bbc.com/news/articles/c0jnd4eg6ndo,Garmin users say smartwatches have stopped wor...,articles,No Tags Available,"Its Fenix 8 smartwatches, which retail for jus...",Smartwatch firm Garmin is facing a backlash af...
1,https://www.bbc.com/news/articles/cq8keqv7x54o,Thousands without power days after Storm Éowyn,articles,No Tags Available,Energy suppliers are working to restore power ...,Thousands of homes are still without power day...
2,https://www.bbc.com/news/articles/c74xwp8lxzno,Students win engineering apprenticeship awards,articles,No Tags Available,One student per engineering group year at a Gu...,Four Guernsey students have received Apprentic...
3,https://www.bbc.com/news/articles/c0ew28eq2g7o,Toddler nearly runs off cliff at Hawaii volcano,articles,No Tags Available,The little boy wandered off from his family an...,A Hawaii national park has issued a new warnin...
4,https://www.bbc.com/news/articles/c20px1e05w0o,Trump vows to leave Paris climate agreement an...,articles,No Tags Available,"The White House announces a ""national energy e...",President Donald Trump has once again vowed to...
...,...,...,...,...,...,...
1145,https://www.bbc.com/news/articles/cdxz2ywq0wpo,Stranded whale died despite marine rescue effort,articles,No Tags Available,The pilot whale was stranded near Stallingboro...,A whale died after it became stranded along a ...
1146,https://www.bbc.com/news/articles/cn01xjp0yq4o,Farm worker contracts human case of bird flu,articles,No Tags Available,The UK Health Security Agency says cases of bi...,A person working on a farm has contracted the ...
1147,https://www.bbc.com/news/articles/c047422e6y4o,"Contract could end after girl, 5, fined £1,000",articles,No Tags Available,Harrow Council has threatened to terminate a c...,A council has threatened to terminate a contra...
1148,https://www.bbc.com/news/articles/cx24nj78lpjo,Town counts cost as ferries resume after storm,articles,No Tags Available,A full daily service to and from Dublin has re...,The UK's second busiest passenger port is retu...


In [5]:
from sklearn.compose import ColumnTransformer

def NLP_pipe(params: PreprocessParams) -> Pipeline:
    """Assemble the three-stage text preprocessing pipeline.

    Stages, in execution order: ``cleaner`` (TextCleaner), ``tokenizer``
    (SpacyTokenizer) and ``processor`` (TokenProcessor). Every stage is
    constructed with the same ``params`` object.

    Args:
        params: Preprocessing settings forwarded to each transformer.

    Returns:
        A new, unfitted ``Pipeline`` created with ``verbose=True``, so the
        elapsed time of each step is printed while the pipeline is fitted.
    """
    stage_classes = (
        ('cleaner', TextCleaner),
        ('tokenizer', SpacyTokenizer),
        ('processor', TokenProcessor),
    )
    steps = [(label, stage_cls(params)) for label, stage_cls in stage_classes]
    return Pipeline(steps=steps, verbose=True)

# Shared preprocessing settings; the same object is handed to NLP_pipe for every column.
params = PreprocessParams(
    spacy_model="en_core_web_sm",
    remove_punct=True,
    # Character class covering the 32 ASCII punctuation characters (the same set as
    # `string.punctuation`). `-`, `[` and `]` must be escaped inside the class:
    # with an unescaped `]` right after `\\` the class would close early, the tail
    # `^_`{|}~]` would be parsed as anchor/alternation, and the pattern would match
    # almost nothing.
    custom_punct=r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]',
    remove_stopwords=True,
    lemmatize=True,
    lowercase=True,
)

# Route each text column through its own preprocessing pipeline instance.
# NOTE(review): per the scikit-learn docs, the scalar selector "text" hands the
# pipeline a 1-D column, while the list selector ["title"] hands it a 2-D frame, so
# the two pipelines receive differently shaped input. Confirm which shape the custom
# transformers expect and make both selectors agree.
ct = ColumnTransformer(
    transformers=[
        ("text_preprocess", NLP_pipe(params), "text"),
        ("title", NLP_pipe(params), ["title"]),
    ],
    remainder='passthrough',  # columns not selected above are passed through unchanged
)

In [6]:
# Smoke-test the preprocessing on three articles. A fixed `random_state` makes the
# draw reproducible, so a failing run can be re-created with exactly the same rows.
ct.fit_transform(df_news[['title', 'text']].sample(3, random_state=42))

[Pipeline] ........... (step 1 of 3) Processing cleaner, total=   0.0s
[Pipeline] ......... (step 2 of 3) Processing tokenizer, total=   0.0s


ValueError: Length of values (0) does not match length of index (1)