# **Train text classifier model using *News Category Dataset***

* https://www.kaggle.com/datasets/rmisra/news-category-dataset

This dataset contains 42 categories and, as the counts below show, it is heavily imbalanced. We therefore select 27 of them, merge closely related ones into 24 final categories, and use only a subset of the data from each category for training.

## Process the data

In [1]:
import json
from collections import Counter

# The dataset file holds one standalone JSON record per line.
# The encoding is passed explicitly so that decoding does not depend on the
# locale-dependent platform default.
with open('spacy_files/News_Category_Dataset_v3.json', encoding='utf-8') as f:
    lines = f.readlines()

# Count how many records belong to each category.
categories_count = Counter(json.loads(line)['category'] for line in lines)

print('CATEGORIES IN THE DATASET')
# Categories ordered from most to least frequent (displayed as cell output).
categories_count.most_common()


CATEGORIES IN THE DATASET


[('POLITICS', 35602),
 ('WELLNESS', 17945),
 ('ENTERTAINMENT', 17362),
 ('TRAVEL', 9900),
 ('STYLE & BEAUTY', 9814),
 ('PARENTING', 8791),
 ('HEALTHY LIVING', 6694),
 ('QUEER VOICES', 6347),
 ('FOOD & DRINK', 6340),
 ('BUSINESS', 5992),
 ('COMEDY', 5400),
 ('SPORTS', 5077),
 ('BLACK VOICES', 4583),
 ('HOME & LIVING', 4320),
 ('PARENTS', 3955),
 ('THE WORLDPOST', 3664),
 ('WEDDINGS', 3653),
 ('WOMEN', 3572),
 ('CRIME', 3562),
 ('IMPACT', 3484),
 ('DIVORCE', 3426),
 ('WORLD NEWS', 3299),
 ('MEDIA', 2944),
 ('WEIRD NEWS', 2777),
 ('GREEN', 2622),
 ('WORLDPOST', 2579),
 ('RELIGION', 2577),
 ('STYLE', 2254),
 ('SCIENCE', 2206),
 ('TECH', 2104),
 ('TASTE', 2096),
 ('MONEY', 1756),
 ('ARTS', 1509),
 ('ENVIRONMENT', 1444),
 ('FIFTY', 1401),
 ('GOOD NEWS', 1398),
 ('U.S. NEWS', 1377),
 ('ARTS & CULTURE', 1339),
 ('COLLEGE', 1144),
 ('LATINO VOICES', 1130),
 ('CULTURE & ARTS', 1074),
 ('EDUCATION', 1014)]

In [2]:
# Dataset categories whose headlines are kept for the classifier.
interesting_categories = [
    'POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
    'HEALTHY LIVING', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS',
    'HOME & LIVING', 'WEDDINGS', 'WOMEN', 'CRIME', 'MEDIA',
    'RELIGION', 'STYLE', 'SCIENCE', 'TECH', 'MONEY',
    'ARTS', 'ENVIRONMENT', 'WORLD NEWS', 'GOOD NEWS', 'WEIRD NEWS',
    'COLLEGE', 'EDUCATION',
]

# One (initially empty) list of headlines per selected category.
data_dict = {}
for name in interesting_categories:
    data_dict[name] = []

# Store the headline of every record whose category was selected.
for raw_record in lines:
    record = json.loads(raw_record)
    bucket = data_dict.get(record["category"])
    if bucket is not None:
        bucket.append(record["headline"])
    


In [3]:
# Fold the COLLEGE headlines into EDUCATION.
education_headlines = data_dict['EDUCATION']
education_headlines.extend(data_dict.pop('COLLEGE'))

# Merge the three selected NEWS categories into a single NEWS category.
data_dict['NEWS'] = []
for news_category in ('WEIRD NEWS', 'GOOD NEWS', 'WORLD NEWS'):
    data_dict['NEWS'].extend(data_dict.pop(news_category))

# Number of headlines per category, largest first (displayed as cell output).
sorted(
    ((name, len(headlines)) for name, headlines in data_dict.items()),
    key=lambda pair: pair[1],
    reverse=True,
)


[('POLITICS', 35602),
 ('WELLNESS', 17945),
 ('ENTERTAINMENT', 17362),
 ('TRAVEL', 9900),
 ('STYLE & BEAUTY', 9814),
 ('NEWS', 7474),
 ('HEALTHY LIVING', 6694),
 ('FOOD & DRINK', 6340),
 ('BUSINESS', 5992),
 ('COMEDY', 5400),
 ('SPORTS', 5077),
 ('HOME & LIVING', 4320),
 ('WEDDINGS', 3653),
 ('WOMEN', 3572),
 ('CRIME', 3562),
 ('MEDIA', 2944),
 ('RELIGION', 2577),
 ('STYLE', 2254),
 ('SCIENCE', 2206),
 ('EDUCATION', 2158),
 ('TECH', 2104),
 ('MONEY', 1756),
 ('ARTS', 1509),
 ('ENVIRONMENT', 1444)]

To reduce class imbalance, we keep at most 5,500 headlines per category. The cap is applied before the data is split into training, validation, and evaluation sets.

In [4]:
import random
from sklearn.model_selection import train_test_split

INSTANCE_MAX = 5500  # maximum number of headlines kept per category
RANDOM_SEED = 42  # shared by the subsampling step and both splits

# Dedicated, seeded RNG: makes the per-category subsample reproducible, in
# line with the fixed random_state already used for the splits below.
rng = random.Random(RANDOM_SEED)

data = []
category = []
for key, values_list in data_dict.items():
    # Draw a random subset without replacement. Unlike an in-place shuffle,
    # this leaves the lists stored in data_dict untouched.
    sampled = rng.sample(values_list, min(len(values_list), INSTANCE_MAX))
    data.extend(sampled)
    category.extend([key] * len(sampled))


# First split: 80% training data, 20% held out (stratified by category).
X_train, X_test, y_train, y_test = train_test_split(
    data, category, test_size=0.20, stratify=category, random_state=RANDOM_SEED
)

# Second split of the held-out part: 70% validation, 30% final evaluation.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_test, y_test, test_size=0.3, stratify=y_test, random_state=RANDOM_SEED
)

print(f'SHAPE OF DATA: \nNUMBER OF CATEGORIES: {len(data_dict)}\nTRAIN: {len(X_train)}  VALIDATION: {len(X_dev)}  EVALUATION: {len(X_test)}')


SHAPE OF DATA: 
NUMBER OF CATEGORIES: 24
TRAIN: 75228  VALIDATION: 13165  EVALUATION: 5643


In [None]:
# Upgrade the packaging tools, then install spaCy with the CUDA 12.x,
# transformers and lookups extras.
! pip install -U pip setuptools wheel
! pip install -U 'spacy[cuda12x,transformers,lookups]'

In [17]:
import spacy
from spacy.tokens import DocBin

# To use the GPU, CuPy must be installed and compatible with the local CUDA version
# spacy.require_gpu() 


def convert(text_list: list, label_list: list, outfile: str, labels=None):
    """Serialize labelled texts to disk as a spaCy ``DocBin``.

    Each text is tokenized with a blank English pipeline and receives one
    ``doc.cats`` entry per label: 1 for its own label and 0 for every other
    label.

    Args:
        text_list: Texts to convert.
        label_list: Label of each text, aligned with ``text_list``.
        outfile: Path of the ``.spacy`` file to write.
        labels: Iterable with all labels to put in ``doc.cats``. When omitted,
            the keys of the module-level ``data_dict`` are used (the original
            behaviour).

    Raises:
        ValueError: If ``text_list`` and ``label_list`` differ in length.
    """
    if len(text_list) != len(label_list):
        # zip() would silently drop the unmatched tail and hide the mismatch.
        raise ValueError(
            f"text_list has {len(text_list)} items but label_list has "
            f"{len(label_list)}"
        )
    if labels is None:
        labels = data_dict.keys()
    labels = list(labels)

    nlp = spacy.blank("en")
    db = DocBin()
    for text, label in zip(text_list, label_list):
        doc = nlp.make_doc(text)
        # Every label starts at 0; only the text's own label is set to 1.
        cats = dict.fromkeys(labels, 0)
        cats[label] = 1
        doc.cats = cats
        db.add(doc)
    db.to_disk(outfile)


# Write one DocBin file per data split.
for split_texts, split_labels, split_path in (
    (X_train, y_train, "spacy_files/news_train.spacy"),
    (X_dev, y_dev, "spacy_files/news_dev.spacy"),
    (X_test, y_test, "spacy_files/news_test.spacy"),
):
    convert(split_texts, split_labels, split_path)


## Model training

In [97]:
# Generate the training config in spacy_files/, which is where the training
# command reads it from.
# NOTE(review): every headline gets exactly one label, so the mutually
# exclusive `textcat` component may fit better than `textcat_multilabel` - confirm.
!python -m spacy init config --pipeline textcat_multilabel spacy_files/train_config.cfg


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat_multilabel
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
train_config.cfg
You can now add your data and train your pipeline:
python -m spacy train train_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [18]:
# Train the pipeline on the train/dev DocBin files and save the result under
# spacy_files/24categories_news_model.
! python -m spacy train spacy_files/train_config.cfg --paths.train spacy_files/news_train.spacy  --paths.dev spacy_files/news_dev.spacy --output spacy_files/24categories_news_model --verbose

[2024-03-10 03:15:06,550] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: spacy_files/24categories_news_model[0m
[38;5;4mℹ Saving to output directory: spacy_files/24categories_news_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[2024-03-10 03:15:07,009] [INFO] Set up nlp object from config
[2024-03-10 03:15:07,016] [DEBUG] Loading corpus from path: spacy_files/news_dev.spacy
[2024-03-10 03:15:07,016] [DEBUG] Loading corpus from path: spacy_files/news_train.spacy
[2024-03-10 03:15:07,016] [INFO] Pipeline: ['textcat_multilabel']
[2024-03-10 03:15:07,019] [INFO] Created vocabulary
[2024-03-10 03:15:07,019] [INFO] Finished initializing nlp object
[2024-03-10 03:15:18,080] [INFO] Initialized pipeline components: ['textcat_multilabel']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2024-03-10 03:15:18,087] [DEBUG] Loading corpus from path: spacy_files/news_dev.spacy
[2024-03-10 03:15:18,088] [DEBUG] Loading corpus from path: spacy_files/news_train.s

## Model evaluation

In [19]:
# Evaluate the model-best checkpoint on the held-out test DocBin file.
!python -m spacy evaluate spacy_files/24categories_news_model/model-best/ spacy_files/news_test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK                   100.00
TEXTCAT (macro AUC)   94.10 
SPEED                 39741 

[1m

                     P       R       F
POLITICS         70.63   53.94   61.17
WELLNESS         59.72   38.18   46.58
ENTERTAINMENT    63.77   53.33   58.09
TRAVEL           77.91   60.91   68.37
STYLE & BEAUTY   76.90   70.61   73.62
HEALTHY LIVING   58.62   36.06   44.65
FOOD & DRINK     84.36   70.30   76.69
BUSINESS         65.24   41.52   50.74
COMEDY           72.05   50.93   59.67
SPORTS           73.70   74.43   74.06
HOME & LIVING    85.07   66.02   74.35
WEDDINGS         91.13   84.47   87.68
WOMEN            58.52   48.13   52.82
CRIME            72.58   63.08   67.50
MEDIA            69.18   57.06   62.54
RELIGION         74.22   61.29   67.14
STYLE            72.00   26.67   38.92
SCIENCE          70.51   41.67   52.38
TECH             57.76   53.17   55.37
MONEY            62.07   34.29   44.17
ARTS             54.84   18.68   27.87
ENVIRONMENT      

## Load the model

In [4]:
# Load the best checkpoint and classify one sample sentence.
model_path = "spacy_files/24categories_news_model/model-best"
sample_text = "Women entrepreneurs play a pivotal role in shaping industries, fostering economic growth, and driving positive change in communities worldwide."

nlp = spacy.load(model_path)
doc = nlp(sample_text)
# Per-category scores of the sample (displayed as cell output).
doc.cats

{'POLITICS': 0.04754519462585449,
 'WELLNESS': 0.001057898043654859,
 'ENTERTAINMENT': 0.004479375202208757,
 'TRAVEL': 0.0006009417702443898,
 'STYLE & BEAUTY': 0.0008289911784231663,
 'HEALTHY LIVING': 0.00559990806505084,
 'FOOD & DRINK': 2.574278914835304e-05,
 'BUSINESS': 0.4946499466896057,
 'COMEDY': 0.001905512879602611,
 'SPORTS': 0.0007069869316183031,
 'HOME & LIVING': 0.0002976031100843102,
 'WEDDINGS': 6.09952439845074e-05,
 'WOMEN': 0.05362309515476227,
 'CRIME': 0.004431582521647215,
 'MEDIA': 0.09137539565563202,
 'RELIGION': 0.005778406746685505,
 'STYLE': 0.0006475687841884792,
 'SCIENCE': 0.002787437289953232,
 'TECH': 0.26404523849487305,
 'MONEY': 0.0023959362879395485,
 'ARTS': 0.0034477850422263145,
 'ENVIRONMENT': 0.0014956939266994596,
 'EDUCATION': 0.007798791863024235,
 'NEWS': 0.004413879942148924}

## Build the python package

In [3]:
# Package the trained pipeline as an installable sdist and wheel.
# NOTE(review): training saved the model under spacy_files/, but this command
# reads it from resources/ - confirm the model was copied there.
# NOTE(review): the output below reports "No module named build"; running
# `pip install build` first should avoid the deprecated setup.py fallback.
! python -m spacy package resources/24categories_news_model/model-best/ resources --name news_24category --version 0.1 -b sdist,wheel

[38;5;3m⚠ Generating packages without the 'build' package is deprecated and
will not be supported in the future. To install 'build': pip install build[0m
[38;5;4mℹ Building package artifacts: sdist, wheel[0m
[38;5;2m✔ Loaded meta.json from file[0m
resources\24categories_news_model\model-best\meta.json
[38;5;2m✔ Generated README.md from meta.json[0m
[38;5;2m✔ Successfully created package directory 'en_news_24category-0.1'[0m
resources\en_news_24category-0.1
[38;5;3m⚠ Creating sdist with 'python -m build' failed. Falling back to
deprecated use of 'python setup.py sdist'[0m
running sdist
running egg_info
creating en_news_24category.egg-info
writing en_news_24category.egg-info\PKG-INFO
writing dependency_links to en_news_24category.egg-info\dependency_links.txt
writing entry points to en_news_24category.egg-info\entry_points.txt
writing requirements to en_news_24category.egg-info\requires.txt
writing top-level names to en_news_24category.egg-info\top_level.txt
writing manifest 

c:\Users\Manue!_PC\AppData\Local\Programs\Python\Python310\python.exe: No module named build


c:\Users\Manue!_PC\AppData\Local\Programs\Python\Python310\python.exe: No module named build
