In [1]:
!pip -q install spacy nltk pandas numpy scikit-learn tqdm


In [2]:
import nltk

# NLTK resources: tokenizer models and stop-word lists.
nltk.download("punkt")
nltk.download("stopwords")

import spacy
from spacy.cli import download

# Only fetch the spaCy model when it is not installed yet, so that re-running
# this cell does not re-download and reinstall the package every time.
if not spacy.util.is_package("en_core_web_sm"):
    download("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Download and unpack the BBC dataset archive
import os, zipfile, requests, io
import pandas as pd

URL = "http://mlg.ucd.ie/files/datasets/bbc.zip"
DATA_DIR = "bbc_data"

# NOTE(review): the loading cell further down reads from ./bbc (created by the
# bbc-fulltext.zip cell), not from DATA_DIR — confirm this archive is still needed.
os.makedirs(DATA_DIR, exist_ok=True)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 500, ...) right here instead of
# as a confusing BadZipFile when an error page is handed to zipfile.
r = requests.get(URL, timeout=60)
r.raise_for_status()

# The context manager closes the archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall(DATA_DIR)

print("Extracted to:", DATA_DIR)


Extracted to: bbc_data


In [8]:
import os
import glob
import pandas as pd


In [10]:
!wget http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
!unzip -q bbc-fulltext.zip


--2026-01-13 00:34:00--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874079 (2.7M) [application/zip]
Saving to: ‘bbc-fulltext.zip’


2026-01-13 00:34:01 (2.28 MB/s) - ‘bbc-fulltext.zip’ saved [2874079/2874079]



In [11]:
# Load the article files (5 categories)

labels = ["business", "entertainment", "politics", "sport", "tech"]
rows = []

for label in labels:
    # glob returns paths in arbitrary, filesystem-dependent order; sorting makes
    # the row order — and therefore the seeded shuffle below — reproducible.
    files = sorted(glob.glob(os.path.join("bbc", label, "*.txt")))
    for fp in files:
        # latin-1 maps every byte to a character, so decoding never fails.
        with open(fp, "r", encoding="latin-1") as f:
            text = f.read()
        rows.append({
            "label": label,
            "text": text
        })

# Fail early with a clear message instead of building an empty frame that only
# breaks later with a KeyError on the missing "text" column.
if not rows:
    raise FileNotFoundError(
        "No .txt files found under 'bbc/<label>/'; "
        "download and unzip bbc-fulltext.zip first."
    )

df = pd.DataFrame(rows)
# Shuffle all rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)



In [12]:
# Cleaning

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words("english"))

def basic_clean(text: str) -> str:
    """Strip markup, links and non-letter characters from raw article text.

    In order: HTML-like tags, URLs (``http...`` / ``www....``) and every
    character that is not an ASCII letter, whitespace or an apostrophe are each
    replaced by a space. Whitespace runs are then collapsed to a single space
    and the ends are trimmed.
    """
    substitutions = (
        r"<.*?>",               # HTML
        r"http\S+|www\.\S+",    # URLs
        r"[^A-Za-z\s']",        # special characters
    )
    cleaned = text
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Run the basic cleaning over every article and preview the first two rows
raw_texts = df["text"].astype(str)
df["text_basic"] = raw_texts.map(basic_clean)
df[["label", "text_basic"]].head(2)


Unnamed: 0,label,text_basic
0,business,EU to probe Alitalia 'state aid' The European ...
1,business,Feta cheese battle reaches court A row over wh...


In [13]:
# Lemmatization with spaCy + stop-word removal
import spacy
from tqdm import tqdm

# Load the small English pipeline; only the NER component is disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner"])  # NER is not needed here
# Enable tqdm's pandas integration (the progress_apply call used further down).
tqdm.pandas()

def spacy_lemmatize(text: str) -> str:
    """Return the lower-cased lemmas of ``text`` joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline. Whitespace and
    punctuation tokens are skipped, as are lemmas found in ``STOPWORDS`` and
    lemmas shorter than two characters.
    """
    lowered_lemmas = (
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_space or tok.is_punct)
    )
    return " ".join(
        lemma
        for lemma in lowered_lemmas
        if len(lemma) >= 2 and lemma not in STOPWORDS
    )

# Lemmatize every cleaned article (showing a progress bar) and preview three rows.
df["text_clean"] = df["text_basic"].progress_apply(spacy_lemmatize)
df[["label","text_clean"]].head(3)


100%|██████████| 2225/2225 [02:00<00:00, 18.42it/s]


Unnamed: 0,label,text_clean
0,business,eu probe alitalia state aid european commissio...
1,business,feta cheese battle reach court row whether gre...
2,sport,yelling take cardiff hat trick european cross ...


In [14]:
# POS tagging + dependency relations on one sample article

nlp_full = spacy.load("en_core_web_sm")  # with parser

sample_text = df.loc[0, "text_basic"]
doc = nlp_full(sample_text)

# Show the first 20 tokens as (text, lemma, POS, dependency label, head text)
token_rows = []
for token in doc[:20]:
    token_rows.append(
        (token.text, token.lemma_, token.pos_, token.dep_, token.head.text)
    )
token_rows


[('EU', 'EU', 'PROPN', 'dep', 'launched'),
 ('to', 'to', 'PART', 'aux', 'probe'),
 ('probe', 'probe', 'VERB', 'acl', 'EU'),
 ('Alitalia', 'Alitalia', 'PROPN', 'poss', 'aid'),
 ("'", "'", 'PART', 'case', 'Alitalia'),
 ('state', 'state', 'NOUN', 'compound', 'aid'),
 ('aid', 'aid', 'NOUN', 'dobj', 'probe'),
 ("'", "'", 'PUNCT', 'punct', 'aid'),
 ('The', 'the', 'DET', 'det', 'Commission'),
 ('European', 'European', 'PROPN', 'compound', 'Commission'),
 ('Commission', 'Commission', 'PROPN', 'nsubj', 'launched'),
 ('has', 'have', 'AUX', 'aux', 'launched'),
 ('officially', 'officially', 'ADV', 'advmod', 'launched'),
 ('launched', 'launch', 'VERB', 'ROOT', 'launched'),
 ('an', 'an', 'DET', 'det', 'investigation'),
 ('in', 'in', 'ADJ', 'amod', 'investigation'),
 ('depth', 'depth', 'NOUN', 'pobj', 'in'),
 ('investigation', 'investigation', 'NOUN', 'dobj', 'launched'),
 ('into', 'into', 'ADP', 'prep', 'investigation'),
 ('whether', 'whether', 'SCONJ', 'mark', 'receiving')]

In [15]:
from collections import Counter

# Corpus-wide frequency of part-of-speech tags and dependency labels.
pos_counter = Counter()
dep_counter = Counter()

texts = df["text_basic"].tolist()

# nlp_full.pipe processes the texts in batches, which is spaCy's intended way
# to run a pipeline over many documents and is faster than one nlp_full(text)
# call per article. total= is needed because pipe() yields from a generator,
# so tqdm cannot infer the length on its own.
for doc in tqdm(nlp_full.pipe(texts, batch_size=64), total=len(texts)):
    # Single pass per document: update both counters together.
    for t in doc:
        if t.is_space:
            continue
        pos_counter[t.pos_] += 1
        dep_counter[t.dep_] += 1

# Keep the 15 most frequent entries of each counter.
pos_top = pd.DataFrame(pos_counter.most_common(15), columns=["POS","count"])
dep_top = pd.DataFrame(dep_counter.most_common(15), columns=["DEP","count"])

pos_top, dep_top


100%|██████████| 2225/2225 [02:53<00:00, 12.85it/s]


(      POS   count
 0    NOUN  178922
 1    VERB  111524
 2   PROPN  105677
 3     ADP  101165
 4     DET   81436
 5     ADJ   61736
 6     AUX   56013
 7    PRON   54023
 8    PART   30491
 9     ADV   30242
 10  CCONJ   25092
 11  SCONJ   16392
 12    NUM    6458
 13  PUNCT    1082
 14   INTJ     799,
          DEP  count
 0       prep  93368
 1       pobj  86329
 2   compound  81595
 3        det  80694
 4      nsubj  69697
 5       amod  53871
 6        aux  45770
 7       dobj  45236
 8      ccomp  36084
 9     advmod  33286
 10      conj  32867
 11        cc  25278
 12      poss  18307
 13     advcl  15247
 14     relcl  13866)

In [17]:
# Quick sanity check: column names, frame dimensions and the first two rows
column_names = list(df.columns)
print("Colonnes df:", column_names)
print("Shape:", df.shape)
df.head(2)


Colonnes df: ['label', 'text', 'text_basic', 'text_clean']
Shape: (2225, 4)


Unnamed: 0,label,text,text_basic,text_clean
0,business,EU to probe Alitalia 'state aid'\n\nThe Europe...,EU to probe Alitalia 'state aid' The European ...,eu probe alitalia state aid european commissio...
1,business,Feta cheese battle reaches court\n\nA row over...,Feta cheese battle reaches court A row over wh...,feta cheese battle reach court row whether gre...


In [18]:
# Export

OUT_PATH = "bbc_clean.csv"

# Take an independent copy of the exported columns, then write it without the index
export_columns = ["label", "text", "text_basic", "text_clean"]
df_out = df.loc[:, export_columns].copy()
df_out.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
print("Rows:", df_out.shape[0])


Saved: bbc_clean.csv
Rows: 2225


In [19]:
# Whitespace-separated token count per cleaned article, summarised per category
token_lists = df_out["text_clean"].str.split()
df_out["n_tokens"] = token_lists.map(len)
df_out.groupby("label")["n_tokens"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
business,510.0,196.639216,78.792703,75.0,138.0,182.5,232.0,519.0
entertainment,386.0,195.07772,136.839955,83.0,135.0,162.0,212.0,1788.0
politics,417.0,256.211031,153.237305,48.0,181.0,256.0,300.0,2205.0
sport,511.0,184.207436,104.394275,61.0,115.0,156.0,225.5,940.0
tech,401.0,287.13217,125.198323,98.0,205.0,257.0,354.0,1460.0
