In [2]:
from loguru import logger
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.infrastructure.nlp.tokenizers import SpacyTokenizerAdapter
from mh_nlp.application.use_cases.build_clean_dataset import BuildCleanDatasetUseCase


# 0. D√©finir le mapping des labels
label_mapping = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Bipolar": 4,
    "Stress": 5,
    "Personality disorder": 6
}

filtered_label_mapping = {
    "Anxiety": 0, 
    "Normal": 1, 
    "Depression": 2,
}

# 1. Setup des composants (Wiring)
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=filtered_label_mapping,
)
cleaner = TextCleaner(engine=SpacyTokenizerAdapter(model="en_core_web_sm"))

# 2. Initialisation du Use Case
use_case = BuildCleanDatasetUseCase(repository=repo, cleaner=cleaner)

# 3. Ex√©cution
dataset_final = use_case.execute()

# Maintenant, dataset_final.texts et dataset_final.labels sont pr√™ts pour Sklearn ou PyTorch !

[32m2026-01-15 12:49:11.694[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.tokenizers[0m:[36m__init__[0m:[36m33[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' charg√© avec succ√®s.[0m
[32m2026-01-15 12:49:11.694[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.text_cleaner[0m:[36m__init__[0m:[36m37[0m - [34m[1mTextCleaner initialis√© avec SpacyTokenizerAdapter (Batch mode: False)[0m
[32m2026-01-15 12:49:11.694[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.text_cleaner[0m:[36m__init__[0m:[36m39[0m - [34m[1mTextCleaner initialis√© avec SpacyTokenizerAdapter[0m
[32m2026-01-15 12:49:11.696[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.build_clean_dataset[0m:[36mexecute[0m:[36m32[0m - [1mApplication : D√©marrage du Use Case BuildCleanDataset.[0m
[32m2026-01-15 12:49:11.696[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m33[0m - [1mD√©marrage du chargeme

In [2]:
from mh_nlp.domain.entities.document import Document
from mh_nlp.infrastructure.nlp.fast_tokenizers import FastSpacyTokenizerAdapter
from mh_nlp.domain.services.fast_text_cleaner import FastTextCleaner

# 1. Initialisation
engine = FastSpacyTokenizerAdapter(model="en_core_web_sm", batch_size=10)
cleaner = FastTextCleaner(engine=engine)

# 2. Donn√©es de test (incluant un cas probl√©matique : le vide)
data = [
    # 1. Cas Standard
    Document(text="I feel very depressed and lonely today."),
    
    # 2. Cas Vide (d√©j√† test√©, mais crucial pour l'alignement)
    Document(text=""),
    
    # 3. Espaces uniquement (doit √™tre g√©r√© comme vide)
    Document(text="     "),
    
    # 4. Bruit Social Media massif
    Document(text="RT @user123: This is so sad!!! #mentalhealth #depression #help @charity_org"),
    
    # 5. URLs multiples et complexes
    Document(text="Found help here: https://support.org/help?id=123 and http://test.com/path"),
    
    # 6. Caract√®res sp√©ciaux et ponctuation excessive
    Document(text="HELP ME !!!... ??? (I am not okay) [urgent] *sigh*"),
    
    # 7. Chiffres et dates (doivent √™tre supprim√©s par ta regex [^a-z\s])
    Document(text="I have been suffering since 2010. 10/10 would not recommend."),
    
    # 8. M√©lange Majuscules/Minuscules (Case Sensitivity)
    Document(text="vErY ANXIOUS AND sTrEsSeD out right now."),
    
    # 9. Texte avec sauts de ligne et tabulations
    Document(text="First line.\nSecond line with\ttabs."),
    
    # 10. Texte tr√®s court (un seul mot)
    Document(text="Sad."),
    
    # 11. Texte tr√®s long (pour tester la performance du batch)
    Document(text="I am " + "so " * 100 + "tired."),
    
    # 12. Caract√®res non-ASCII / Emojis (doivent √™tre filtr√©s)
    Document(text="I am happy today! üòäüåü Love life! ‚ù§Ô∏è"),
    
    # 13. Contenu purement num√©rique
    Document(text="1234567890"),
    
    # 14. Contenu purement ponctuation
    Document(text="!!! ??? @@@ ###"),
    
    # 15. Stopwords uniquement (doit retourner une cha√Æne vide apr√®s spaCy)
    Document(text="I am the and of a"),
    
    # 16. Mots avec apostrophes (Contractions)
    Document(text="I can't sleep, I'm exhausted and shouldn't stay up."),
    
    # 17. Balises HTML (si ton scraping est imparfait)
    Document(text="<p>This is a <b>test</b> of HTML removal.</p>"),
    
    # 18. Texte avec r√©p√©tition de lettres (Slang)
    Document(text="I am soooooooo hhaapppyyyyyyy"),
    
    # 19. Format technique (Logs)
    Document(text="2023-01-01 12:00:00 [ERROR] User is feeling low"),
    
    # 20. Phrases interrogatives et exclamatives coll√©es
    Document(text="IsThisReal?NoItIsnt!HelpMe.")
]

# 3. Test du traitement par lot
print("--- D√©but du Batch Cleaning ---")
results = cleaner.clean_batch(data)

for i, res in enumerate(results):
    print(f"Doc {i} nettoy√© : '{res}'")

# V√©rification de l'alignement
assert len(results) == len(data), "ERREUR : La taille du dataset a chang√© !"

[32m2026-01-15 13:16:31.511[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.fast_tokenizers[0m:[36m__init__[0m:[36m33[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' pr√™t (Batch: 10).[0m
[32m2026-01-15 13:16:31.511[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.fast_text_cleaner[0m:[36m__init__[0m:[36m24[0m - [34m[1mFastTextCleaner (Batch Only) pr√™t avec FastSpacyTokenizerAdapter[0m
[32m2026-01-15 13:16:31.513[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.fast_tokenizers[0m:[36mprocess_batch[0m:[36m50[0m - [1mD√©marrage du traitement par lots (20 lignes).[0m


--- D√©but du Batch Cleaning ---


Batch Lemmatization (spaCy): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<00:00, 557.60it/s]
[32m2026-01-15 13:16:31.552[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.fast_tokenizers[0m:[36mprocess_batch[0m:[36m63[0m - [32m[1mTraitement massif termin√© avec succ√®s.[0m


Doc 0 nettoy√© : 'feel depressed lonely today'
Doc 1 nettoy√© : ''
Doc 2 nettoy√© : ''
Doc 3 nettoy√© : 'rt sad'
Doc 4 nettoy√© : 'find help'
Doc 5 nettoy√© : 'help okay urgent sigh'
Doc 6 nettoy√© : 'suffer recommend'
Doc 7 nettoy√© : 'anxious stress right'
Doc 8 nettoy√© : 'line second line tab'
Doc 9 nettoy√© : 'sad'
Doc 10 nettoy√© : 'tired'
Doc 11 nettoy√© : 'happy today love life'
Doc 12 nettoy√© : ''
Doc 13 nettoy√© : ''
Doc 14 nettoy√© : ''
Doc 15 nettoy√© : 't sleep m exhausted shouldn t stay'
Doc 16 nettoy√© : 'p b test b html removal p'
Doc 17 nettoy√© : 'soooooooo hhaapppyyyyyyy'
Doc 18 nettoy√© : 'error user feel low'
Doc 19 nettoy√© : 'isthisreal noitisnt helpme'


In [1]:
from loguru import logger

# Infrastructure
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.infrastructure.nlp.fast_tokenizers import FastSpacyTokenizerAdapter

# Domaine (Services)
# Note : On utilise FastTextCleaner pour b√©n√©ficier du traitement par lots
from mh_nlp.domain.services.fast_text_cleaner import FastTextCleaner

# Application (Use Case & DTO)
from mh_nlp.application.use_cases.build_clean_dataset import BuildCleanDatasetUseCase

# --- CONFIGURATION ---

# Mapping des √©tiquettes m√©tier vers les index num√©riques
FILTERED_LABEL_MAPPING = {
    "Anxiety": 0, 
    "Normal": 1, 
    "Depression": 2,
}

# --- 1. SETUP DES COMPOSANTS (WIRING) ---

# Initialisation du d√©p√¥t de donn√©es (Source : CSV Kaggle)
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=FILTERED_LABEL_MAPPING,
)

# Initialisation du moteur NLP haute performance (Port -> Adapter)
# batch_size=1000 permet d'optimiser les performances de spaCy sur CPU
spacy_engine = FastSpacyTokenizerAdapter(
    model="en_core_web_sm", 
    batch_size=1000
)

# Initialisation du service de nettoyage (Orchestrateur Domaine)
cleaner = FastTextCleaner(engine=spacy_engine)

# --- 2. INITIALISATION DU USE CASE ---

# Le Use Case re√ßoit ses d√©pendances par injection
use_case = BuildCleanDatasetUseCase(
    repository=repo, 
    cleaner=cleaner
)

# --- 3. EX√âCUTION ---

logger.info("D√©marrage du pipeline de pr√©paration des donn√©es...")

# Lancement du processus complet : Chargement -> Nettoyage Batch -> DTO
dataset_final = use_case.execute()

# --- 4. V√âRIFICATION ---

if dataset_final.total_processed > 0:
    logger.success(f"Dataset pr√™t pour l'entra√Ænement ! Taille : {dataset_final.total_processed}")
    print(f"Exemple du premier document : {dataset_final.documents[0][:100]}...")

[32m2026-01-15 13:50:19.097[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.fast_tokenizers[0m:[36m__init__[0m:[36m33[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' pr√™t (Batch: 1000).[0m
[32m2026-01-15 13:50:19.098[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.fast_text_cleaner[0m:[36m__init__[0m:[36m24[0m - [34m[1mFastTextCleaner (Batch Only) pr√™t avec FastSpacyTokenizerAdapter[0m
[32m2026-01-15 13:50:19.098[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [1mD√©marrage du pipeline de pr√©paration des donn√©es...[0m
[32m2026-01-15 13:50:19.099[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.build_clean_dataset[0m:[36mexecute[0m:[36m48[0m - [1mApplication : Lancement du pipeline de nettoyage de donn√©es.[0m
[32m2026-01-15 13:50:19.099[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m33[0m - [1mD√©marrage du chargement des donn√©es depuis 

Exemple du premier document : oh gosh...


In [None]:

from mh_nlp.infrastructure.nlp.fast_tokenizers import FastSpacyTokenizerAdapter

spacy_engine = FastSpacyTokenizerAdapter(model="en_core_web_sm",batch_size=1000)
print(getattr(spacy_engine, "batch_size", None) is not None)

filtered_label_mapping = {
    "Anxiety": 0, 
    "Normal": 1, 
    "Depression": 2,
}

# 1. Setup des composants (Wiring)
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=filtered_label_mapping,
)
spacy_engine = FastSpacyTokenizerAdapter(model="en_core_web_sm",batch_size=1000)
cleaner = TextCleaner(engine=SpacyTokenizerAdapter(model="en_core_web_sm"))

# 2. Initialisation du Use Case
use_case = BuildCleanDatasetUseCase(repository=repo, cleaner=cleaner)

# 3. Ex√©cution
dataset_final = use_case.execute()

[32m2026-01-15 13:19:51.159[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.fast_tokenizers[0m:[36m__init__[0m:[36m33[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' pr√™t (Batch: 1000).[0m
[32m2026-01-15 13:19:51.573[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.tokenizers[0m:[36m__init__[0m:[36m33[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' charg√© avec succ√®s.[0m
[32m2026-01-15 13:19:51.573[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.text_cleaner[0m:[36m__init__[0m:[36m37[0m - [34m[1mTextCleaner initialis√© avec SpacyTokenizerAdapter (Batch mode: False)[0m
[32m2026-01-15 13:19:51.574[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.text_cleaner[0m:[36m__init__[0m:[36m39[0m - [34m[1mTextCleaner initialis√© avec SpacyTokenizerAdapter[0m
[32m2026-01-15 13:19:51.574[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.build_clean_dataset[0m:[36mexecute[0m:[36m32[0m - [1mApplica

In [11]:
# main.py
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.domain.entities.document import Document
from mh_nlp.infrastructure.nlp.tokenizers import SpacyTokenizerAdapter, NltkTokenizerAdapter

raw_text = "The students are studying and checking their emails at https://univ.edu !"
document = Document(raw_text)

# --- CAS 1 : Utilisation de spaCy (Lemmatisation pr√©cise) ---
spacy_engine = SpacyTokenizerAdapter(model="en_core_web_sm")
cleaner_spacy = TextCleaner(engine=spacy_engine)
print(f"SpaCy Result: {cleaner_spacy.clean(document)}")
# Sortie attendue: "student study check email"


[32m2026-01-15 11:18:49.004[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.tokenizers[0m:[36m__init__[0m:[36m32[0m - [1mSpacyTokenizerAdapter : Mod√®le 'en_core_web_sm' charg√© avec succ√®s.[0m
[32m2026-01-15 11:18:49.005[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.domain.services.text_cleaner[0m:[36m__init__[0m:[36m33[0m - [34m[1mTextCleaner initialis√© avec SpacyTokenizerAdapter[0m


SpaCy Result: student study check email


In [None]:
# --- CAS 2 : Utilisation de NLTK avec Stemming (Plus agressif) ---
nltk_engine = NltkTokenizerAdapter(method="stem")
cleaner_nltk = TextCleaner(engine=nltk_engine)
print(f"NLTK Stem Result: {cleaner_nltk.clean(raw_text)}")
# Sortie attendue: "student studi check email" (Note le 'studi' tronqu√©)

# --- CAS 3 : Utilisation de NLTK avec Lemmatisation ---
nltk_lem = NltkTokenizerAdapter(method="lemmatize")
cleaner_lem = TextCleaner(engine=nltk_lem)
print(f"NLTK Lemma Result: {cleaner_lem.clean(raw_text)}")

In [None]:
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository

# 1. D√©finir le mapping des labels (nom -> index num√©rique)
label_mapping = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Bipolar": 4,
    "Stress": 5,
    "Personality disorder": 6
}

# 2. Charger les donn√©es
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=label_mapping
)
raw_data = repo.load()

# 3. Transformer en dataset exploitable
dataset = []
for document, label in raw_data:
    dataset.append({
        'text': document.text,
        'label': label.name,
        'label_id': label.index
    })

# Afficher quelques exemples
print(f"Nombre total d'exemples : {len(dataset)}")
print("\nPremiers exemples :")
for i, item in enumerate(dataset[:3]):
    print(f"\nExemple {i+1}:")
    print(f"  Texte: {item['text'][:100]}...")
    print(f"  Label: {item['label']} (ID: {item['label_id']})")

In [None]:
# Alternative : format simplifi√©
dataset = [
    (document.text, label.index) 
    for document, label in raw_data
]

# Exemple d'utilisation
texts, labels = zip(*dataset)
print(f"Nombre de textes : {len(texts)}")
print(f"Premier texte : {texts[0]}")
print(f"Premier label : {labels[0]}")

In [None]:
# S√©parer textes et labels
texts = [document.text for document, label in raw_data]
labels = [label.index for document, label in raw_data]
label_names = [label.name for document, label in raw_data]

print(f"Nombre d'exemples : {len(texts)}")
print(f"\nExemple 1:")
print(f"  Texte: {texts[0]}")
print(f"  Label: {label_names[0]} ({labels[0]})")

In [12]:
import spacy
import re
from tqdm import tqdm

# 1. Chargement optimis√© du mod√®le
# On d√©sactive le 'parser' (syntaxe) et le 'ner' (entit√©s nomm√©es) pour gagner en vitesse
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def fast_clean_spacy(texts):
    cleaned_texts = []
    
    # Pr√©traitement l√©ger par RegEx avant d'envoyer √† spaCy
    # On le fait sous forme de g√©n√©rateur pour √©conomiser la m√©moire
    def pre_process(t_list):
        for text in t_list:
            text = text.lower()
            text = re.sub(r'http\S+|www.\S+', '', text)
            text = re.sub(r'@\w+|#\w+', '', text)
            text = re.sub(r'[^a-z\s]', '', text)
            yield text

    # 2. Utilisation de nlp.pipe avec tqdm pour la barre de progression
    # nlp.pipe est beaucoup plus rapide que .apply() car il traite par lots (batches)
    for doc in tqdm(nlp.pipe(pre_process(texts), batch_size=500), 
                    total=len(texts), 
                    desc="Nettoyage spaCy"):
        
        # Lemmatisation et retrait des stopwords / espaces
        words = [token.lemma_ for token in doc if not token.is_stop and not token.is_space]
        cleaned_texts.append(' '.join(words))
        
    return cleaned_texts

# --- Application au DataFrame ---
# --- Donn√©es de test ---
test_sentences = [
    "I am running in the park and I love it! #happy", 
    "Check out this website: https://openai.com for more info.",
    "The cats are sitting on the mats. @someone",
    "Better late than never, but never late is better.",
    "I've been studying NLP with spaCy for 3 hours!"
]

# --- Ex√©cution du test ---
print("Lancement du test...")
cleaned_results = fast_clean_spacy(test_sentences)

# --- Affichage comparatif ---
for original, cleaned in zip(test_sentences, cleaned_results):
    print(f"\nORIGINAL : {original}")
    print(f"NETTOY√â  : {cleaned}")

Lancement du test...


Nettoyage spaCy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 94.08it/s]


ORIGINAL : I am running in the park and I love it! #happy
NETTOY√â  : run park love

ORIGINAL : Check out this website: https://openai.com for more info.
NETTOY√â  : check website info

ORIGINAL : The cats are sitting on the mats. @someone
NETTOY√â  : cat sit mat

ORIGINAL : Better late than never, but never late is better.
NETTOY√â  : well late late well

ORIGINAL : I've been studying NLP with spaCy for 3 hours!
NETTOY√â  : have study nlp spacy hour





In [13]:
import random

# --- G√©n√©rateur de 100 lignes de test ---
templates = [
    "I am running in the park with my {} dogs! #nature",
    "Check this link {} for more details on {} @user",
    "The {} is sitting on the {}, it's so {}!",
    "Learning {} with spaCy is {} than NLTK.",
    "I've been {} for {} hours now, it's {}."
]

words = ["cat", "data", "better", "running", "machine learning", "fast", "website", "amazing"]
urls = ["http://test.com", "https://example.org", "www.nlp-cool.io"]

# Cr√©ation de la liste de 100 phrases
test_sentences_100 = []
for i in range(100):
    tpl = random.choice(templates)
    # On remplit les trous {} avec des mots al√©atoires pour varier
    if tpl.count("{}") == 1:
        sentence = tpl.format(random.choice(words))
    elif tpl.count("{}") == 2:
        sentence = tpl.format(random.choice(urls), random.choice(words))
    else:
        sentence = tpl.format(random.choice(words), random.choice(words), random.choice(words))
    test_sentences_100.append(sentence)

# --- Ex√©cution avec la fonction fast_clean_spacy ---
print(f"Traitement de {len(test_sentences_100)} lignes...")
results = fast_clean_spacy(test_sentences_100)

# Affichage des 5 premiers r√©sultats pour v√©rification
print("\n--- Aper√ßu des r√©sultats (5 premiers) ---")
for i in range(5):
    print(f"Ligne {i+1} : {results[i]}")

Traitement de 100 lignes...


Nettoyage spaCy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 336.81it/s]


--- Aper√ßu des r√©sultats (5 premiers) ---
Ligne 1 : website sit amazing cat
Ligne 2 : run park data dog
Ligne 3 : learn spacy machine learning nltk
Ligne 4 : have cat well hour fast
Ligne 5 : learn spacy machine learning nltk





In [14]:
import pandas as pd
import numpy as np

# Construction d'un dataset h√©t√©rog√®ne
raw_texts = [
    "The AI revolution is transforming software engineering globally.", # Court
    "Natural Language Processing (NLP) allows machines to understand human speech." * 20, # Long/R√©p√©titif
    "I'm running, he runs, she ran; we've been running for miles in the rain.", # Conjugaisons (Test Lemme)
    "Contact us at support@example.com or visit https://nlp.spacy.io for docs.", # Entit√©s/URL
    "This is a very simple sentence." # Tr√®s court
]

# On multiplie pour atteindre 5000 lignes (suffisant pour voir la diff√©rence de perf)
dataset = raw_texts * 1000 
print(f"Dataset pr√™t : {len(dataset)} lignes.")

Dataset pr√™t : 5000 lignes.


In [None]:
import time
import psutil
import os
import pandas as pd
import matplotlib.pyplot as plt
from typing import List

# Simulation d'un dataset de test (remplacez par vos donn√©es r√©elles)
test_data = [
    "The quick brown fox jumps over the lazy dog and looks for more adventure." * 10
] * 1000  # 1000 documents de taille moyenne

def get_memory_usage():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)  # Retourne en Mo

def run_benchmark(adapter, data: List[str], name: str):
    print(f"--- Benchmarking: {name} ---")
    
    start_mem = get_memory_usage()
    start_time = time.time()
    
    # Ex√©cution du traitement par lots
    results = adapter.process_batch(data)
    
    end_time = time.time()
    end_mem = get_memory_usage()
    
    duration = end_time - start_time
    mem_diff = end_mem - start_mem
    throughput = len(data) / duration
    
    return {
        "Mod√®le": name,
        "Temps total (s)": round(duration, 3),
        "Docs/seconde": round(throughput, 2),
        "M√©moire utilis√©e (Mo)": round(mem_diff, 2)
    }

# 1. Initialisation des adaptateurs
spacy_adapter = SpacyTokenizerAdapter(model="en_core_web_sm", batch_size=1000)
#nltk_adapter = NltkTokenizerAdapter(method="lemmatize")

# 2. Lancement des tests
results = []
results.append(run_benchmark(spacy_adapter, test_data, "spaCy (Optimized)"))
results.append(run_benchmark(nltk_adapter, test_data, "NLTK (WordNet)"))

# 3. Affichage des r√©sultats
df_results = pd.DataFrame(results)
print("\nTableau Comparatif :")
display(df_results)

# 4. Visualisation
df_results.set_index("Mod√®le")[["Temps total (s)", "Docs/seconde"]].plot(
    kind="bar", subplots=True, figsize=(10, 6), title="Performance Comparison"
)
plt.tight_layout()
plt.show()

In [15]:
import pandas as pd
import numpy as np

# Construction d'un dataset h√©t√©rog√®ne
raw_texts = [
    "The AI revolution is transforming software engineering globally.", # Court
    "Natural Language Processing (NLP) allows machines to understand human speech." * 20, # Long/R√©p√©titif
    "I'm running, he runs, she ran; we've been running for miles in the rain.", # Conjugaisons (Test Lemme)
    "Contact us at support@example.com or visit https://nlp.spacy.io for docs.", # Entit√©s/URL
    "This is a very simple sentence." # Tr√®s court
]

# On multiplie pour atteindre 5000 lignes (suffisant pour voir la diff√©rence de perf)
dataset = raw_texts * 1000 
print(f"Dataset pr√™t : {len(dataset)} lignes.")

import time
import spacy
from tqdm import tqdm

def benchmark_spacy_configs(texts):
    results = []
    
    # --- CONFIG 1 : NO OPTIMIZED (Unitaire + Tous composants) ---
    nlp_full = spacy.load("en_core_web_sm")
    start = time.time()
    res_1 = [ [t.lemma_ for t in nlp_full(txt) if not t.is_stop] for txt in tqdm(texts, desc="Mode: Unitaire Full") ]
    results.append({"Configuration": "Unitaire (Full Pipeline)", "Temps (s)": time.time() - start})

    # --- CONFIG 2 : SEMI OPTIMIZED (Batch + Tous composants) ---
    start = time.time()
    res_2 = []
    for doc in tqdm(nlp_full.pipe(texts, batch_size=1000), total=len(texts), desc="Mode: Batch Full"):
        res_2.append([t.lemma_ for t in doc if not t.is_stop])
    results.append({"Configuration": "Batch (Full Pipeline)", "Temps (s)": time.time() - start})

    # --- CONFIG 3 : FULL OPTIMIZED (Batch + Disabled Components) ---
    # C'est ce que fait votre classe SpacyTokenizerAdapter
    nlp_fast = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    start = time.time()
    res_3 = []
    for doc in tqdm(nlp_fast.pipe(texts, batch_size=1000), total=len(texts), desc="Mode: Batch Optimized"):
        res_3.append([t.lemma_ for t in doc if not t.is_stop])
    results.append({"Configuration": "Batch (Optimized: No NER/Parser)", "Temps (s)": time.time() - start})

    return pd.DataFrame(results)

# Ex√©cution
df_bench = benchmark_spacy_configs(dataset)
display(df_bench)

Dataset pr√™t : 5000 lignes.


Mode: Unitaire Full: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [01:32<00:00, 54.30it/s]
Mode: Batch Full: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [01:11<00:00, 70.14it/s]
Mode: Batch Optimized: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [00:33<00:00, 149.14it/s]


Unnamed: 0,Configuration,Temps (s)
0,Unitaire (Full Pipeline),92.085369
1,Batch (Full Pipeline),71.28801
2,Batch (Optimized: No NER/Parser),33.527106
