In [7]:
import warnings
import os
import pandas as pd

# Quiet noisy library output: set the TensorFlow C++ log-level variable and
# ignore every FutureWarning raised from here on.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Notebook display settings: a value of None lifts pandas' limit on both the
# column width and the number of rows rendered.
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

In [16]:

from mh_nlp.domain.entities.document import Document
from mh_nlp.domain.services.text_cleaner import TextCleaner

# ==========================================
# 1. SERVICE INITIALISATION
# ==========================================
# Instantiate the cleaner (domain business logic).
cleaner = TextCleaner()

# ==========================================
# 2. TEST DATASET PREPARATION
# ==========================================
# Hand-written documents covering the edge cases the cleaner must handle.
# Emoji are written as real Unicode characters: the first document must be
# 32 characters long, i.e. it ends with a single emoji code point.
data = [
    # --- Group A: standard cases & emotions ---
    Document(text="I am NOT feeling good today... 😞"),
    Document(text="I feel anxious and I can't sleep."),
    Document(text="I am feeling very happy and energetic!"),
    Document(text="I feel very depressed and lonely today."),
    Document(text="vErY ANXIOUS AND sTrEsSeD out right now."),
    Document(text="Sad."),

    # --- Group B: social-media & web noise (URLs, mentions, hashtags, HTML) ---
    Document(text="Check this out: https://help.com/support @user123 #mentalhealth"),
    Document(text="RT @user123: This is so sad!!! #mentalhealth #depression #help @charity_org"),
    Document(text="Found help here: https://support.org/help?id=123 and http://test.com/path"),
    Document(text="<p>This is a <b>test</b> of HTML removal.</p>"),

    # --- Group C: punctuation & special characters ---
    Document(text="HELP ME !!!... ??? (I am not okay) [urgent] *sigh*"),
    Document(text="!!! ??? @@@ ###"),
    Document(text="I am happy today! 😊🌟 Love life! ❤️"),
    Document(text="IsThisReal?NoItIsnt!HelpMe."),

    # --- Group D: technical formats & numbers ---
    Document(text="I have been suffering since 2010. 10/10 would not recommend."),
    Document(text="First line.\nSecond line with\ttabs."),
    Document(text="1234567890"),
    Document(text="2023-01-01 12:00:00 [ERROR] User is feeling low"),

    # --- Group E: "empty" or otherwise problematic inputs ---
    Document(text="    "),
    Document(text=" "),
    Document(text=""),
    Document(text="/ restlessness appears. Thanks!"),
    Document(text="I am " + "so " * 50 + "tired."),  # long input (performance / length test)

    # --- Group F: linguistics (apostrophes & slang) ---
    Document(text="I can't sleep, I'm exhausted and shouldn't stay up."),
    Document(text="I am soooooooo hhaapppyyyyyyy")
]

# ==========================================
# 3. RUN THE CLEANING PIPELINE
# ==========================================
# One summary record per document: a preview of the input, the cleaned text,
# both lengths, and a status flag telling whether anything survived cleaning.
results = []

for doc in data:
    # Domain-level cleaning call.
    cleaned_text = cleaner.clean(doc)

    # Inputs longer than 100 characters are shown stripped, cut to 100
    # characters and suffixed with "..."; shorter inputs are shown untouched.
    raw_text = doc.text
    if len(raw_text) > 100:
        preview = raw_text.strip()[:100] + "..."
    else:
        preview = raw_text

    row = {
        "Original": preview,
        "Cleaned": cleaned_text,
        "Length_Before": len(raw_text),
        "Length_After": len(cleaned_text),
        "Status": "Empty" if not cleaned_text else "OK",
    }
    results.append(row)

# ==========================================
# 4. VISUALISE AND ANALYSE THE RESULTS
# ==========================================
# Build the frame once from the list of records, then render it.
df_cleaned = pd.DataFrame(results)

document_count = len(data)
print(f"Analyse de {document_count} documents termin√©e.\n")
display(df_cleaned)



Analyse de 25 documents termin√©e.



Unnamed: 0,Original,Cleaned,Length_Before,Length_After,Status
0,I am NOT feeling good today... üòû,am not feeling good today,32,25,OK
1,I feel anxious and I can't sleep.,feel anxious and can't sleep,33,28,OK
2,I am feeling very happy and energetic!,am feeling very happy and energetic,38,35,OK
3,I feel very depressed and lonely today.,feel very depressed and lonely today,39,36,OK
4,vErY ANXIOUS AND sTrEsSeD out right now.,very anxious and stressed out right now,40,39,OK
5,Sad.,sad,4,3,OK
6,Check this out: https://help.com/support @user123 #mentalhealth,check this out,63,14,OK
7,RT @user123: This is so sad!!! #mentalhealth #depression #help @charity_org,rt this is so sad,75,17,OK
8,Found help here: https://support.org/help?id=123 and http://test.com/path,found help here and,73,19,OK
9,<p>This is a <b>test</b> of HTML removal.</p>,this is test of html removal,45,28,OK


In [17]:

from mh_nlp.infrastructure.nlp.spacy_tokenizer import SpacyTokenizer

# Wrap every cleaned string from the results frame in a Document entity.
documents = list(map(Document, df_cleaned.Cleaned))

# spaCy tokenizer with lemmatisation, explicitly passing remove_stop=True.
spacy_tok = SpacyTokenizer(
    model_name="en_core_web_sm",
    lemmatize=True,
    remove_stop=True,
    batch_size=10,
)
spacy_output = spacy_tok.tokenize(documents)

# Print the first seven documents (indices 0..6) next to their tokens.
for i, document in enumerate(documents[:7]):
    print(f"Text nettoy√© original: {document.text}")
    print(f"Sortie spaCy (Lemmes): {spacy_output[i]}")
    print("--"*3)


[32m2026-01-17 01:39:50.408[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m40[0m - [1mInitialisation de SpacyTokenizer : en_core_web_sm[0m
[32m2026-01-17 01:39:50.810[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m43[0m - [32m[1mMod√®le spaCy 'en_core_web_sm' pr√™t.[0m
[32m2026-01-17 01:39:50.811[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36mtokenize[0m:[36m64[0m - [1mTokenisation de 25 documents (Batch: 10)[0m
NLP Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:00<00:00, 700.87doc/s]

Text nettoy√© original: am not feeling good today
Sortie spaCy (Lemmes): ['feel', 'good', 'today']
------
Text nettoy√© original: feel anxious and can't sleep
Sortie spaCy (Lemmes): ['feel', 'anxious', 'sleep']
------
Text nettoy√© original: am feeling very happy and energetic
Sortie spaCy (Lemmes): ['feel', 'happy', 'energetic']
------
Text nettoy√© original: feel very depressed and lonely today
Sortie spaCy (Lemmes): ['feel', 'depressed', 'lonely', 'today']
------
Text nettoy√© original: very anxious and stressed out right now
Sortie spaCy (Lemmes): ['anxious', 'stress', 'right']
------
Text nettoy√© original: sad
Sortie spaCy (Lemmes): ['sad']
------
Text nettoy√© original: check this out
Sortie spaCy (Lemmes): ['check']
------





In [18]:
from mh_nlp.infrastructure.nlp.spacy_tokenizer import SpacyTokenizer

# Wrap every cleaned string from the results frame in a Document entity.
documents = list(map(Document, df_cleaned.Cleaned))

# spaCy tokenizer with lemmatisation; remove_stop is not passed here, so the
# tokenizer's own default applies.
spacy_tok = SpacyTokenizer(
    model_name="en_core_web_sm",
    lemmatize=True,
    batch_size=10,
)
spacy_output = spacy_tok.tokenize(documents)

# Print the first seven documents (indices 0..6) next to their tokens.
for i, document in enumerate(documents[:7]):
    print(f"Text nettoy√© original: {document.text}")
    print(f"Sortie spaCy (Lemmes): {spacy_output[i]}")
    print("--"*3)


[32m2026-01-17 01:39:51.062[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m40[0m - [1mInitialisation de SpacyTokenizer : en_core_web_sm[0m
[32m2026-01-17 01:39:51.329[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m43[0m - [32m[1mMod√®le spaCy 'en_core_web_sm' pr√™t.[0m
[32m2026-01-17 01:39:51.330[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36mtokenize[0m:[36m64[0m - [1mTokenisation de 25 documents (Batch: 10)[0m
NLP Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:00<00:00, 1289.60doc/s]

Text nettoy√© original: am not feeling good today
Sortie spaCy (Lemmes): ['feel', 'good', 'today']
------
Text nettoy√© original: feel anxious and can't sleep
Sortie spaCy (Lemmes): ['feel', 'anxious', 'sleep']
------
Text nettoy√© original: am feeling very happy and energetic
Sortie spaCy (Lemmes): ['feel', 'happy', 'energetic']
------
Text nettoy√© original: feel very depressed and lonely today
Sortie spaCy (Lemmes): ['feel', 'depressed', 'lonely', 'today']
------
Text nettoy√© original: very anxious and stressed out right now
Sortie spaCy (Lemmes): ['anxious', 'stress', 'right']
------
Text nettoy√© original: sad
Sortie spaCy (Lemmes): ['sad']
------
Text nettoy√© original: check this out
Sortie spaCy (Lemmes): ['check']
------





In [20]:
from loguru import logger
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.application.use_cases.build_dataset import BuildCleanDatasetUseCase

# --- 1. CONFIGURATION ---
# Class name -> integer id mapping handed to the repository.
label_mapping = {"Anxiety": 0, "Normal": 1, "Depression": 2}

# --- 2. COMPONENT INSTANTIATION ---
# Data access (CSV-backed repository) and the text-cleaning service.
repository = KaggleDatasetRepository(
    label_mapping=label_mapping,
    csv_path="../data/raw/mental_health.csv",
)
cleaner = TextCleaner()

# --- 3. ORCHESTRATION ---
# Inject both dependencies into the use case.
use_case = BuildCleanDatasetUseCase(cleaner=cleaner, repository=repository)

# --- 4. EXECUTION ---
# Run the pipeline; the result exposes `.documents` and `.labels`.
dataset_final = use_case.execute()

# --- 5. DATA EXTRACTION ---
# Pull the plain text and labels out of the result into two lists.
texts = [document.text for document in dataset_final.documents]
labels = list(dataset_final.labels)

# Assemble the DataFrame from a column-name -> values dictionary.
columns = {
    'cleaned text': texts,
    'label': labels,
}
dataset = pd.DataFrame(columns)

# Show the first rows as a sanity check.
display(dataset.head(3))

[32m2026-01-17 01:40:48.001[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m47[0m - [1mD√©marrage du pipeline BuildCleanDataset.[0m
[32m2026-01-17 01:40:48.002[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m33[0m - [1mD√©marrage du chargement des donn√©es depuis : ../data/raw/mental_health.csv[0m
[32m2026-01-17 01:40:48.299[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m57[0m - [32m[1mInfrastructure : 1000 paires (Document, Label) cr√©√©es.[0m
[32m2026-01-17 01:40:48.302[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m66[0m - [34m[1mTraitement unitaire de 1000 documents.[0m
[32m2026-01-17 01:40:48.337[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m92[0m - [32m[1mPipeline termin√© : 1000 entit√©s sy

Unnamed: 0,cleaned text,label
0,oh my gosh,0
1,trouble sleeping confused mind restless heart all out of tune,0
2,all wrong back off dear forward doubt stay in restless and restless place,0


# Split Cleaned Dataset

In [22]:
from mh_nlp.application.use_cases.split_dataset import SplitDatasetUseCase

# 1. Build the split use case with the chosen ratios:
#    test_size=0.2 and val_size=0.1, i.e. 70% train / 10% val / 20% test.
splitter = SplitDatasetUseCase(val_size=0.1, test_size=0.2)

# 2. Run the split on the cleaned dataset.
splits = splitter.execute(dataset_final)

# 3. Unpack the three partitions from the returned mapping.
train_set, val_set, test_set = (
    splits[split_name] for split_name in ("train", "val", "test")
)

# Quick inspection of the training partition: size, first text, first label.
print(f"Taille Train : {train_set.total_records}")
print(f"Premier document Train : {train_set.documents[0].text}")
print(f"Premier label Train : {train_set.labels[0]}")

[32m2026-01-17 01:43:44.783[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.split_dataset[0m:[36mexecute[0m:[36m53[0m - [1mD√©coupage du dataset (Total: 1000) | Cible: Test=0.2, Val=0.1[0m
[32m2026-01-17 01:43:44.841[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.application.use_cases.split_dataset[0m:[36mexecute[0m:[36m86[0m - [32m[1mSplit r√©ussi : Train=700 | Val=100 | Test=200[0m


Taille Train : 700
Premier document Train : anxiety fear overthinking at the same time can be called panic attack right
Premier label Train : 0


# Apply Tokenizer function

In [24]:
from mh_nlp.domain.entities.document import Document

# Minimal test set: three short English sentences.
raw_texts = [
    "I am feeling very anxious today and I can't sleep.",
    "The therapy sessions are helping me manage my stress better.",
    "I feel much better after talking to my friend.",
]

# Convert each raw string into a domain Document entity.
documents = list(map(Document, raw_texts))

# A. Le Tokenizer spaCy (Linguistique)

In [25]:
from mh_nlp.infrastructure.nlp.spacy_tokenizer import SpacyTokenizer

# spaCy tokenizer with lemmatisation, processing the documents in batches of 2.
spacy_tok = SpacyTokenizer(
    model_name="en_core_web_sm",
    lemmatize=True,
    batch_size=2,
)
spacy_output = spacy_tok.tokenize(documents)

# Show the tokens produced for the first document.
# Expected output: ['feel', 'anxious', 'today', 'sleep']
first_document_tokens = spacy_output[0]
print("--- Sortie spaCy (Lemmes) ---")
print(first_document_tokens)

[32m2026-01-17 13:59:13.025[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m40[0m - [1mInitialisation de SpacyTokenizer : en_core_web_sm[0m
[32m2026-01-17 13:59:15.213[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36m__init__[0m:[36m43[0m - [32m[1mMod√®le spaCy 'en_core_web_sm' pr√™t.[0m
[32m2026-01-17 13:59:15.234[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.spacy_tokenizer[0m:[36mtokenize[0m:[36m64[0m - [1mTokenisation de 3 documents (Batch: 2)[0m
NLP Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 28.39doc/s]

--- Sortie spaCy (Lemmes) ---
['feel', 'anxious', 'today', 'sleep']





# B. Le Tokenizer HuggingFace (Deep Learning)

In [26]:
from mh_nlp.infrastructure.nlp.hf_tokenizer import HuggingFaceTokenizer

# HuggingFace tokenizer for DistilBERT, configured with max_length=16.
hf_tok = HuggingFaceTokenizer(
    model_name="distilbert-base-uncased",
    max_length=16,
)
hf_output = hf_tok.tokenize(documents)

# The result is indexed by key; 'input_ids' holds the numeric token IDs.
input_ids = hf_output['input_ids']

print("\n--- Sortie HuggingFace (Tenseurs) ---")
print(input_ids[0])  # numeric IDs of the first document
print(f"Shape: {input_ids.shape}")

[32m2026-01-17 13:59:22.548[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36m__init__[0m:[36m34[0m - [1mChargement du tokenizer HF : distilbert-base-uncased (max_length=16)[0m
[32m2026-01-17 13:59:23.703[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36m__init__[0m:[36m36[0m - [32m[1mTokenizer distilbert-base-uncased charg√© avec succ√®s.[0m
[32m2026-01-17 13:59:23.708[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36mtokenize[0m:[36m63[0m - [34m[1mTokenisation en cours pour 3 documents (Mod√®le: distilbert-base-uncased)[0m
[32m2026-01-17 13:59:23.740[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36mtokenize[0m:[36m76[0m - [34m[1mTokenisation termin√©e. Forme du tenseur : [3, 15][0m



--- Sortie HuggingFace (Tenseurs) ---
tensor([  101,  1045,  2572,  3110,  2200, 11480,  2651,  1998,  1045,  2064,
         1005,  1056,  3637,  1012,   102])
Shape: torch.Size([3, 15])


In [27]:
# Display the input-ID tensor for all documents in the batch (bare last
# expression, so the notebook renders its repr).
hf_output['input_ids']

tensor([[  101,  1045,  2572,  3110,  2200, 11480,  2651,  1998,  1045,  2064,
          1005,  1056,  3637,  1012,   102],
        [  101,  1996,  7242,  6521,  2024,  5094,  2033,  6133,  2026,  6911,
          2488,  1012,   102,     0,     0],
        [  101,  1045,  2514,  2172,  2488,  2044,  3331,  2000,  2026,  2767,
          1012,   102,     0,     0,     0]])

# C. Le Tokenizer Keras (CNN/LSTM)

In [16]:
from mh_nlp.infrastructure.nlp.keras_tokenizer import KerasTextTokenizer

# Keras-style tokenizer configured with vocab_size=1000 and max_length=16.
keras_tok = KerasTextTokenizer(max_length=16, vocab_size=1000)

# The vocabulary has to be fitted before tokenising (mandatory for Keras).
keras_tok.fit(documents)
keras_output = keras_tok.tokenize(documents)

# Show the encoded first document and the overall matrix shape.
first_sequence = keras_output[0]
print("\n--- Sortie Keras (Numpy Matrix) ---")
print(first_sequence)
print(f"Shape: {keras_output.shape}")

[32m2026-01-16 11:34:06.066[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.keras_tokenizer[0m:[36m__init__[0m:[36m30[0m - [1mKerasTextTokenizer initialis√© (Vocab target: 1000, Max length: 16)[0m
[32m2026-01-16 11:34:06.079[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.keras_tokenizer[0m:[36mfit[0m:[36m46[0m - [1mApprentissage du vocabulaire sur 3 documents...[0m
[32m2026-01-16 11:34:06.082[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.keras_tokenizer[0m:[36mfit[0m:[36m52[0m - [32m[1mVocabulaire appris : 26 mots uniques trouv√©s.[0m
[32m2026-01-16 11:34:06.093[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.infrastructure.nlp.keras_tokenizer[0m:[36mtokenize[0m:[36m82[0m - [34m[1mS√©quences g√©n√©r√©es : (3, 16) (Type: int32)[0m



--- Sortie Keras (Numpy Matrix) ---
[ 2  5  6  7  8  9 10  2 11 12  0  0  0  0  0  0]
Shape: (3, 16)
