In [1]:
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from loguru import logger

# 1. Définir le mapping des labels
label_mapping = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Bipolar": 4,
    "Stress": 5,
    "Personality disorder": 6
}

# 2. Charger les données
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=label_mapping
)
data = repo.load()

#data

[32m2026-01-14 16:12:41.919[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m33[0m - [1mDémarrage du chargement des données depuis : ../data/raw/mental_health.csv[0m
[32m2026-01-14 16:12:43.047[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m57[0m - [32m[1mInfrastructure : 53043 paires (Document, Label) créées.[0m


In [None]:
from loguru import logger
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.infrastructure.nlp.tokenizers import SpacyTokenizerAdapter
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.application.use_cases.build_clean_dataset import BuildCleanDatasetUseCase


# 0. Définir le mapping des labels
label_mapping = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Bipolar": 4,
    "Stress": 5,
    "Personality disorder": 6
}

# 1. Setup des composants (Wiring)
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping={"Anxiety": 0, "Normal": 1, "Depression": 2}
)
cleaner = TextCleaner(engine=SpacyTokenizerAdapter())

# 2. Initialisation du Use Case
use_case = BuildCleanDatasetUseCase(repository=repo, cleaner=cleaner)

# 3. Exécution
dataset_final = use_case.execute()

# Maintenant, dataset_final.texts et dataset_final.labels sont prêts pour Sklearn ou PyTorch !

[(Document(text='oh my gosh'), Label(name='Anxiety', index=3)),
 (Document(text='trouble sleeping, confused mind, restless heart. All out of tune'),
  Label(name='Anxiety', index=3)),
 (Document(text='All wrong, back off dear, forward doubt. Stay in a restless and restless place'),
  Label(name='Anxiety', index=3)),
 (Document(text="I've shifted my focus to something else but I'm still worried"),
  Label(name='Anxiety', index=3)),
 (Document(text="I'm restless and restless, it's been a month now, boy. What do you mean?"),
  Label(name='Anxiety', index=3)),
 (Document(text='every break, you must be nervous, like something is wrong, but what the heck'),
  Label(name='Anxiety', index=3)),
 (Document(text='I feel scared, anxious, what can I do? And may my family or us be protected :)'),
  Label(name='Anxiety', index=3)),
 (Document(text="Have you ever felt nervous but didn't know why?"),
  Label(name='Anxiety', index=3)),
 (Document(text="I haven't slept well for 2 days, it's like I'm rest

In [4]:
data.total_read

AttributeError: 'list' object has no attribute 'total_read'

In [None]:
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository

# 1. Définir le mapping des labels (nom -> index numérique)
label_mapping = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Bipolar": 4,
    "Stress": 5,
    "Personality disorder": 6
}

# 2. Charger les données
repo = KaggleDatasetRepository(
    csv_path="../data/raw/mental_health.csv",
    label_mapping=label_mapping
)
raw_data = repo.load()

# 3. Transformer en dataset exploitable
dataset = []
for document, label in raw_data:
    dataset.append({
        'text': document.text,
        'label': label.name,
        'label_id': label.index
    })

# Afficher quelques exemples
print(f"Nombre total d'exemples : {len(dataset)}")
print("\nPremiers exemples :")
for i, item in enumerate(dataset[:3]):
    print(f"\nExemple {i+1}:")
    print(f"  Texte: {item['text'][:100]}...")
    print(f"  Label: {item['label']} (ID: {item['label_id']})")

In [None]:
# Alternative : format simplifié
dataset = [
    (document.text, label.index) 
    for document, label in raw_data
]

# Exemple d'utilisation
texts, labels = zip(*dataset)
print(f"Nombre de textes : {len(texts)}")
print(f"Premier texte : {texts[0]}")
print(f"Premier label : {labels[0]}")

In [None]:
# Séparer textes et labels
texts = [document.text for document, label in raw_data]
labels = [label.index for document, label in raw_data]
label_names = [label.name for document, label in raw_data]

print(f"Nombre d'exemples : {len(texts)}")
print(f"\nExemple 1:")
print(f"  Texte: {texts[0]}")
print(f"  Label: {label_names[0]} ({labels[0]})")

In [None]:
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.infrastructure.nlp.adapters import SpacyAdapter
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.application.use_cases.build_clean_dataset import BuildCleanDatasetUseCase

# 1. Setup des composants (Wiring)
repo = KaggleDatasetRepository(
    csv_path="mental_health.csv", 
    label_mapping={"Anxiety": 0, "Normal": 1, "Depression": 2}
)
cleaner = TextCleaner(engine=SpacyAdapter())

# 2. Initialisation du Use Case
use_case = BuildCleanDatasetUseCase(repository=repo, cleaner=cleaner)

# 3. Exécution
dataset_final = use_case.execute()

# Maintenant, dataset_final.texts et dataset_final.labels sont prêts pour Sklearn ou PyTorch !