In [3]:
import torch
from mh_nlp.application.use_cases.train_model import TrainModelUseCase

# Hardware detection: use the GPU through CUDA when PyTorch reports one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Utilisation de : {device}")

# NOTE: 'splits' ({"train": DTO, "val": DTO, "test": DTO}) is not defined yet at this point;
# it is produced further down in this notebook by SplitDatasetUseCase, before training.

Utilisation de : cpu


In [4]:
from loguru import logger
from mh_nlp.domain.services.text_cleaner import TextCleaner
from mh_nlp.infrastructure.data.kaggle_repository import KaggleDatasetRepository
from mh_nlp.application.use_cases.build_dataset import BuildCleanDatasetUseCase

# 1. Mapping from the class names to the integer labels used for training.
#    The position of each name in the tuple is its label id.
label_mapping = {
    class_name: class_id
    for class_id, class_name in enumerate(("Anxiety", "Normal", "Depression"))
}

# 2. Data access layer (CSV-backed repository) and text cleaning service.
repository = KaggleDatasetRepository(csv_path="../data/raw/mental_health.csv", label_mapping=label_mapping)
cleaner = TextCleaner()

# 3. Inject both dependencies into the use case.
use_case = BuildCleanDatasetUseCase(cleaner=cleaner, repository=repository)

# 4. Run the pipeline; the result is kept as `dataset_final` for the next cells.
dataset_final = use_case.execute()

[32m2026-01-17 15:14:15.591[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m47[0m - [1mDémarrage du pipeline BuildCleanDataset.[0m
[32m2026-01-17 15:14:15.591[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m33[0m - [1mDémarrage du chargement des données depuis : ../data/raw/mental_health.csv[0m
[32m2026-01-17 15:14:16.049[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.data.kaggle_repository[0m:[36mload[0m:[36m57[0m - [32m[1mInfrastructure : 1000 paires (Document, Label) créées.[0m
[32m2026-01-17 15:14:16.054[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m66[0m - [34m[1mTraitement unitaire de 1000 documents.[0m
[32m2026-01-17 15:14:16.066[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.application.use_cases.build_dataset[0m:[36mexecute[0m:[36m92[0m - [32m[1mPipeline terminé : 1000 entités synchroni

In [5]:
from mh_nlp.application.use_cases.split_dataset import SplitDatasetUseCase

# 1. Configure the splitter: 20% test and 10% validation,
#    which leaves 70% of the records for training.
splitter = SplitDatasetUseCase(val_size=0.1, test_size=0.2)

# 2. Split the cleaned dataset produced by the previous cell.
splits = splitter.execute(dataset_final)

# 3. Unpack the three partitions from the keyed result.
train_set, val_set, test_set = (splits[part] for part in ("train", "val", "test"))

# Quick sanity check on the training partition: its size and its first sample.
print(f"Taille Train : {train_set.total_records}")
print(f"Premier document Train : {train_set.documents[0].text}")
print(f"Premier label Train : {train_set.labels[0]}")

[32m2026-01-17 15:14:19.865[0m | [1mINFO    [0m | [36mmh_nlp.application.use_cases.split_dataset[0m:[36mexecute[0m:[36m53[0m - [1mDécoupage du dataset (Total: 1000) | Cible: Test=0.2, Val=0.1[0m
[32m2026-01-17 15:14:19.871[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.application.use_cases.split_dataset[0m:[36mexecute[0m:[36m86[0m - [32m[1mSplit réussi : Train=700 | Val=100 | Test=200[0m


Taille Train : 700
Premier document Train : anxiety fear overthinking at the same time can be called panic attack right
Premier label Train : 0


In [7]:
from mh_nlp.infrastructure.nlp.hf_tokenizer import HuggingFaceTokenizer
from mh_nlp.infrastructure.models.distilbert_classifier import DistilBertClassifier

# 0. Reproducibility: the classification head is newly (randomly) initialised
#    when the checkpoint is loaded, so seed torch's global RNG beforehand.
#    NOTE(review): other sources of randomness (e.g. data shuffling inside the
#    training loop) may need their own seeding — confirm in the use case.
SEED = 42
torch.manual_seed(SEED)

# 1. Initialisation.
#    The tokenizer and the model must be loaded from the same checkpoint, so the
#    name is declared once and shared instead of being repeated as a literal.
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 64  # maximum sequence length handed to the tokenizer

tokenizer_db = HuggingFaceTokenizer(MODEL_NAME, max_length=MAX_LENGTH)
model_db = DistilBertClassifier(
    model_name=MODEL_NAME,
    num_labels=len(label_mapping),  # one output per class in the label mapping
    tokenizer=tokenizer_db,
    device=device,
)

# 2. Execution (the use case stays agnostic of the concrete model).
train_use_case = TrainModelUseCase(model_db)
train_use_case.execute(splits["train"], splits["val"])

[32m2026-01-17 15:17:40.480[0m | [1mINFO    [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36m__init__[0m:[36m34[0m - [1mChargement du tokenizer HF : distilbert-base-uncased (max_length=64)[0m
[32m2026-01-17 15:17:41.108[0m | [32m[1mSUCCESS [0m | [36mmh_nlp.infrastructure.nlp.hf_tokenizer[0m:[36m__init__[0m:[36m36[0m - [32m[1mTokenizer distilbert-base-uncased chargé avec succès.[0m
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2026-01-17 15:17:41.370[0m | [34m[1mDEBUG   [0m | [36mmh_nlp.application.use_cases.train_model[0m:[36m__init__[0m:[36m28[0m - [34m[1mTrainModelUseCase initialisé avec DistilBertClassifier[0m
[32m2026-01-17 15:17