# Préparation des Données
### Pour la Détection d'Intention

In [6]:
import yaml
import json

# Load the NLU YAML file (presumably Rasa NLU training data — confirm).
# An empty file parses to None, so fall back to an empty mapping.
with open("nlu.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file) or {}

# A bare "nlu:" key parses to None; treat it as an empty list.
nlu_data = data.get("nlu") or []
converted_data = []

# Flatten every labelled entry into {"text", "intent"} records.
for item in nlu_data:
    intent = item.get("intent")
    if intent is None:
        # Entries without an "intent" key would be written with a null label,
        # which is useless for intent classification — skip them.
        continue

    # "examples:" with no value parses to None rather than "".
    examples = item.get("examples") or ""

    # Each example sits on its own line of the block scalar.
    for line in examples.strip().split("\n"):
        line = line.strip()
        # Drop only the single leading "-" list marker. str.strip("- ") treats
        # its argument as a character set and would also remove hyphens that
        # are part of the example itself (e.g. "- -5 degrés" or a trailing "-").
        if line.startswith("-"):
            line = line[1:].strip()
        if line:
            converted_data.append({
                "text": line,
                "intent": intent
            })

# Write the records as a JSON array, keeping non-ASCII characters readable.
with open("nlu.json", "w", encoding="utf-8") as json_file:
    json.dump(converted_data, json_file, indent=2, ensure_ascii=False)

print("Conversion terminée : nlu.json créé.")


✅ Fichier 'intents_dataset.json' prêt pour entraînement.


In [None]:
import yaml
import json

# Load the NLU YAML file (presumably Rasa NLU training data — confirm).
# An empty file parses to None, so fall back to an empty mapping.
with open("nlu.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file) or {}

converted_data = []

# Read the intents; a bare "nlu:" key parses to None, hence the "or []".
for item in data.get("nlu") or []:
    intent = item.get("intent")
    if intent is None:
        # Entries without an "intent" key would be written with a null label,
        # which is useless for intent classification — skip them.
        continue

    # "examples:" with no value parses to None rather than "".
    examples = item.get("examples") or ""

    for line in examples.strip().split("\n"):
        line = line.strip()
        # Drop only the single leading "-" list marker. str.strip("- ") treats
        # its argument as a character set and would also remove hyphens that
        # are part of the example itself (e.g. "- -5 degrés" or a trailing "-").
        if line.startswith("-"):
            line = line[1:].strip()
        if line:
            converted_data.append({
                "text": line,
                "intent": intent
            })

# Write the records as a JSON array, keeping non-ASCII characters readable.
with open("intents_dataset.json", "w", encoding="utf-8") as outfile:
    json.dump(converted_data, outfile, indent=2, ensure_ascii=False)

print(" Fichier 'intents_dataset.json' prêt pour entraînement.")


# Préparation des Données
### Pour l'Extraction d'Entités

In [5]:
import yaml
import json
import re

def extract_entities(example):
    """Remove inline ``[text](label)`` annotations and locate each entity.

    Args:
        example: Example string that may contain ``[text](label)`` annotations,
            e.g. ``"aller à [Paris](city)"``.

    Returns:
        A ``(text, entities)`` tuple. ``text`` is the example with every
        annotation replaced by its bracketed text, with leading and trailing
        whitespace removed. ``entities`` is a list of dicts with the keys
        ``"entity"`` (the label), ``"value"`` (the bracketed text), ``"start"``
        and ``"end"``; the offsets are character positions in the returned
        ``text``, with ``end`` exclusive.
    """
    entities = []
    plain_text = ""
    pattern = r"\[(.+?)\]\((.+?)\)"

    last_end = 0
    for match in re.finditer(pattern, example):
        start, end = match.span()
        entity_text, entity_type = match.groups()

        # Copy the un-annotated text that precedes this entity.
        plain_text += example[last_end:start]

        # Offsets are measured in the annotation-free text being built,
        # not in the annotated input.
        entity_start = len(plain_text)
        plain_text += entity_text
        entity_end = len(plain_text)

        entities.append({
            "entity": entity_type,
            "value": entity_text,
            "start": entity_start,
            "end": entity_end
        })

        last_end = end

    # Copy whatever follows the last annotation.
    plain_text += example[last_end:]

    # The returned text is stripped, so the offsets must be shifted by the
    # amount of leading whitespace removed; otherwise every span points past
    # its entity. Clamp to the stripped text in case an annotation itself
    # touched the stripped whitespace.
    text = plain_text.strip()
    leading = len(plain_text) - len(plain_text.lstrip())
    for entity in entities:
        entity["start"] = min(max(entity["start"] - leading, 0), len(text))
        entity["end"] = min(max(entity["end"] - leading, 0), len(text))

    return text, entities

# Load the entity-annotated NLU YAML file (presumably Rasa NLU training data —
# confirm). An empty file parses to None, so fall back to an empty mapping.
with open("nluEnt.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file) or {}

converted_data = []

# Walk every NLU entry; a bare "nlu:" key parses to None, hence the "or []".
for item in data.get("nlu") or []:
    # "examples:" with no value parses to None rather than "".
    examples = item.get("examples") or ""
    for line in examples.strip().split("\n"):
        line = line.strip()
        # Drop only the single leading "-" list marker. str.strip("- ") treats
        # its argument as a character set and would also remove hyphens that
        # are part of the example itself (e.g. "- -5 degrés" or a trailing "-").
        if line.startswith("-"):
            line = line[1:].strip()
        if line:
            text, entities = extract_entities(line)
            converted_data.append({
                "text": text,
                "entities": entities
            })

# Write the records as a JSON array, keeping non-ASCII characters readable.
with open("nlu_entities.json", "w", encoding="utf-8") as json_file:
    json.dump(converted_data, json_file, indent=2, ensure_ascii=False)

print("Conversion terminée : fichier 'nlu_entities.json' créé.")


✅ Fichier 'ner_dataset.json' prêt pour le fine-tuning.


In [None]:
import yaml
import json
import re

def convert_example_to_tokens_and_tags(example):
    """Turn an example with ``[text](label)`` annotations into tokens and tags.

    The example is split on whitespace. Words outside any annotation are
    tagged ``"O"``; the first word of an annotated span is tagged
    ``"B-<label>"`` and the following words of that span ``"I-<label>"``.

    Args:
        example: Example string that may contain ``[text](label)`` annotations.

    Returns:
        A dict ``{"tokens": [...], "ner_tags": [...]}`` whose two lists have
        the same length.
    """
    annotation_re = re.compile(r"\[(.+?)\]\((.+?)\)")
    words = []
    labels = []
    cursor = 0

    for annotation in annotation_re.finditer(example):
        surface = annotation.group(1)
        label = annotation.group(2)

        # Plain words between the previous annotation and this one.
        outside_words = example[cursor:annotation.start()].split()
        words += outside_words
        labels += ["O"] * len(outside_words)

        # Words of the annotated span: B- for the first, I- for the rest.
        inside_words = surface.split()
        words += inside_words
        labels += [
            ("B-" if position == 0 else "I-") + label
            for position in range(len(inside_words))
        ]

        cursor = annotation.end()

    # Plain words trailing the final annotation (or the whole example when
    # there is no annotation at all).
    trailing_words = example[cursor:].split()
    words += trailing_words
    labels += ["O"] * len(trailing_words)

    return {"tokens": words, "ner_tags": labels}

# Load the entity-annotated NLU YAML file (presumably Rasa NLU training data —
# confirm). An empty file parses to None, so fall back to an empty mapping.
with open("nluEnt.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file) or {}

converted_data = []

# Walk every NLU entry; a bare "nlu:" key parses to None, hence the "or []".
for item in data.get("nlu") or []:
    # "examples:" with no value parses to None rather than "".
    examples = item.get("examples") or ""
    for line in examples.strip().split("\n"):
        line = line.strip()
        # Drop only the single leading "-" list marker. str.strip("- ") treats
        # its argument as a character set and would also remove hyphens that
        # are part of the example itself (e.g. "- -5 degrés" or a trailing "-").
        if line.startswith("-"):
            line = line[1:].strip()
        if line:
            sample = convert_example_to_tokens_and_tags(line)
            if sample["tokens"]:  # skip examples that yield no tokens
                converted_data.append(sample)

# Write all samples as one JSON array (this is not JSONL: a single document).
with open("ner_dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(converted_data, json_file, indent=2, ensure_ascii=False)

print("Fichier 'ner_dataset.json' prêt pour le fine-tuning.")
