In [1]:
!pip install datasets transformers


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.11.18-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->da

In [4]:
import os
import random
from collections import defaultdict
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer


In [6]:

############ Reading the corpus ############

def parse_conllu(filepath):
    """Read a tab-separated CoNLL-U style file into sentences of (word, tag) pairs.

    Blank lines and lines starting with ``#`` close the sentence being built.
    For every other line, the second column is taken as the word and the
    fourth as the tag. Lines with fewer than four columns, or whose first
    column contains ``.`` or ``-`` (presumably empty nodes and multiword
    token ranges), are ignored.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, upos)`` tuples in file order.
    """
    sentences = []
    tokens = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()

            # Separator line: flush whatever has been collected so far.
            if stripped == "" or stripped.startswith("#"):
                if tokens:
                    sentences.append(tokens)
                    tokens = []
                continue

            fields = stripped.split("\t")
            if len(fields) < 4:
                continue

            token_id = fields[0]
            if "." in token_id or "-" in token_id:
                continue

            tokens.append((fields[1], fields[3]))

    # The file may end without a trailing separator line.
    if tokens:
        sentences.append(tokens)
    return sentences

######### Swap words that share the same POS tag #########

def build_pos_dict(corpus):
    """Group the words of a corpus by their tag.

    Only words longer than two characters are kept. Repeated words are kept
    as repeated entries, in corpus order.

    Args:
        corpus: Iterable of sentences, each an iterable of ``(word, tag)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each tag to the list of kept words.
    """
    words_by_tag = defaultdict(list)
    eligible = (
        (tag, word)
        for sentence in corpus
        for word, tag in sentence
        if len(word) > 2
    )
    for tag, word in eligible:
        words_by_tag[tag].append(word)
    return words_by_tag

def _pick_substitute(word, pool, rng, max_draws=8):
    """Return a random entry of ``pool`` different from ``word`` (or ``word`` if none exists)."""
    # Rejection sampling: drawing from the full pool and retrying on a clash is
    # uniform over the entries that differ from `word`, without copying the pool.
    for _ in range(max_draws):
        candidate = rng.choice(pool)
        if candidate != word:
            return candidate
    # Rare fallback (pool dominated by `word`): filter explicitly so the
    # function always terminates and stays uniform over the remaining entries.
    remaining = [w for w in pool if w != word]
    return rng.choice(remaining) if remaining else word


def augment_sentence(sentence, pos_dict, p=0.3, rng=None):
    """Randomly replace words of a sentence by other words carrying the same tag.

    Each token whose tag is a key of ``pos_dict`` is replaced with probability
    ``p`` by a word drawn from ``pos_dict[tag]`` that differs from the original
    word. Duplicate entries in the pool weight the draw accordingly. Tags are
    never modified and the sentence length is preserved.

    Args:
        sentence: Iterable of ``(word, tag)`` pairs.
        pos_dict: Mapping from tag to a list of candidate words.
        p: Replacement probability for each eligible token.
        rng: Optional object exposing ``random()`` and ``choice()`` (for example
            a seeded ``random.Random``) for reproducible augmentation. Defaults
            to the global ``random`` module.

    Returns:
        A new list of ``(word, tag)`` pairs; the input is left untouched.
    """
    rng = random if rng is None else rng
    new_sentence = []
    for word, tag in sentence:
        if tag in pos_dict and rng.random() < p:
            pool = pos_dict[tag]
            if pool:
                word = _pick_substitute(word, pool, rng)
        new_sentence.append((word, tag))
    return new_sentence

def augment_corpus(corpus, pos_dict, n_augments=2):
    """Produce ``n_augments`` augmented variants of every sentence in ``corpus``.

    Variants are generated with ``augment_sentence`` using its default
    replacement probability. The variants of one sentence are adjacent in the
    result, and sentences keep their corpus order.

    Args:
        corpus: Iterable of sentences (lists of ``(word, tag)`` pairs).
        pos_dict: Mapping from tag to candidate words, passed to ``augment_sentence``.
        n_augments: Number of variants to generate per sentence.

    Returns:
        A list containing only the augmented sentences (originals are not included).
    """
    return [
        augment_sentence(original, pos_dict)
        for original in corpus
        for _ in range(n_augments)
    ]

################ Convert to a Hugging Face Dataset ################

def prepare_dataset(sentences):
    """Turn a list of tagged sentences into a ``Dataset`` with two columns.

    Args:
        sentences: List of sentences, each a list of ``(word, tag)`` pairs.

    Returns:
        The result of ``Dataset.from_dict`` with a ``"tokens"`` column (the
        words of each sentence) and a ``"labels"`` column (the matching tags).
    """
    token_column = []
    for sent in sentences:
        token_column.append([word for word, _ in sent])

    label_column = []
    for sent in sentences:
        label_column.append([tag for _, tag in sent])

    return Dataset.from_dict({"tokens": token_column, "labels": label_column})


############## Encoding with the tokenizer ##############

def encode_dataset(dataset, tokenizer, label2id):
    """Tokenize every example and align its word-level tags with the sub-tokens.

    Each example is tokenized as pre-split words, with truncation and
    ``"max_length"`` padding. Positions whose ``word_ids()`` entry is ``None``
    get the label ``-100``. Every other position gets the id of the tag of the
    word it belongs to, so all sub-tokens of a word share that word's label.

    Args:
        dataset: Object exposing ``map(function, batched=...)`` whose examples
            have ``"tokens"`` and ``"labels"`` entries.
        tokenizer: Callable tokenizer whose result provides ``word_ids()`` and
            item assignment.
        label2id: Mapping from tag string to integer id.

    Returns:
        Whatever ``dataset.map`` returns for the per-example encoding function.
    """
    def _encode_example(example):
        encoding = tokenizer(
            example["tokens"],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
        )
        encoding["labels"] = [
            -100 if position is None else label2id[example["labels"][position]]
            for position in encoding.word_ids()
        ]
        return encoding

    return dataset.map(_encode_example, batched=False)
    

############# Pipeline ##############

def main(
    conllu_path="../data/UD_French-GSD/UD_French-GSD-master/fr_gsd-ud-train.conllu",
    pretrained_model="camembert-base",
    test_size=0.2,
    seed=None,
):
    """Build the augmented, tokenized train/test dataset for fine-tuning.

    Steps: parse the CoNLL-U file, build the per-tag word pools, create one
    augmented copy of every sentence, convert to a Dataset, encode it with the
    pretrained tokenizer and split it into train and test parts.

    Args:
        conllu_path: Path of the CoNLL-U training file.
        pretrained_model: Name passed to ``AutoTokenizer.from_pretrained``.
        test_size: Proportion (or size) forwarded to ``train_test_split``.
        seed: Optional seed. When given, it seeds the global ``random`` module
            (used by the augmentation) and the train/test shuffle, making the
            run reproducible. ``None`` keeps the run non-deterministic.

    Returns:
        A tuple ``(dataset_dict, label2id, id2label)`` where ``dataset_dict``
        is a ``DatasetDict`` with ``"train"`` and ``"test"`` entries.

    Raises:
        FileNotFoundError: If ``conllu_path`` does not exist (raised when the
            file is opened by ``parse_conllu``).
    """
    if seed is not None:
        random.seed(seed)

    # Loading
    base_corpus = parse_conllu(conllu_path)
    pos_dict = build_pos_dict(base_corpus)

    # Augmentation
    # NOTE(review): augmented copies are added before the split, so variants of
    # the same source sentence can land in both train and test. Consider
    # splitting the base corpus first and augmenting only the training part.
    augmented = augment_corpus(base_corpus, pos_dict, n_augments=1)
    full_data = base_corpus + augmented
    print(f"Corpus total : {len(full_data)} phrases (avec augmentation)")

    # Build the Hugging Face Dataset
    dataset = prepare_dataset(full_data)

    # Build the label <-> id mappings (sorted so ids are stable across runs)
    unique_labels = sorted({label for ex in dataset["labels"] for label in ex})
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    # Tokenizer & encoding
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    encoded = encode_dataset(dataset, tokenizer, label2id)

    # Train/test split: split exactly once. Calling train_test_split twice
    # shuffles independently each time, so the resulting "train" and "test"
    # parts would overlap instead of forming a partition.
    splits = encoded.train_test_split(test_size=test_size, seed=seed)
    dataset_dict = DatasetDict({
        "train": splits["train"],
        "test": splits["test"],
    })

    print("Dataset prêt pour le fine-tuning.")
    return dataset_dict, label2id, id2label

# Entry point: run the whole pipeline only when this code executes as the main module,
# and keep the results in top-level variables for later use.
if __name__ == "__main__":
    dataset_dict, label2id, id2label = main()


FileNotFoundError: [Errno 2] No such file or directory: './data/UD_French-GSD/UD_French-GSD-master/fr_gsd-ud-train.conllu'