# BERT Finetuning for NER

We adapt our model to only one label

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import random
import numpy as np
import evaluate
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader

## V2 

### Main Functions


In [14]:
def import_conll_data(path:str):
    """
    Import conll_data and convert it to dataset format while preserving full labels.
    """
    with open(path, "r", encoding="utf-8") as file:
        conll_data = file.read()

    sentences = conll_data.strip().split('\n\n')

    # Initialize empty lists to store text and labels
    text_list = []
    labels_list = []

    # Process each sentence
    for sentence in sentences:
        tokens = sentence.split('\n')
        text_tokens = [token.split()[0] for token in tokens]
        label_tokens = []

        for token in tokens:
            token_parts = token.split()
            if len(token_parts) > 4:
                label = " ".join(token_parts[-2:])  # Take the last two elements
            else:
                label = token_parts[-1]  # Take the last element
            label_tokens.append(label)

        text_list.append(text_tokens)
        labels_list.append(label_tokens)

    data = {"text": text_list, "labels": labels_list}

    return data

def turn_to_dataset(data:dict):
    """
    It turn dictionnary of conll into Dataset object of the datasets library from Huggingface
    """
    return Dataset.from_dict(data)

def to_numeric_labels(list_ids, new_id):
    """
    Change the ids of a list of ids to new ids.

    Args:
        list_ids (list): List of ids.
        new_id (dict): Dictionary with the new id mappings.
    """
    return [new_id.get(id) for id in list_ids]

def generate_data_format(data, full_dataframe=False, label_list=None, numeric_labels=None, save_to_csv=False, output_path_csv=None):
    """
    This function generates a DataFrame out of the data dictionary/dataset containing the CoNLL information.

    Args:
        data (dict): Data dictionary containing "text" and "labels" keys.
        full_dataframe (bool): If True, add a "numeric labels" column to the DataFrame.
        label_list (list): List of labels.
        numeric_labels (dict): Dictionary with label to numeric mappings.
        save_to_csv (bool): If True, save the DataFrame to a CSV file.
        output_path_csv (str): Output path for the CSV file.
    
    Returns:
        pd.DataFrame: The generated DataFrame.
        dataset : a dataset object of the generated DataFrame
    """
    df = pd.DataFrame(data)
    
    if full_dataframe and label_list is not None and numeric_labels is not None:
        df["numeric labels"] = df["labels"].apply(lambda labels: to_numeric_labels(labels, numeric_labels))

    dataset = turn_to_dataset(df)
    
    if save_to_csv:
        df.to_csv(output_path_csv, index=False)
    
    return df, dataset

def split_dataset(dataset, test_size=0.2, val_size=0.1, random_seed=None):
    """
    Split a dataset into train, test, and validation subsets.

    Args:
        dataset (datasets.Dataset): The dataset object from Hugging Face.
        test_size (float): The proportion of the dataset to include in the test split (default is 0.2).
        val_size (float): The proportion of the dataset to include in the validation split (default is 0.1).
        random_seed (int): The random seed for shuffling (optional).

    Returns:
        datasets.Dataset: Train, test, and validation datasets.
    """
    if random_seed is not None:
        dataset = dataset.shuffle(seed=random_seed)

    total_size = len(dataset)
    test_size = int(total_size * test_size)
    val_size = int(total_size * val_size)
    train_size = total_size - test_size - val_size

    train_data = dataset[:train_size]
    test_data = dataset[train_size:train_size + test_size]
    val_data = dataset[train_size + test_size:]

    return train_data, test_data, val_data

def tokenize_and_align(dataset, label_all_tokens=False):
    tokenized_inputs = tokenizer(dataset["text"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(dataset["numeric labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def clean_labels(word_label_list):
    cleaned_list = []
    for label in word_label_list:
        # Remove "B-" and "I-" prefixes and check for duplicates
        cleaned_label = label.lstrip("BI-")
        if cleaned_label not in cleaned_list:
            cleaned_list.append(cleaned_label)
    return cleaned_list

def generate_unilabel_list(list_of_labels, target_label):
    short_version = []
    label_mapping = {}
    numeric_value = 0
    
    for label in list_of_labels:
        # Split the label into its components
        parts = label.split()
        
        if len(parts) > 1 and parts[-1] == target_label:
            short_version.append(label)
            label_mapping[label] = numeric_value
            numeric_value += 1
        elif len(label) == 1 and target_label in label:
            short_version.append(label)
            label_mapping[label] = numeric_value
            numeric_value += 1

    return short_version, label_mapping

target_label = "CAD"
short_version, label_mapping = generate_unilabel_list(word_label_list, target_label)

print("Short Version:")
print(short_version)
print("\nLabel Mapping:")
print(label_mapping)


Short Version:
['B-AG CAD', 'I-AG CAD', 'B-AI CAD', 'I-AI CAD', 'B-ATOT CAD', 'I-ATOT CAD']

Label Mapping:
{'B-AG CAD': 0, 'I-AG CAD': 1, 'B-AI CAD': 2, 'I-AI CAD': 3, 'B-ATOT CAD': 4, 'I-ATOT CAD': 5}


In [24]:
def split_labels_by_common_part(word_label_list):
    label_groups = {}
    
    for label in word_label_list:
        parts = label.split("-")
        common_part = parts[-1] if len(parts) > 1 else label

        if common_part not in label_groups:
            label_groups[common_part] = [label]
        else:
            label_groups[common_part].append(label)
    
    return label_groups

word_label_list = [
    "O", "B-SYND", "I-SYND", "B-DIR", "I-DIR", "B-DATE", "I-DATE", "B-ENT", "I-ENT", "B-CAD", "I-CAD", "B-INT", "I-INT", "B-OUV", "I-OUV", "B-NCAD", "I-NCAD", "B-NOUV", "I-NOUV", "B-TOUS", "I-TOUS", 
    "B-AG CAD", "I-AG CAD", "B-AG INT", "I-AG INT", "B-AG OUV", "I-AG OUV", "B-NCAD AG", "I-NCAD AG", "B-NOUV AG", "I-NOUV AG", "B-AI CAD", "I-AI CAD", "B-AI INT", "I-AI INT", "B-AI OUV", "I-AI OUV",
    "B-NCAD AI","I-NCAD AI", "B-NOUV AI", "I-NOUV AI", "B-AG", "I-AG", "B-AI", "I-AI", "B-ATOT", "I-ATOT", "B-ATOT CAD", "I-ATOT CAD", "B-ATOT INT", "I-ATOT INT", "B-ATOT OUV", "I-ATOT OUV", 
    "B-NCAD ATOT","I-NCAD ATOT", "B-NOUV ATOT", "I-NOUV ATOT", "B-PPV", "I-PPV", "B-PPVm", "I-PPVm"
]

label_groups = split_labels_by_common_part(word_label_list)

# Print the label groups
for common_part, labels in label_groups.items():
    print(f"Labels containing '{common_part}': {labels}")

Labels containing 'O': ['O']
Labels containing 'SYND': ['B-SYND', 'I-SYND']
Labels containing 'DIR': ['B-DIR', 'I-DIR']
Labels containing 'DATE': ['B-DATE', 'I-DATE']
Labels containing 'ENT': ['B-ENT', 'I-ENT']
Labels containing 'CAD': ['B-CAD', 'I-CAD']
Labels containing 'INT': ['B-INT', 'I-INT']
Labels containing 'OUV': ['B-OUV', 'I-OUV']
Labels containing 'NCAD': ['B-NCAD', 'I-NCAD']
Labels containing 'NOUV': ['B-NOUV', 'I-NOUV']
Labels containing 'TOUS': ['B-TOUS', 'I-TOUS']
Labels containing 'AG CAD': ['B-AG CAD', 'I-AG CAD']
Labels containing 'AG INT': ['B-AG INT', 'I-AG INT']
Labels containing 'AG OUV': ['B-AG OUV', 'I-AG OUV']
Labels containing 'NCAD AG': ['B-NCAD AG', 'I-NCAD AG']
Labels containing 'NOUV AG': ['B-NOUV AG', 'I-NOUV AG']
Labels containing 'AI CAD': ['B-AI CAD', 'I-AI CAD']
Labels containing 'AI INT': ['B-AI INT', 'I-AI INT']
Labels containing 'AI OUV': ['B-AI OUV', 'I-AI OUV']
Labels containing 'NCAD AI': ['B-NCAD AI', 'I-NCAD AI']
Labels containing 'NOUV AI': [

In [27]:
def split_labels_by_common_part(word_label_list):
    label_groups = {}
    numeric_label_mappings = {}
    
    for label in word_label_list:
        parts = label.split("-")
        common_part = parts[-1] if len(parts) > 1 else label

        if common_part not in label_groups:
            label_groups[common_part] = ["O"]
            numeric_label_mappings[common_part] = {"O": 0}
            label_groups[common_part].append(label)
            numeric_label_mappings[common_part][label] = 1
        else:
            label_groups[common_part].append(label)
            if label not in numeric_label_mappings[common_part]:
                numeric_label_mappings[common_part][label] = len(numeric_label_mappings[common_part])
    
    return label_groups, numeric_label_mappings

word_label_list = [
    "O", "B-SYND", "I-SYND", "B-DIR", "I-DIR", "B-DATE", "I-DATE", "B-ENT", "I-ENT", "B-CAD", "I-CAD", "B-INT", "I-INT", "B-OUV", "I-OUV", "B-NCAD", "I-NCAD", "B-NOUV", "I-NOUV", "B-TOUS", "I-TOUS", 
    "B-AG CAD", "I-AG CAD", "B-AG INT", "I-AG INT", "B-AG OUV", "I-AG OUV", "B-NCAD AG", "I-NCAD AG", "B-NOUV AG", "I-NOUV AG", "B-AI CAD", "I-AI CAD", "B-AI INT", "I-AI INT", "B-AI OUV", "I-AI OUV",
    "B-NCAD AI","I-NCAD AI", "B-NOUV AI", "I-NOUV AI", "B-AG", "I-AG", "B-AI", "I-AI", "B-ATOT", "I-ATOT", "B-ATOT CAD", "I-ATOT CAD", "B-ATOT INT", "I-ATOT INT", "B-ATOT OUV", "I-ATOT OUV", 
    "B-NCAD ATOT","I-NCAD ATOT", "B-NOUV ATOT", "I-NOUV ATOT", "B-PPV", "I-PPV", "B-PPVm", "I-PPVm"
]

label_groups, numeric_label_mappings = split_labels_by_common_part(word_label_list)

# Print the label groups and numeric mappings
for common_part, labels in label_groups.items():
    print(f"Labels containing '{common_part}': {labels}")
    print(f"Numeric Label Mapping for '{common_part}': {numeric_label_mappings[common_part]}")


Labels containing 'O': ['O', 'O']
Numeric Label Mapping for 'O': {'O': 1}
Labels containing 'SYND': ['O', 'B-SYND', 'I-SYND']
Numeric Label Mapping for 'SYND': {'O': 0, 'B-SYND': 1, 'I-SYND': 2}
Labels containing 'DIR': ['O', 'B-DIR', 'I-DIR']
Numeric Label Mapping for 'DIR': {'O': 0, 'B-DIR': 1, 'I-DIR': 2}
Labels containing 'DATE': ['O', 'B-DATE', 'I-DATE']
Numeric Label Mapping for 'DATE': {'O': 0, 'B-DATE': 1, 'I-DATE': 2}
Labels containing 'ENT': ['O', 'B-ENT', 'I-ENT']
Numeric Label Mapping for 'ENT': {'O': 0, 'B-ENT': 1, 'I-ENT': 2}
Labels containing 'CAD': ['O', 'B-CAD', 'I-CAD']
Numeric Label Mapping for 'CAD': {'O': 0, 'B-CAD': 1, 'I-CAD': 2}
Labels containing 'INT': ['O', 'B-INT', 'I-INT']
Numeric Label Mapping for 'INT': {'O': 0, 'B-INT': 1, 'I-INT': 2}
Labels containing 'OUV': ['O', 'B-OUV', 'I-OUV']
Numeric Label Mapping for 'OUV': {'O': 0, 'B-OUV': 1, 'I-OUV': 2}
Labels containing 'NCAD': ['O', 'B-NCAD', 'I-NCAD']
Numeric Label Mapping for 'NCAD': {'O': 0, 'B-NCAD': 1, '

In [18]:
cleaned_labels = ['O', 'SYND', 'DIR', 'DATE', 'ENT', 'CAD', 'NT', 'OUV', 'NCAD', 'NOUV', 'TOUS', 'AG CAD', 'AG INT', 'AG OUV', 'NCAD AG', 'NOUV AG', 'AI CAD', 'AI INT', 'AI OUV',
 'NCAD AI', 'NOUV AI', 'AG', 'AI', 'ATOT', 'ATOT CAD', 'ATOT INT', 'ATOT OUV', 'NCAD ATOT', 'NOUV ATOT', 'PPV', 'PPVm']

### Data pre-treatment

In [3]:
# We load the data
data = import_conll_data(r"../data/raw/data449.conll")

# We turn them into a dataset object (from datasets library of Huggingface)
dataset = turn_to_dataset(data)

# we define a numeric and word list of labels
# word_label_list
non_ordered_word_label_set = set(label for labels in dataset["labels"] for label in labels)
non_ordered_word_label_list = list(non_ordered_word_label_set)
word_label_list = [
    "O", "B-SYND", "I-SYND", "B-DIR", "I-DIR", "B-DATE", "I-DATE", "B-ENT", "I-ENT", "B-CAD", "I-CAD", "B-INT", "I-INT", "B-OUV", "I-OUV", "B-NCAD", "I-NCAD", "B-NOUV", "I-NOUV", "B-TOUS", "I-TOUS", 
    "B-AG CAD", "I-AG CAD", "B-AG INT", "I-AG INT", "B-AG OUV", "I-AG OUV", "B-NCAD AG", "I-NCAD AG", "B-NOUV AG", "I-NOUV AG", "B-AI CAD", "I-AI CAD", "B-AI INT", "I-AI INT", "B-AI OUV", "I-AI OUV",
    "B-NCAD AI","I-NCAD AI", "B-NOUV AI", "I-NOUV AI", "B-AG", "I-AG", "B-AI", "I-AI", "B-ATOT", "I-ATOT", "B-ATOT CAD", "I-ATOT CAD", "B-ATOT INT", "I-ATOT INT", "B-ATOT OUV", "I-ATOT OUV", 
    "B-NCAD ATOT","I-NCAD ATOT", "B-NOUV ATOT", "I-NOUV ATOT", "B-PPV", "I-PPV", "B-PPVm", "I-PPVm"
] # I manually ordered them, note that 'ATOT OUV' level seems to be missing in comparison with the 

# numeric_label_list
label_to_numeric = {
    "O": 0, "B-SYND": 1, "I-SYND": 2, "B-DIR": 3, "I-DIR": 4, "B-DATE": 5, "I-DATE": 6, "B-ENT": 7, "I-ENT": 8, "B-CAD": 9, "I-CAD": 10, "B-INT": 11, "I-INT": 12, "B-OUV": 13, 
    "I-OUV": 14, "B-NCAD": 15, "I-NCAD": 16, "B-NOUV": 17, "I-NOUV": 18, "B-TOUS": 19, "I-TOUS": 20, "B-AG CAD": 21, "I-AG CAD": 22, "B-AG INT": 23, "I-AG INT": 24, "B-AG OUV": 25, "I-AG OUV": 26,
    "B-NCAD AG": 27, "I-NCAD AG": 28, "B-NOUV AG": 29, "I-NOUV AG": 30, "B-AI CAD": 31, "I-AI CAD": 32, "B-AI INT": 33, "I-AI INT": 34, "B-AI OUV": 35, "I-AI OUV": 36, "B-NCAD AI": 37,
    "I-NCAD AI": 38, "B-NOUV AI": 39, "I-NOUV AI": 40, "B-AG": 41, "I-AG": 42, "B-AI": 43, "I-AI": 44, "B-ATOT": 45, "I-ATOT": 46, "B-ATOT CAD": 47, "I-ATOT CAD": 48, "B-ATOT INT": 49,
    "I-ATOT INT": 50, "B-ATOT OUV": 51, "I-ATOT OUV": 52, "B-NCAD ATOT": 53, "I-NCAD ATOT": 54, "B-NOUV ATOT": 55, "I-NOUV ATOT": 56, "B-PPV": 57, "I-PPV": 58, "B-PPVm": 59, "I-PPVm": 60
}

numeric_label_list =[label_to_numeric[label] for label in word_label_list]

### Model Choice

In [4]:
model_checkpoint = "Jean-Baptiste/camembert-ner"
batch_size = 8

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(word_label_list), ignore_mismatched_sizes=True) #this last argument might be a mistake

# import data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='pt')

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([61]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([61, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Final data formating

In [5]:
# We generate the data at the format we are interested in
output_path_csv = r"../data/processed/data449_tokens_labels.csv"
df, dataset = generate_data_format(data, full_dataframe=True, label_list=word_label_list, numeric_labels=label_to_numeric, save_to_csv=True, output_path_csv=output_path_csv)

# We split the data into three subsets for the training
train_data, test_data, val_data = split_dataset(dataset,random_seed=42)

# turn them to dataset object
train_dataset = turn_to_dataset(train_data)
test_dataset = turn_to_dataset(test_data)
val_dataset = turn_to_dataset(val_data)

# We tokenize and align the labels of this sub-datasets
tokenized_train = tokenize_and_align(train_dataset, label_all_tokens=True)
tokenized_test = tokenize_and_align(test_dataset, label_all_tokens=True)
tokenized_val = tokenize_and_align(val_dataset, label_all_tokens=True)

# We turn them back to the dataset format as they were dictionnaries
tokenized_train = turn_to_dataset(tokenized_train)
tokenized_test = turn_to_dataset(tokenized_test)
tokenized_val = turn_to_dataset(tokenized_val)

### Data batching

In [6]:
# Dataloader
train_dataloader = DataLoader(
    tokenized_train,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

test_dataloader = DataLoader(
    tokenized_test,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

val_dataloader = DataLoader(
    tokenized_val,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

### Training Arguments

In [7]:
# training arguments
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"../models/{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

metric = evaluate.load("seqeval")

# We want to do a metrics function to compute the accuracy of the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    print(predictions,labels)
    # Remove ignored index (special tokens)
    true_predictions = [
        [word_label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [word_label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    print(true_predictions,true_labels)
    results = metric.compute(predictions=true_predictions, references=true_labels)
    print(results)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Training

In [8]:
trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.248103,0.0,0.0,0.0,0.931675
2,No log,1.70228,0.0,0.0,0.0,0.931675
3,No log,1.616788,0.0,0.0,0.0,0.931675


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[-100    0    0 ... -100 -100 -100]
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 ...
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

  _warn_prf(average, modifier, msg_start, len(result))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[-100    0    0 ... -100 -100 -100]
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 ...
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

  _warn_prf(average, modifier, msg_start, len(result))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[-100    0    0 ... -100 -100 -100]
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 ...
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

TrainOutput(global_step=120, training_loss=2.3353373209635415, metrics={'train_runtime': 159.0751, 'train_samples_per_second': 5.959, 'train_steps_per_second': 0.754, 'total_flos': 247774055848776.0, 'train_loss': 2.3353373209635415, 'epoch': 3.0})

## V1

### Pretraining

In [None]:
# Load the tokenizer and model
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner") #this last argument might be a mistake

In [None]:
# Check if a GPU is available and set the device accordingly
device = torch.device("cpu")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Empty the GPU cache memory
# torch.cuda.empty_cache()

# This will print the device (either 'cuda' or 'cpu')
# print("Using device:", device)

# You can then move your model and data to this device like this:
# model.to(device)

In [None]:
#Set the number of labels
model.classifier=nn.Linear(in_features=768, out_features=2, bias=True)
model.classifier

In [None]:
def charge_conll(path:str):
    # Initialize a list to store the data
    conll_data = []

    # Open the CoNLL file in read mode
    with open(path, 'r', encoding='utf-8') as file:
        conll_data = file.read()
        # Read each line in the file

    # Split the data into sentences
    sentences = conll_data.strip().split('\n\n')

    # Initialize empty lists to store text and labels
    text_list = []
    labels_list = []

    # Process each sentence
    for sentence in sentences:
        tokens = sentence.split('\n')
        text = " ".join(token.split()[0] for token in tokens)
        labels = " ".join(token.split()[-1] for token in tokens)
        text_list.append(text)
        labels_list.append(labels)

    # Create a dataframe
    df = pd.DataFrame({'text': text_list, 'label': labels_list})

    return df

#we divide the data into train, test, and validation sets

def split_data(data_dict, train_size=0.8, test_size=0.1, val_size=0.1, random_seed=None):
    # Set a random seed for reproducibility
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    # Combine input_ids, attention_mask, and aligned_labels into a single list
    combined_data = list(zip(data_dict['input_ids'], data_dict['attention_mask'], data_dict['labels']))

    # Shuffle the data
    random.shuffle(combined_data)

    # Calculate the sizes of each set
    total_size = len(combined_data)
    train_size = int(train_size * total_size)
    test_size = int(test_size * total_size)
    val_size = int(val_size * total_size)

    # Split the data into train, test, and val sets
    train_data = combined_data[:train_size]
    test_data = combined_data[train_size:train_size + test_size]
    val_data = combined_data[train_size + test_size:train_size + test_size + val_size]

    # Unzip the data to restore the original structure
    train_input_ids, train_attention_mask, train_aligned_labels = zip(*train_data)
    test_input_ids, test_attention_mask, test_aligned_labels = zip(*test_data)
    val_input_ids, val_attention_mask, val_aligned_labels = zip(*val_data)

    # Create dictionaries for the train, test, and val sets
    train_set = {
        'input_ids': list(train_input_ids),
        'attention_mask': list(train_attention_mask),
        'labels': list(train_aligned_labels)
    }
    test_set = {
        'input_ids': list(test_input_ids),
        'attention_mask': list(test_attention_mask),
        'labels': list(test_aligned_labels)
    }
    val_set = {
        'input_ids': list(val_input_ids),
        'attention_mask': list(val_attention_mask),
        'labels': list(val_aligned_labels)
    }

    return train_set, test_set, val_set

def turn_sentence_to_list(sentence):
    """
    Turn a sentence into a list of words

    Args:
        sentence (str): sentence to tokenize

    Returns:
        list: list of words
    """
    return sentence.split()

def format_text(df):
    formated_text = []

    text = df["text"]
    for i in text:
        i = turn_sentence_to_list(i)
        formated_text.append(i)

    # we add it to the dataframe
    df["formated_text"] = formated_text
    return df

def format_labels(df):
    labels = df["label"].tolist()

    formated_labels = []

    for label in labels:
        # label = list_to_string(label)
        label = turn_sentence_to_list(label)
        formated_labels.append(label)
        # print(label)

    # we add it to the dataframe

    df["formated_labels"] = formated_labels
    return df

def change_ids(list_ids:list, new_id:dict):
    """
    change the ids of a list of ids to a new id

    Args:
        list_ids (list): list of ids
        new_id (dict): dictionary with the new id
    """
    return [new_id.get(id) for id in list_ids]

def align_labels(word_ids:list, tag_list:list):
    """
    Align the labels with the words

    Args:
        word_ids (list): list of ids of the words
        tag_list (list): list of tags
    """
    aligned_labels = []
    for i in word_ids:
        if i is None:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(tag_list[i])
    return aligned_labels

def check_labels(list_labels:list):
    for label in list_labels:
        if type(label)!=int:
            print(label)
            return False
    return True

def count_error(df, column:str):
    """
    Count the number of errors - a cell containing a non numerical value - in a column of a dataframe

    Args:
        dataframe (pd.DataFrame): dataframe to check
        column (str): column to check

    Returns:
        int: number of errors
    """
    return len(df[column])- sum(df[column].apply(lambda x: check_labels(x)))

def create_iids_am(df):
    """
    Create the input ids and attention mask columns
    
    Args:
        df (pd.DataFrame): dataframe containing the data

    Returns:
        pd.DataFrame: dataframe containing the data with the input ids and attention mask columns
    """
    input_ids = []
    attention_mask = []

    for _,i in df.iterrows():
        input = i["ids"]["input_ids"]
        attention = i["ids"]["attention_mask"]

        input_ids.append(input)
        attention_mask.append(attention)

    df["input_ids"] = input_ids
    df["attention_mask"] = attention_mask
    return df

def select_columns(df, columns:list):
    """
    Select columns from a dataframe

    Args:
        df (pd.DataFrame): dataframe
        columns (list): list of columns to select

    """
    df = df[columns].copy()
    return df

def tokenize_and_align_data(path_conll:str, new_id:dict):
    """
    Tokenize the text and align the labels

    Args:
        path_conll (str): path to the conll file
        new_id (dict): dictionary with the new id

    Returns:
        pd.DataFrame: dataframe containing the data
    """
    #charge and create the df
    df = charge_conll(path_conll)
    df = format_text(df)
    df = format_labels(df)
    df["new_labels"]=df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

    #tokenize and align the text
    df["ids"] = df["formated_text"].apply(lambda x: tokenizer(x, truncation=True, is_split_into_words=True))
    df = create_iids_am(df)
    df["word_ids"] = df["formated_text"].apply(lambda x: tokenizer(x, truncation=True, is_split_into_words=True).word_ids())
    df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["new_labels"]), axis=1)
    return df

def apply_tokenization(conll_path:str, new_id:dict, columns:list, new_names:dict, save_path=False, output_save_csv=None):
    """
    This function apply the tokenization and alignment to a conll file and select the columns we want to keep

    Args:
        conll_path (str): path to the conll file
        new_id (dict): dictionary with the new id
        columns (list): list of columns to keep
        new_names (dict): dictionary with the new names of the columns
        save_path (bool, optional): if True, save the dataframe in a csv file. Defaults to False.
        output_save_csv ([type], optional): path to the csv file. Defaults to None.

    Returns:
        pd.DataFrame: dataframe containing the data
    """
    df = tokenize_and_align_data(conll_path, new_id=new_id)
    df = select_columns(df, columns)
    df = df.rename(columns=new_names)
    #we check the length of the two columns so that they are of the same dimensions
    print(f"nombre de 'aligned_labels' faux: {count_error(df, 'aligned_labels')}")
    if save_path:
        df.to_csv(output_save_csv, index=False)
    return df

def final_formating(df, start:int, end:int):
    """
    Collect the data from a dataframe in a list for a given range of sentences and store them into a dictionnary

    Args:
        df (pd.DataFrame): dataframe containing the data
        start (int): index of the first sentence to collect
        end (int): index of the last sentence to collect

    Returns:
        dict: dictionnary containing the data
    """
    input_ids = []
    attention_mask = []
    labels = []

    for i in range(start, end):
        input_ids.append(df["input_ids"][i])
        attention_mask.append(df["attention_mask"][i])
        labels.append(df["aligned_labels"][i])

    data = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

    return data

def download_and_treat_data(conll_path:str, new_id:dict, columns:list, new_names:dict, start:int, end:int, csv=False, output_save=None):
    """
    This function compile all the previous one and treat automatically all data provided as conll format.\n
    The objective is to provide a dictionnary containing the data for a given range of sentences with only one function. \n
    The same results can be obtained by parts with apply_tokenization and final_formating functions. This one is more a convenience function.\n
    It is also possible to obtain a csv file if csv is set to True and a path is provided in output_save.

    Args:
        conll_path (str): path to the conll file
        new_id (dict): dictionnary to change the labels
        columns (list): columns to keep
        new_names (dict): new names of the columns
        start (int): index of the first sentence to collect
        end (int): index of the last sentence to collect
    
    Returns:
        dict: dictionnary containing the data
    """
    df = apply_tokenization(conll_path, new_id=new_id, columns=columns, new_names=new_names)

    if csv:
        df.to_csv(output_save, index=False)

    data = final_formating(df, start, end)
    return data, df

In [None]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}
columns = ["formated_text", "formated_labels","new_labels","word_ids", "input_ids", "attention_mask", "aligned_labels"]
new_names = {"formated_text": "text", "new_labels": "label", "word_ids": "word_ids", "aligned_labels": "aligned_labels"}
conll_path = r"../data/raw/data449.conll"
start = 0
end = 449
output_save = r"../data/intermediate/data449_token.csv"

data, df_final = download_and_treat_data(conll_path=conll_path, new_id=new_id, columns=columns, new_names=new_names, start=start, end=end, csv=True, output_save=output_save)

train_data, test_data, val_data = split_data(data)

# Create a Dataset object
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)
val_dataset = Dataset.from_dict(val_data)

# Create a list of labels
reverse_id = {v: k for k, v in new_id.items()}
second_elements = [value for value in reverse_id.values()]
label_list = second_elements



### Training Arguments

In [None]:
model_checkpoint = "Jean-Baptiste/camembert-ner"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), ignore_mismatched_sizes=True) #this last argument might be a mistake

# import data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='pt')

# Dataloader
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

test_dataloader = DataLoader(
    test_dataset,
    collate_fn=data_collator,
    batch_size=8,
)

val_dataloader = DataLoader(
    val_dataset,
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
batch_size = 4

# training arguments
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

# metric used for evaluation
metric = evaluate.load("seqeval")

# We want to do a metrics function to compute the accuracy of the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    print(predictions,labels)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    print(true_predictions,true_labels)
    results = metric.compute(predictions=true_predictions, references=true_labels)
    print(results)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Training 

In [None]:
trainer.train()