# Finetuning of BERT for NER

This notebook contains all the steps for the finetuning of French version of BERT, the objective is to extract specific information from french public business agreements. For this we have a dual approach, first we classify information and then only we use NER property of our model to extract wage variations.

Sources: This notebook is inspired from a masterclass and notebook provided by [Thomas Boehler](https://fr.linkedin.com/in/thomas-boehler-ba34a744) and also the work of my colleagues [Mouad Bernoussi](https://ma.linkedin.com/in/mouad-bernoussi-00aa91242) (see ./notebooks/external) and [Conrad Thiounn](https://github.com/cthiounn).

Try

1. Play with PyTorch
2. Play with the model 
3. Adapt data to BERT format (see Thomas Boehler)
4. Create training

In [172]:
import torch
import torch.nn as nn
import pandas as pd
import json
import sys
sys.path.append("../../src/")
from spacy_utils import import_label_studio_data

## try with conll

In [97]:
def charge_conll(path:str):
    # Initialize a list to store the data
    conll_data = []

    # Open the CoNLL file in read mode
    with open(path, 'r', encoding='utf-8') as file:
        conll_data = file.read()
        # Read each line in the file

    # Split the data into sentences
    sentences = conll_data.strip().split('\n\n')

    # Initialize empty lists to store text and labels
    text_list = []
    labels_list = []

    # Process each sentence
    for sentence in sentences:
        tokens = sentence.split('\n')
        text = " ".join(token.split()[0] for token in tokens)
        labels = " ".join(token.split()[-1] for token in tokens)
        text_list.append(text)
        labels_list.append(labels)

    # Create a dataframe
    df = pd.DataFrame({'text': text_list, 'label': labels_list})

    return df

In [143]:
# Specify the file path to your CoNLL file
conll_file_path = r'../../data/raw/data449.conll'  # Replace with your file path

# Initialize a list to store the data
conll_data = []

# Open the CoNLL file in read mode
with open(conll_file_path, 'r', encoding='utf-8') as file:
    conll_data = file.read()
    # Read each line in the file

# Split the data into sentences
sentences = conll_data.strip().split('\n\n')

# Initialize empty lists to store text and labels
text_list = []
labels_list = []

# Process each sentence
for sentence in sentences:
    tokens = sentence.split('\n')
    text = " ".join(token.split()[0] for token in tokens)
    labels = " ".join(token.split()[-1] for token in tokens)
    text_list.append(text)
    labels_list.append(labels)

# Create a dataframe
df = pd.DataFrame({'text': text_list, 'label': labels_list})

# Print the dataframe
print(df)

                                                  text  \
0    -DOCSTART- evolution des salaires de base : en...   
1    l’enveloppe globale d’augmentation des rémunér...   
2    dispositions au regard de l’implication de tou...   
3    nous travaillons sur une politique de rémunéra...   
4    protocole d’accord négociation annuelle obliga...   
..                                                 ...   
444  negociation annuelle 2022. il a été convenu et...   
445  négociations annuelles obligatoires. ii- dispo...   
446  accord collectif 2022 sur les salaires , la du...   
447  damart sa etablissement. article i : augmentat...   
448  entre l’ues kiabi , représentée par , directeu...   

                                                 label  
0    O O O O O O O O O O O O O O O O O O O O O O O ...  
1    O O O O O O O O O O O O O O O O O O O O O O O ...  
2    O O O O O O O O O O O O O O O O B-SYND O B-DIR...  
3    O O O O O O O O O O O O O O O O O O O O O O O ...  
4    O O O O O O O

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

In [149]:
def turn_sentence_to_list(sentence):
    """
    Turn a sentence into a list of tokens

    Args:
        sentence (str): sentence to tokenize

    Returns:
        list: list of tokens
    """
    return [token for token in sentence.split(" ")]

# def turn_sentence_to_list(sentence):
#     """
#     Turn a sentence into a list of words

#     Args:
#         sentence (str): sentence to tokenize

#     Returns:
#         list: list of words
#     """
#     return sentence.split()


def list_to_string(input_list, separator=' '):
    """
    Convert a list of strings into a single string with elements separated by a specified separator.

    :param input_list: List of strings to be converted.
    :param separator: The separator to use between elements (default is a space).
    :return: A single string containing the list elements.
    """
    return separator.join(input_list)

def change_ids(list_ids:list, new_id:dict):
    return [new_id.get(id) for id in list_ids]

In [73]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}

df["new_labels"]=df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

In [75]:
print(df["new_labels"][5])
print(df["label"][5])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [96]:
# we check that we haven't left non-numeric labels


def check_labels(list_labels:list):
    for label in list_labels:
        if type(label)!=int:
            print(label)
            return False
    return True

# amount = [df["new_labels"].apply(lambda x: check_labels(x))]

def count_error(dataframe, column:str):
    """
    Count the number of errors - a cell containing a non numerical value - in a column of a dataframe

    Args:
        dataframe (pd.DataFrame): dataframe to check
        column (str): column to check

    Returns:
        int: number of errors
    """
    return len(df[column])- sum(dataframe[column].apply(lambda x: check_labels(x)))

print(f"nombre d'erreurs: {count_error(df, 'new_labels')}")


nombre d'erreurs: 0


In [6]:
texts = df["text"].tolist()
start = 0

for text in texts: 
    text = turn_sentence_to_list(text)
    tokens = tokenizer(text, is_split_into_words=True)
    tokens_mod = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
    length = len(tokens_mod)
    if length > 512:
        start += 1
        print(length)
    else:
        pass

print(start)

AttributeError: 'list' object has no attribute 'split'

JB NER is probably not suited since we lost 182 texts, which is too much, due to context limit. We can eventually try to split the text into 512 tokens chunks. 

In [7]:
import pandas as pd
from collections import defaultdict

# Sample data in CoNLL format
conll_data = r'../../data/raw/data449.conll'

with open(conll_data, 'r', encoding='utf-8') as file:
    conll_data = file.read()

# Split the data into sentences
sentences = conll_data.strip().split('\n\n')

# Initialize empty lists and a variable to keep track of the current sentence length
text_list = []
labels_list = []
current_length = 0

# Define the maximum token length for a chunk
max_chunk_length = 512

# Initialize a dictionary to keep track of chunks
chunked_data = defaultdict(list)

# Process each sentence
for sentence in sentences:
    tokens = sentence.split('\n')
    text = " ".join(token.split()[0] for token in tokens)
    labels = " ".join(token.split()[-1] for token in tokens)
    
    # Check if adding this sentence will exceed the maximum chunk length
    if current_length + len(tokens) <= max_chunk_length:
        # Add this sentence to the current chunk
        text_list.append(text)
        labels_list.append(labels)
        current_length += len(tokens)
    else:
        # Start a new chunk
        chunked_data['text'].append(text_list)
        chunked_data['label'].append(labels_list)
        
        # Reset the current chunk
        text_list = [text]
        labels_list = [labels]
        current_length = len(tokens)

# Add the last chunk
chunked_data['text'].append(text_list)
chunked_data['label'].append(labels_list)

# Create a dataframe from the chunked data
df = pd.DataFrame(chunked_data)

# Print the dataframe
print(df)


                                                  text  \
0    [-DOCSTART- evolution des salaires de base : e...   
1    [l’enveloppe globale d’augmentation des rémuné...   
2    [dispositions au regard de l’implication de to...   
3    [nous travaillons sur une politique de rémunér...   
4    [protocole d’accord négociation annuelle oblig...   
..                                                 ...   
404  [negociation annuelle 2022. article 1 – mesure...   
405  [négociations annuelles obligatoires. ii- disp...   
406  [accord collectif 2022 sur les salaires , la d...   
407  [damart sa etablissement. article i : augmenta...   
408  [entre l’ues kiabi , représentée par , directe...   

                                                 label  
0    [O O O O O O O O O O O O O O O O O O O O O O O...  
1    [O O O O O O O O O O O O O O O O O O O O O O O...  
2    [O O O O O O O O O O O O O O O O B-SYND O B-DI...  
3    [O O O O O O O O O O O O O O O O O O O O O O O...  
4    [O O O O O O 

In [155]:
# Example usage:
def format_text(df):
    formated_text = []

    text = df["text"]
    for i in text:
        i = turn_sentence_to_list(i)
        formated_text.append(i)

    # we add it to the dataframe
    df["formated_text"] = formated_text
    return df

formated_text = []

# text = df["text"]
# for i in text:
#     i = turn_sentence_to_list(i)
#     formated_text.append(i)

# # we add it to the dataframe
# df["formated_text"] = formated_text
# df = format_text(df)


now we format the label to the right format

In [156]:
def format_labels(df):
    labels = df["label"].tolist()

    formated_labels = []

    for label in labels:
        # label = list_to_string(label)
        label = turn_sentence_to_list(label)
        formated_labels.append(label)
        # print(label)

    # we add it to the dataframe

    df["formated_labels"] = formated_labels
    return df

In [178]:
def select_columns(df, columns:list):
    df = df[columns].copy()
    return df

df = select_columns(df, columns=["formated_text", "formated_labels"])

# df = df.rename(columns={"formated_text": "text", "formated_labels": "label"})



In [11]:
# We check the length of the two columns so that they are of the same dimensions

start = 0

for i in range(len(df["text"])):
    if len(df['text'][i]) != len(df['label'][i]):
        start += 1
    else:
        pass

print("les éléments dont la dimension diverge sont au nombre de",start)

les éléments dont la dimension diverge sont au nombre de 0


In [172]:
df.head()

Unnamed: 0,text,label
0,"[-DOCSTART-, evolution, des, salaires, de, bas...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[dispositions, au, regard, de, l’implication, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[nous, travaillons, sur, une, politique, de, r...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[protocole, d’accord, négociation, annuelle, o...","[O, O, O, O, O, O, O, O, O, B-DIR, O, O, O, O,..."


We start tokenizing the text

In [18]:
type(df["text"][0][0])

str

In [19]:
tokenized_input = tokenizer(df["text"][0], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokenized_input)

{'input_ids': [5, 67, 342, 5363, 7486, 11485, 26, 599, 14573, 20, 8058, 8, 494, 43, 10182, 11558, 43, 51, 30, 13646, 128, 19, 1503, 27, 17, 12, 16886, 11558, 5670, 15, 17, 12, 2010, 20, 8058, 8, 494, 20, 4891, 7826, 68, 3306, 25, 1337, 21, 7, 14, 22745, 37, 1044, 19, 214, 643, 10121, 10, 18, 12, 28806, 15, 28, 3708, 21, 7, 5826, 55, 205, 7, 929, 453, 8, 13, 2269, 16054, 9, 8, 82, 21, 7, 19, 17218, 16850, 10, 22, 8001, 25, 4360, 8, 494, 3138, 263, 163, 22, 93, 125, 27, 16, 4360, 20, 4891, 5552, 9, 24, 3232, 21, 7, 4953, 15, 13, 462, 16054, 18, 12, 35, 1187, 17, 5337, 537, 642, 35, 21, 7, 16, 20692, 297, 759, 28, 1141, 1627, 15, 17, 12, 2010, 25, 4360, 8, 494, 8, 58, 18010, 29, 58, 6230, 21, 7, 22, 3184, 22, 8590, 43, 77, 2283, 32, 17, 12, 520, 77, 2283, 29, 13, 1083, 1363, 13, 2283, 20, 4412, 8733, 36, 93, 359, 8, 1337, 16, 459, 128, 77, 1266, 29, 17, 12, 4515, 8, 4360, 14, 77, 2283, 29, 13, 1083, 38, 951, 169, 350, 8, 325, 3009, 21, 7, 23, 18010, 45, 7826, 34, 68, 3306, 38, 165, 16918,

In [21]:
token_input = tokenizer(df['text'][0], is_split_into_words=True)
print(token_input)

tokens = tokenizer.convert_ids_to_tokens(token_input["input_ids"])
print(tokens)

{'input_ids': [5, 67, 342, 5363, 7486, 11485, 26, 599, 14573, 20, 8058, 8, 494, 43, 10182, 11558, 43, 51, 30, 13646, 128, 19, 1503, 27, 17, 12, 16886, 11558, 5670, 15, 17, 12, 2010, 20, 8058, 8, 494, 20, 4891, 7826, 68, 3306, 25, 1337, 21, 7, 14, 22745, 37, 1044, 19, 214, 643, 10121, 10, 18, 12, 28806, 15, 28, 3708, 21, 7, 5826, 55, 205, 7, 929, 453, 8, 13, 2269, 16054, 9, 8, 82, 21, 7, 19, 17218, 16850, 10, 22, 8001, 25, 4360, 8, 494, 3138, 263, 163, 22, 93, 125, 27, 16, 4360, 20, 4891, 5552, 9, 24, 3232, 21, 7, 4953, 15, 13, 462, 16054, 18, 12, 35, 1187, 17, 5337, 537, 642, 35, 21, 7, 16, 20692, 297, 759, 28, 1141, 1627, 15, 17, 12, 2010, 25, 4360, 8, 494, 8, 58, 18010, 29, 58, 6230, 21, 7, 22, 3184, 22, 8590, 43, 77, 2283, 32, 17, 12, 520, 77, 2283, 29, 13, 1083, 1363, 13, 2283, 20, 4412, 8733, 36, 93, 359, 8, 1337, 16, 459, 128, 77, 1266, 29, 17, 12, 4515, 8, 4360, 14, 77, 2283, 29, 13, 1083, 38, 951, 169, 350, 8, 325, 3009, 21, 7, 23, 18010, 45, 7826, 34, 68, 3306, 38, 165, 16918,

In [215]:
word_ids = token_input.word_ids()
print(word_ids)

def align_labels(word_ids:list, tag_list:list):
    aligned_labels = []
    for i in word_ids:
        if i is None:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(tag_list[i])
    return aligned_labels

aligned_labels = align_labels(word_ids, tag_list=df["new_labels"][0])
# aligned_labels = create_aligned_labels(word_ids, example=df)

print(aligned_labels)
print(len(aligned_labels))
print(len(word_ids))

[None, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 17, 18, 19, 20, 21, 21, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 42, 42, 42, 43, 44, 45, 46, 46, 47, 47, 48, 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 56, 57, 58, 59, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 79, 80, 81, 82, 83, 84, 85, 85, 85, 85, 86, 86, 87, 88, 88, 89, 89, 90, 91, 91, 92, 93, 94, 95, 96, 97, 97, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 117, 117, 118, 119, 120, 121, 122, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 139, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 149, 150, 151, 152, 152, 153, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 163, 164, 165, 166, 167, 168, 168, 168, 169, 170, 171, 172, 173, 174, 174, 174, 175, 176, 177, 178, 179, 180, 18

In [191]:
df["word_ids"] = df["text"].apply(lambda x: tokenizer(x, is_split_into_words=True).word_ids())
df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["label"]), axis=1)

In [46]:
list(zip(tokens, aligned_labels))

[('<s>', -100),
 ('▁-', 'O'),
 ('D', 'O'),
 ('OC', 'O'),
 ('ST', 'O'),
 ('ART', 'O'),
 ('-', 'O'),
 ('▁e', 'O'),
 ('volution', 'O'),
 ('▁des', 'O'),
 ('▁salaires', 'O'),
 ('▁de', 'O'),
 ('▁base', 'O'),
 ('▁:', 'O'),
 ('▁enveloppe', 'O'),
 ('▁budgétaire', 'O'),
 ('▁:', 'O'),
 ('▁il', 'O'),
 ('▁est', 'O'),
 ('▁convenu', 'O'),
 ('▁entre', 'O'),
 ('▁les', 'O'),
 ('▁parties', 'O'),
 ('▁que', 'O'),
 ('▁l', 'O'),
 ('’', 'O'),
 ('enveloppe', 'O'),
 ('▁budgétaire', 'O'),
 ('▁consacrée', 'O'),
 ('▁à', 'O'),
 ('▁l', 'O'),
 ('’', 'O'),
 ('évolution', 'O'),
 ('▁des', 'O'),
 ('▁salaires', 'O'),
 ('▁de', 'O'),
 ('▁base', 'O'),
 ('▁des', 'B-TOUS'),
 ('▁collaborateurs', 'I-TOUS'),
 ('▁répondant', 'O'),
 ('▁aux', 'O'),
 ('▁attentes', 'O'),
 ('▁du', 'O'),
 ('▁poste', 'O'),
 ('▁', 'O'),
 (',', 'O'),
 ('▁et', 'O'),
 ('▁remplissant', 'O'),
 ('▁par', 'O'),
 ('▁ailleurs', 'O'),
 ('▁les', 'O'),
 ('▁autres', 'O'),
 ('▁conditions', 'O'),
 ('▁habituelle', 'O'),
 ('s', 'O'),
 ('▁d', 'O'),
 ('’', 'O'),
 ('éligibili

In [223]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}
columns = ["formated_text", "new_labels", "word_ids", "aligned_labels"]
new_names = {"formated_text": "text", "new_labels": "label", "word_ids": "word_ids", "aligned_labels": "aligned_labels"}

def tokenize_and_align_data(path_conll:str, new_id:dict):
    #charge and create the df
    df = charge_conll(path_conll)
    df = format_text(df)
    df = format_labels(df)
    df["new_labels"]=df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

    #tokenize and align the text
    df["word_ids"] = df["formated_text"].apply(lambda x: tokenizer(x, is_split_into_words=True).word_ids())
    df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["new_labels"]), axis=1)

    return df

df = tokenize_and_align_data(r"..\..\data\raw\data449.conll", new_id=new_id)
# df = select_and_rename_data(df, selected_columns=["formated_text", "new_labels", "word_ids", "aligned_labels"], new_names=new_names)
df = select_columns(df, columns)
df = df.rename(columns=new_names)
#we check the length of the two columns so that they are of the same dimensions
print(f"nombre de 'aligned_labels' faux: {count_error(df, 'aligned_labels')}")
df

nombre de 'aligned_labels' faux: 0


Unnamed: 0,text,label,word_ids,aligned_labels
0,"[-DOCSTART-, evolution, des, salaires, de, bas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[dispositions, au, regard, de, l’implication, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 3, 4, 4, 4, 5, 6, 7, 8, 9, 10,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[nous, travaillons, sur, une, politique, de, r...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[protocole, d’accord, négociation, annuelle, o...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, ...","[None, 0, 1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ..."
...,...,...,...,...
444,"[negociation, annuelle, 2022., il, a, été, con...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 7, 8, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
445,"[négociations, annuelles, obligatoires., ii-, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 2, 3, 3, 3, 4, 5, 5, 6, 7, 8, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
446,"[accord, collectif, 2022, sur, les, salaires, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
447,"[damart, sa, etablissement., article, i, :, au...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 1, 2, 2, 2, 3, 4, 5, 6, 6, 7, 7, ...","[-100, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [207]:
print(df["aligned_labels"][0])
df_test = df["aligned_labels"].apply(lambda x :change_ids(x, new_id=new_id))
print(df_test[0])

[-100, 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TOUS', 'I-TOUS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ATOT', 'B-ATOT', 'B-ATOT', 'I-ATOT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ENT', 'B-ENT', 'B-ENT', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [30]:
example = df.iloc[0]

In [34]:
tokenized_input = tokenizer(example["text"], is_split_into_words=True)
aligned_labels = [-100 if i is None else example[f"label"][i] for i in word_ids]
print(example["label"])
print(len(word_ids))
print(len(aligned_labels))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TOUS', 'I-TOUS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ATOT', 'I-ATOT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [45]:
list(zip(tokens_mod, aligned_labels))

NameError: name 'tokens_mod' is not defined

## others

Load a dataset for training from Huggingface

In [246]:
# If you like it you can download it
data = import_label_studio_data("../../data/raw/data449.json")

df = pd.DataFrame(data, columns = ['text', 'label'])

In [None]:
df.head()

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

In [243]:
def turn_sentence_to_list(sentence):
    """
    Turn a sentence into a list of tokens

    Args:
        sentence (str): sentence to tokenize

    Returns:
        list: list of tokens
    """
    return [token for token in sentence.split(" ")]

In [247]:
for i in range(len(df["text"])):
    df["text"][i] = turn_sentence_to_list(df["text"][i])

df.head()

Unnamed: 0,text,label
0,"[evolution, des, salaires, de, base, :, envelo...","{'entities': [(322, 326, 'ATOT'), (161, 179, '..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","{'entities': [(229, 237, 'OUV'), (239, 247, 'O..."
2,"[dispositions, au, regard, de, l’implication, ...","{'entities': [(101, 105, 'SYND'), (110, 122, '..."
3,"[nous, travaillons, sur, une, politique, de, r...","{'entities': [(165, 172, 'SYND'), (364, 371, '..."
4,"[protocole, d’accord, négociation, annuelle, o...","{'entities': [(73, 82, 'DIR'), (108, 119, 'ENT..."


In [None]:
length = []

for i in range(len(df["text"])):
    element = df["text"][i]
    length.append(len(element))
    # print(len(df["text"][i])) 

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors

plt.hist(length, bins=20, color = "lightgreen", edgecolor='black') 

# Add labels and title
plt.xlabel('Longeur (élément)')
plt.ylabel('Fréquence')
plt.title('Histogram de la longueurs des phrases en élément')

# Show the histogram
plt.show()


In [None]:
import pandas as pd

# Sample DataFrame
data = {
    'text': ["evolution des salaires de base : enveloppe"],
    'label': [{'entities': [(22, 31, 'ATOT')]}]
}

df = pd.DataFrame(data)

# Function to tokenize text while preserving whitespace
def tokenize_text(row):
    text = row['text']
    tokens = []
    start = 0

    for start, end, entity in row['label']['entities']:
        # Add non-entity text
        tokens.extend(text[start:end].split())
        start = end

    # Add any remaining text after the last entity
    tokens.extend(text[start:].split())
    
    return tokens

# Apply the function to create a new column
df['tokenized_text'] = df.apply(tokenize_text, axis=1)

# Display the DataFrame with tokenized text while preserving whitespace
print(df)


In [None]:
for i in range(len(df["label"])):
    print(df["label"][i]["entities"])
    


In [None]:
text = df['text'][0]
text = turn_sentence_to_list(text)
# print(type(text))
tokens = tokenizer(text, is_split_into_words=True)
# print(tokens)

tokens_mod = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens_mod)


In [None]:
word_ids = tokens.word_ids()
# aligned_labels = [-100 if i is None else text["label"][i] for i in word_ids]
for i in word_ids:
    # print(i)
    if i is None:
        print(i)
    else:
        print(text["label"][i])

In [None]:
import torch
from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)