# Finetuning of BERT for NER

This notebook contains all the steps for the finetuning of French version of BERT, the objective is to extract specific information from french public business agreements. For this we have a dual approach, first we classify information and then only we use NER property of our model to extract wage variations.

Sources: This notebook is inspired from a masterclass and notebook provided by [Thomas Boehler](https://fr.linkedin.com/in/thomas-boehler-ba34a744) and also the work of my colleagues [Mouad Bernoussi](https://ma.linkedin.com/in/mouad-bernoussi-00aa91242) (see ./notebooks/external) and [Conrad Thiounn](https://github.com/cthiounn).

Try

1. Play with PyTorch
2. Play with the model 
3. Adapt data to BERT format (see Thomas Boehler)
4. Create training

In [119]:
import torch
import torch.nn as nn
import pandas as pd
import random
import numpy as np

## try with conll

In [3]:
def charge_conll(path:str):
    # Initialize a list to store the data
    conll_data = []

    # Open the CoNLL file in read mode
    with open(path, 'r', encoding='utf-8') as file:
        conll_data = file.read()
        # Read each line in the file

    # Split the data into sentences
    sentences = conll_data.strip().split('\n\n')

    # Initialize empty lists to store text and labels
    text_list = []
    labels_list = []

    # Process each sentence
    for sentence in sentences:
        tokens = sentence.split('\n')
        text = " ".join(token.split()[0] for token in tokens)
        labels = " ".join(token.split()[-1] for token in tokens)
        text_list.append(text)
        labels_list.append(labels)

    # Create a dataframe
    df = pd.DataFrame({'text': text_list, 'label': labels_list})

    return df

In [143]:
# Specify the file path to your CoNLL file
conll_file_path = r'../../data/raw/data449.conll'  # Replace with your file path

# Initialize a list to store the data
conll_data = []

# Open the CoNLL file in read mode
with open(conll_file_path, 'r', encoding='utf-8') as file:
    conll_data = file.read()
    # Read each line in the file

# Split the data into sentences
sentences = conll_data.strip().split('\n\n')

# Initialize empty lists to store text and labels
text_list = []
labels_list = []

# Process each sentence
for sentence in sentences:
    tokens = sentence.split('\n')
    text = " ".join(token.split()[0] for token in tokens)
    labels = " ".join(token.split()[-1] for token in tokens)
    text_list.append(text)
    labels_list.append(labels)

# Create a dataframe
df = pd.DataFrame({'text': text_list, 'label': labels_list})

# Print the dataframe
print(df)

                                                  text  \
0    -DOCSTART- evolution des salaires de base : en...   
1    l’enveloppe globale d’augmentation des rémunér...   
2    dispositions au regard de l’implication de tou...   
3    nous travaillons sur une politique de rémunéra...   
4    protocole d’accord négociation annuelle obliga...   
..                                                 ...   
444  negociation annuelle 2022. il a été convenu et...   
445  négociations annuelles obligatoires. ii- dispo...   
446  accord collectif 2022 sur les salaires , la du...   
447  damart sa etablissement. article i : augmentat...   
448  entre l’ues kiabi , représentée par , directeu...   

                                                 label  
0    O O O O O O O O O O O O O O O O O O O O O O O ...  
1    O O O O O O O O O O O O O O O O O O O O O O O ...  
2    O O O O O O O O O O O O O O O O B-SYND O B-DIR...  
3    O O O O O O O O O O O O O O O O O O O O O O O ...  
4    O O O O O O O

In [4]:
def turn_sentence_to_list(sentence):
    """
    Turn a sentence into a list of tokens

    Args:
        sentence (str): sentence to tokenize

    Returns:
        list: list of tokens
    """
    return [token for token in sentence.split(" ")]

# def turn_sentence_to_list(sentence):
#     """
#     Turn a sentence into a list of words

#     Args:
#         sentence (str): sentence to tokenize

#     Returns:
#         list: list of words
#     """
#     return sentence.split()


def list_to_string(input_list, separator=' '):
    """
    Convert a list of strings into a single string with elements separated by a specified separator.

    :param input_list: List of strings to be converted.
    :param separator: The separator to use between elements (default is a space).
    :return: A single string containing the list elements.
    """
    return separator.join(input_list)

def change_ids(list_ids:list, new_id:dict):
    return [new_id.get(id) for id in list_ids]

In [73]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}

df["new_labels"]=df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

In [75]:
print(df["new_labels"][5])
print(df["label"][5])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [6]:
# we check that we haven't left non-numeric labels


def check_labels(list_labels:list):
    for label in list_labels:
        if type(label)!=int:
            print(label)
            return False
    return True

# amount = [df["new_labels"].apply(lambda x: check_labels(x))]

def count_error(dataframe, column:str):
    """
    Count the number of errors - a cell containing a non numerical value - in a column of a dataframe

    Args:
        dataframe (pd.DataFrame): dataframe to check
        column (str): column to check

    Returns:
        int: number of errors
    """
    return len(df[column])- sum(dataframe[column].apply(lambda x: check_labels(x)))

# print(f"nombre d'erreurs: {count_error(df, 'new_labels')}")


In [6]:
texts = df["text"].tolist()
start = 0

for text in texts: 
    text = turn_sentence_to_list(text)
    tokens = tokenizer(text, is_split_into_words=True)
    tokens_mod = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
    length = len(tokens_mod)
    if length > 512:
        start += 1
        print(length)
    else:
        pass

print(start)

AttributeError: 'list' object has no attribute 'split'

JB NER is probably not suited since we lost 182 texts, which is too much, due to context limit. We can eventually try to split the text into 512 tokens chunks. 

In [7]:
import pandas as pd
from collections import defaultdict

# Sample data in CoNLL format
conll_data = r'../../data/raw/data449.conll'

with open(conll_data, 'r', encoding='utf-8') as file:
    conll_data = file.read()

# Split the data into sentences
sentences = conll_data.strip().split('\n\n')

# Initialize empty lists and a variable to keep track of the current sentence length
text_list = []
labels_list = []
current_length = 0

# Define the maximum token length for a chunk
max_chunk_length = 512

# Initialize a dictionary to keep track of chunks
chunked_data = defaultdict(list)

# Process each sentence
for sentence in sentences:
    tokens = sentence.split('\n')
    text = " ".join(token.split()[0] for token in tokens)
    labels = " ".join(token.split()[-1] for token in tokens)
    
    # Check if adding this sentence will exceed the maximum chunk length
    if current_length + len(tokens) <= max_chunk_length:
        # Add this sentence to the current chunk
        text_list.append(text)
        labels_list.append(labels)
        current_length += len(tokens)
    else:
        # Start a new chunk
        chunked_data['text'].append(text_list)
        chunked_data['label'].append(labels_list)
        
        # Reset the current chunk
        text_list = [text]
        labels_list = [labels]
        current_length = len(tokens)

# Add the last chunk
chunked_data['text'].append(text_list)
chunked_data['label'].append(labels_list)

# Create a dataframe from the chunked data
df = pd.DataFrame(chunked_data)

# Print the dataframe
print(df)


                                                  text  \
0    [-DOCSTART- evolution des salaires de base : e...   
1    [l’enveloppe globale d’augmentation des rémuné...   
2    [dispositions au regard de l’implication de to...   
3    [nous travaillons sur une politique de rémunér...   
4    [protocole d’accord négociation annuelle oblig...   
..                                                 ...   
404  [negociation annuelle 2022. article 1 – mesure...   
405  [négociations annuelles obligatoires. ii- disp...   
406  [accord collectif 2022 sur les salaires , la d...   
407  [damart sa etablissement. article i : augmenta...   
408  [entre l’ues kiabi , représentée par , directe...   

                                                 label  
0    [O O O O O O O O O O O O O O O O O O O O O O O...  
1    [O O O O O O O O O O O O O O O O O O O O O O O...  
2    [O O O O O O O O O O O O O O O O B-SYND O B-DI...  
3    [O O O O O O O O O O O O O O O O O O O O O O O...  
4    [O O O O O O 

In [7]:
# Example usage:
def format_text(df):
    formated_text = []

    text = df["text"]
    for i in text:
        i = turn_sentence_to_list(i)
        formated_text.append(i)

    # we add it to the dataframe
    df["formated_text"] = formated_text
    return df

# formated_text = []

# text = df["text"]
# for i in text:
#     i = turn_sentence_to_list(i)
#     formated_text.append(i)

# # we add it to the dataframe
# df["formated_text"] = formated_text
# df = format_text(df)


now we format the label to the right format

In [8]:
def format_labels(df):
    labels = df["label"].tolist()

    formated_labels = []

    for label in labels:
        # label = list_to_string(label)
        label = turn_sentence_to_list(label)
        formated_labels.append(label)
        # print(label)

    # we add it to the dataframe

    df["formated_labels"] = formated_labels
    return df

In [10]:
def select_columns(df, columns:list):
    df = df[columns].copy()
    return df

# df = select_columns(df, columns=["formated_text", "formated_labels"])

# df = df.rename(columns={"formated_text": "text", "formated_labels": "label"})



In [11]:
# We check the length of the two columns so that they are of the same dimensions

start = 0

for i in range(len(df["text"])):
    if len(df['text'][i]) != len(df['label'][i]):
        start += 1
    else:
        pass

print("les éléments dont la dimension diverge sont au nombre de",start)

les éléments dont la dimension diverge sont au nombre de 0


In [172]:
df.head()

Unnamed: 0,text,label
0,"[-DOCSTART-, evolution, des, salaires, de, bas...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[dispositions, au, regard, de, l’implication, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[nous, travaillons, sur, une, politique, de, r...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[protocole, d’accord, négociation, annuelle, o...","[O, O, O, O, O, O, O, O, O, B-DIR, O, O, O, O,..."


We start tokenizing the text

In [18]:
type(df["text"][0][0])

str

In [19]:
tokenized_input = tokenizer(df["text"][0], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokenized_input)

{'input_ids': [5, 67, 342, 5363, 7486, 11485, 26, 599, 14573, 20, 8058, 8, 494, 43, 10182, 11558, 43, 51, 30, 13646, 128, 19, 1503, 27, 17, 12, 16886, 11558, 5670, 15, 17, 12, 2010, 20, 8058, 8, 494, 20, 4891, 7826, 68, 3306, 25, 1337, 21, 7, 14, 22745, 37, 1044, 19, 214, 643, 10121, 10, 18, 12, 28806, 15, 28, 3708, 21, 7, 5826, 55, 205, 7, 929, 453, 8, 13, 2269, 16054, 9, 8, 82, 21, 7, 19, 17218, 16850, 10, 22, 8001, 25, 4360, 8, 494, 3138, 263, 163, 22, 93, 125, 27, 16, 4360, 20, 4891, 5552, 9, 24, 3232, 21, 7, 4953, 15, 13, 462, 16054, 18, 12, 35, 1187, 17, 5337, 537, 642, 35, 21, 7, 16, 20692, 297, 759, 28, 1141, 1627, 15, 17, 12, 2010, 25, 4360, 8, 494, 8, 58, 18010, 29, 58, 6230, 21, 7, 22, 3184, 22, 8590, 43, 77, 2283, 32, 17, 12, 520, 77, 2283, 29, 13, 1083, 1363, 13, 2283, 20, 4412, 8733, 36, 93, 359, 8, 1337, 16, 459, 128, 77, 1266, 29, 17, 12, 4515, 8, 4360, 14, 77, 2283, 29, 13, 1083, 38, 951, 169, 350, 8, 325, 3009, 21, 7, 23, 18010, 45, 7826, 34, 68, 3306, 38, 165, 16918,

In [21]:
token_input = tokenizer(df['text'][0], is_split_into_words=True)
print(token_input)

# tokens = tokenizer.convert_ids_to_tokens(token_input["input_ids"])
# print(tokens)

{'input_ids': [5, 67, 342, 5363, 7486, 11485, 26, 599, 14573, 20, 8058, 8, 494, 43, 10182, 11558, 43, 51, 30, 13646, 128, 19, 1503, 27, 17, 12, 16886, 11558, 5670, 15, 17, 12, 2010, 20, 8058, 8, 494, 20, 4891, 7826, 68, 3306, 25, 1337, 21, 7, 14, 22745, 37, 1044, 19, 214, 643, 10121, 10, 18, 12, 28806, 15, 28, 3708, 21, 7, 5826, 55, 205, 7, 929, 453, 8, 13, 2269, 16054, 9, 8, 82, 21, 7, 19, 17218, 16850, 10, 22, 8001, 25, 4360, 8, 494, 3138, 263, 163, 22, 93, 125, 27, 16, 4360, 20, 4891, 5552, 9, 24, 3232, 21, 7, 4953, 15, 13, 462, 16054, 18, 12, 35, 1187, 17, 5337, 537, 642, 35, 21, 7, 16, 20692, 297, 759, 28, 1141, 1627, 15, 17, 12, 2010, 25, 4360, 8, 494, 8, 58, 18010, 29, 58, 6230, 21, 7, 22, 3184, 22, 8590, 43, 77, 2283, 32, 17, 12, 520, 77, 2283, 29, 13, 1083, 1363, 13, 2283, 20, 4412, 8733, 36, 93, 359, 8, 1337, 16, 459, 128, 77, 1266, 29, 17, 12, 4515, 8, 4360, 14, 77, 2283, 29, 13, 1083, 38, 951, 169, 350, 8, 325, 3009, 21, 7, 23, 18010, 45, 7826, 34, 68, 3306, 38, 165, 16918,

In [16]:
# word_ids = token_input.word_ids()
# print(word_ids)

def align_labels(word_ids:list, tag_list:list):
    aligned_labels = []
    for i in word_ids:
        if i is None:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(tag_list[i])
    return aligned_labels

# aligned_labels = align_labels(word_ids, tag_list=df["new_labels"][0])
# aligned_labels = create_aligned_labels(word_ids, example=df)

# print(aligned_labels)
# print(len(aligned_labels))
# print(len(word_ids))

In [191]:
df["word_ids"] = df["text"].apply(lambda x: tokenizer(x, is_split_into_words=True).word_ids())
df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["label"]), axis=1)

In [46]:
list(zip(tokens, aligned_labels))

[('<s>', -100),
 ('▁-', 'O'),
 ('D', 'O'),
 ('OC', 'O'),
 ('ST', 'O'),
 ('ART', 'O'),
 ('-', 'O'),
 ('▁e', 'O'),
 ('volution', 'O'),
 ('▁des', 'O'),
 ('▁salaires', 'O'),
 ('▁de', 'O'),
 ('▁base', 'O'),
 ('▁:', 'O'),
 ('▁enveloppe', 'O'),
 ('▁budgétaire', 'O'),
 ('▁:', 'O'),
 ('▁il', 'O'),
 ('▁est', 'O'),
 ('▁convenu', 'O'),
 ('▁entre', 'O'),
 ('▁les', 'O'),
 ('▁parties', 'O'),
 ('▁que', 'O'),
 ('▁l', 'O'),
 ('’', 'O'),
 ('enveloppe', 'O'),
 ('▁budgétaire', 'O'),
 ('▁consacrée', 'O'),
 ('▁à', 'O'),
 ('▁l', 'O'),
 ('’', 'O'),
 ('évolution', 'O'),
 ('▁des', 'O'),
 ('▁salaires', 'O'),
 ('▁de', 'O'),
 ('▁base', 'O'),
 ('▁des', 'B-TOUS'),
 ('▁collaborateurs', 'I-TOUS'),
 ('▁répondant', 'O'),
 ('▁aux', 'O'),
 ('▁attentes', 'O'),
 ('▁du', 'O'),
 ('▁poste', 'O'),
 ('▁', 'O'),
 (',', 'O'),
 ('▁et', 'O'),
 ('▁remplissant', 'O'),
 ('▁par', 'O'),
 ('▁ailleurs', 'O'),
 ('▁les', 'O'),
 ('▁autres', 'O'),
 ('▁conditions', 'O'),
 ('▁habituelle', 'O'),
 ('s', 'O'),
 ('▁d', 'O'),
 ('’', 'O'),
 ('éligibili

In [92]:
def create_iids_am(df):
    input_ids = []
    attention_mask = []

    for _,i in df.iterrows():
        input = i["ids"]["input_ids"]
        attention = i["ids"]["attention_mask"]

        input_ids.append(input)
        attention_mask.append(attention)

    df["input_ids"] = input_ids
    df["attention_mask"] = attention_mask
    return df

df = create_iids_am(df)
df[["input_ids", "attention_mask"]].head()
# df_ids = df[["ids"]].copy()
# df_ids["ids"][0]

Unnamed: 0,input_ids,attention_mask
0,"[5, 67, 342, 5363, 7486, 11485, 26, 599, 14573...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[5, 17, 12, 16886, 4141, 18, 12, 5314, 20, 601...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[5, 2756, 36, 897, 8, 17, 12, 16286, 8, 117, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[5, 63, 13807, 32, 28, 462, 8, 6016, 1344, 38,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[5, 5996, 18, 12, 1311, 8776, 5998, 3329, 325,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [159]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}
columns = ["formated_text", "new_labels","word_ids", "input_ids", "attention_mask", "aligned_labels"]
new_names = {"formated_text": "text", "new_labels": "label", "word_ids": "word_ids", "aligned_labels": "aligned_labels"}

def tokenize_and_align_data(path_conll:str, new_id:dict):
    #charge and create the df
    df = charge_conll(path_conll)
    df = format_text(df)
    df = format_labels(df)
    df["new_labels"]=df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

    #tokenize and align the text
    df["ids"] = df["formated_text"].apply(lambda x: tokenizer(x, truncation=True, is_split_into_words=True))
    df = create_iids_am(df)
    df["word_ids"] = df["formated_text"].apply(lambda x: tokenizer(x, truncation=True, is_split_into_words=True).word_ids())
    df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["new_labels"]), axis=1)
    return df

def apply_tokenization(conll_path:str, new_id:dict, columns:list, new_names:dict, save_path=False, output_save=None):
    df = tokenize_and_align_data(conll_path, new_id=new_id)
    df = select_columns(df, columns)
    df = df.rename(columns=new_names)
    #we check the length of the two columns so that they are of the same dimensions
    print(f"nombre de 'aligned_labels' faux: {count_error(df, 'aligned_labels')}")
    if save_path:
        df.to_csv(output_save, index=False)
    return df

df = apply_tokenization(r"..\..\data\raw\data449.conll", new_id=new_id, columns=columns, new_names=new_names, save_path=True, output_save=r"../../data/intermediate/data499_token.csv")
df.head()
# df = tokenize_and_align_data(r"..\..\data\raw\data449.conll", new_id=new_id)
# df = select_columns(df, columns)
# df = df.rename(columns=new_names)
# #we check the length of the two columns so that they are of the same dimensions
# print(f"nombre de 'aligned_labels' faux: {count_error(df, 'aligned_labels')}")
# # df.to_csv(r"../../data/intermediate/data499_token.csv", index=False)
# df

nombre de 'aligned_labels' faux: 0


Unnamed: 0,text,label,word_ids,input_ids,attention_mask,aligned_labels
0,"[-DOCSTART-, evolution, des, salaires, de, bas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 3, 3, 3, 4, 5, 6, ...","[101, 1011, 9986, 14117, 2102, 1011, 6622, 407...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, ...","[101, 1048, 1521, 4372, 15985, 7361, 5051, 379...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[dispositions, au, regard, de, l’implication, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 1, 2, 3, 4, 4, 4, 5, 6, 6, 7, 8, ...","[101, 22137, 2015, 8740, 7634, 2139, 1048, 152...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[nous, travaillons, sur, une, politique, de, r...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 1, 1, 1, 1, 2, 3, 4, 4, 4, 5, 6, ...","[101, 2053, 2271, 19817, 12462, 20343, 2015, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[protocole, d’accord, négociation, annuelle, o...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, ...","[None, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, ...","[101, 8778, 2063, 1040, 1521, 15802, 11265, 39...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [161]:
list_text = df["text"].tolist()

for i in list_text:
    if len(i) > 512:
        print(len(i))
    else:
        print("ok")

ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
o

In [112]:
# the objective is to collect all the inputs_ids, attention_mask and labels in a list for a given range of sentences and store them into a dictionnary

def final_formating(df, start:int, end:int):
    """
    Collect the data from a dataframe in a list for a given range of sentences and store them into a dictionnary

    Args:
        df (pd.DataFrame): dataframe containing the data
        start (int): index of the first sentence to collect
        end (int): index of the last sentence to collect

    Returns:
        list: list of the data
    """
    input_ids = []
    attention_mask = []
    labels = []

    for i in range(start, end):
        input_ids.append(df["input_ids"][i])
        attention_mask.append(df["attention_mask"][i])
        labels.append(df["aligned_labels"][i])

    data = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

    return data
    
# data = final_formating(df, 0, 5)
# print(data)
# for _,i in df.iterrows():
    

In [123]:
def download_and_treat_data(conll_path:str, new_id:dict, columns:list, new_names:dict, start:int, end:int):
    """
    This function compile all the previous one and treat automatically all data provided as conll format.\n
    The objective is to provide a dictionnary containing the data for a given range of sentences with only one function. \n
    The same results can be obtained by parts with apply_tokenization and final_formating functions. This one is more a convenience function.

    Args:
        conll_path (str): path to the conll file
        new_id (dict): dictionnary to change the labels
        columns (list): columns to keep
        new_names (dict): new names of the columns
        start (int): index of the first sentence to collect
        end (int): index of the last sentence to collect
    
    Returns:
        dict: dictionnary containing the data
    """
    df = apply_tokenization(conll_path, new_id=new_id, columns=columns, new_names=new_names)
    data = final_formating(df, start, end)
    return data

In [163]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}
columns = ["formated_text", "new_labels","word_ids", "input_ids", "attention_mask", "aligned_labels"]
new_names = {"formated_text": "text", "new_labels": "label", "word_ids": "word_ids", "aligned_labels": "aligned_labels"}
conll_path = r"..\..\data\raw\data449.conll"
start = 0
end = 449

data = download_and_treat_data(conll_path=conll_path, new_id=new_id, columns=columns, new_names=new_names, start=start, end=end)

nombre de 'aligned_labels' faux: 0


In [164]:
#we divide the data into train, test, and validation sets

def split_data(data_dict, train_size=0.8, test_size=0.1, val_size=0.1, random_seed=None):
    # Set a random seed for reproducibility
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    # Combine input_ids, attention_mask, and aligned_labels into a single list
    combined_data = list(zip(data_dict['input_ids'], data_dict['attention_mask'], data_dict['labels']))

    # Shuffle the data
    random.shuffle(combined_data)

    # Calculate the sizes of each set
    total_size = len(combined_data)
    train_size = int(train_size * total_size)
    test_size = int(test_size * total_size)
    val_size = int(val_size * total_size)

    # Split the data into train, test, and val sets
    train_data = combined_data[:train_size]
    test_data = combined_data[train_size:train_size + test_size]
    val_data = combined_data[train_size + test_size:train_size + test_size + val_size]

    # Unzip the data to restore the original structure
    train_input_ids, train_attention_mask, train_aligned_labels = zip(*train_data)
    test_input_ids, test_attention_mask, test_aligned_labels = zip(*test_data)
    val_input_ids, val_attention_mask, val_aligned_labels = zip(*val_data)

    # Create dictionaries for the train, test, and val sets
    train_set = {
        'input_ids': list(train_input_ids),
        'attention_mask': list(train_attention_mask),
        'labels': list(train_aligned_labels)
    }
    test_set = {
        'input_ids': list(test_input_ids),
        'attention_mask': list(test_attention_mask),
        'labels': list(test_aligned_labels)
    }
    val_set = {
        'input_ids': list(val_input_ids),
        'attention_mask': list(val_attention_mask),
        'labels': list(val_aligned_labels)
    }

    return train_set, test_set, val_set

In [165]:
train_data, test_data, val_data = split_data(data)

we get rid of the list with a dimension superior to 512 as the model doesn't hold more

Now we turn into tensors our data so that BERT can read them

In [166]:
from datasets import Dataset

train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)
val_dataset = Dataset.from_dict(val_data)

In [194]:
# list of labels
reverse_id = {v: k for k, v in new_id.items()}
second_elements = [value for value in reverse_id.values()]
label_list = second_elements
print(label_list)

['O', 'SYND', 'DIR', 'DATE', 'ENT', 'CAD', 'INT', 'OUV', 'NCAD', 'NOUV', 'TOUS', 'AG CAD', 'AG INT', 'AG OUV', 'AG NCAD', 'AG NOUV', 'AI CAD', 'AI INT', 'AI OUV', 'AI NCAD', 'AI NOUV', 'AG', 'AI', 'ATOT', 'ATOT CAD', 'ATOT INT', 'ATOT OUV', 'ATOT NCAD', 'ATOT NOUV', 'PPV', 'PPVm']


In [199]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner", num_labels=len(label_list), ignore_mismatched_sizes=True) #this last argument might be a mistake

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([31]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([31, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [152]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='pt')

In [154]:
data_collator(train_dataset)



{'input_ids': tensor([[  101, 11265,  3995,  ...,     0,     0,     0],
        [  101,  2033, 28632,  ...,     0,     0,     0],
        [  101, 11265,  3995,  ...,     0,     0,     0],
        ...,
        [  101, 15802,  9530,  ...,     0,     0,     0],
        [  101,  8292,  4674,  ...,     0,     0,     0],
        [  101, 15802,  8145,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        ...,
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])}

In [167]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

test_dataloader = DataLoader(
    test_dataset,
    collate_fn=data_collator,
    batch_size=8,
)

val_dataloader = DataLoader(
    val_dataset,
    collate_fn=data_collator,
    batch_size=8,
)

In [168]:
next(iter(train_dataloader))



{'input_ids': tensor([[  101, 11265,  3995,  ...,  2603,  1012,   102],
        [  101, 15802,  3523,  ...,  1048,  1521,   102],
        [  101, 15802, 28573,  ..., 12032,  6648,   102],
        ...,
        [  101,  3393, 16183,  ...,  3370,  3802,   102],
        [  101, 15802, 11265,  ...,  2102,  4372,   102],
        [  101, 11265,  3995,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100],
        ...,
        [-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])}

In [197]:
model(**(next(iter(train_dataloader))))



TokenClassifierOutput(loss=tensor(3.4393, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.0660,  0.1158, -0.2266,  ...,  0.1506, -0.1803,  0.0611],
         [-0.0005,  0.1099, -0.2516,  ...,  0.1760, -0.1128,  0.2022],
         [ 0.0554,  0.1673, -0.3202,  ...,  0.1732, -0.0896,  0.1673],
         ...,
         [ 0.0371,  0.1019, -0.2497,  ...,  0.1846, -0.1285,  0.2170],
         [ 0.0308,  0.1380, -0.3178,  ...,  0.1385, -0.1088,  0.1669],
         [ 0.0698,  0.1126, -0.2297,  ...,  0.1580, -0.1785,  0.0630]],

        [[ 0.0830,  0.1151, -0.2358,  ...,  0.1582, -0.1713,  0.0922],
         [ 0.0167,  0.0629, -0.2582,  ...,  0.2050, -0.1119,  0.2376],
         [ 0.0467,  0.0902, -0.2634,  ...,  0.2100, -0.0909,  0.2062],
         ...,
         [ 0.0830,  0.1151, -0.2358,  ...,  0.1582, -0.1713,  0.0922],
         [ 0.0830,  0.1151, -0.2358,  ...,  0.1582, -0.1713,  0.0922],
         [ 0.0849,  0.1131, -0.2383,  ...,  0.1637, -0.1713,  0.0919]],

        [[ 0.0602,  0.1345, -0.2245,  

Training Argument

In [204]:
from transformers import TrainingArguments

batch_size = 8

model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

#### drafty draft

In [37]:
df_token = df[['input_ids']].copy()
print(df_token["input_ids"][0]["input_ids"])

[5, 67, 342, 5363, 7486, 11485, 26, 599, 14573, 20, 8058, 8, 494, 43, 10182, 11558, 43, 51, 30, 13646, 128, 19, 1503, 27, 17, 12, 16886, 11558, 5670, 15, 17, 12, 2010, 20, 8058, 8, 494, 20, 4891, 7826, 68, 3306, 25, 1337, 21, 7, 14, 22745, 37, 1044, 19, 214, 643, 10121, 10, 18, 12, 28806, 15, 28, 3708, 21, 7, 5826, 55, 205, 7, 929, 453, 8, 13, 2269, 16054, 9, 8, 82, 21, 7, 19, 17218, 16850, 10, 22, 8001, 25, 4360, 8, 494, 3138, 263, 163, 22, 93, 125, 27, 16, 4360, 20, 4891, 5552, 9, 24, 3232, 21, 7, 4953, 15, 13, 462, 16054, 18, 12, 35, 1187, 17, 5337, 537, 642, 35, 21, 7, 16, 20692, 297, 759, 28, 1141, 1627, 15, 17, 12, 2010, 25, 4360, 8, 494, 8, 58, 18010, 29, 58, 6230, 21, 7, 22, 3184, 22, 8590, 43, 77, 2283, 32, 17, 12, 520, 77, 2283, 29, 13, 1083, 1363, 13, 2283, 20, 4412, 8733, 36, 93, 359, 8, 1337, 16, 459, 128, 77, 1266, 29, 17, 12, 4515, 8, 4360, 14, 77, 2283, 29, 13, 1083, 38, 951, 169, 350, 8, 325, 3009, 21, 7, 23, 18010, 45, 7826, 34, 68, 3306, 38, 165, 16918, 10, 936, 575,

In [207]:
print(df["aligned_labels"][0])
df_test = df["aligned_labels"].apply(lambda x :change_ids(x, new_id=new_id))
print(df_test[0])

[-100, 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TOUS', 'I-TOUS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ATOT', 'B-ATOT', 'B-ATOT', 'I-ATOT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ENT', 'B-ENT', 'B-ENT', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [30]:
example = df.iloc[0]

In [34]:
tokenized_input = tokenizer(example["text"], is_split_into_words=True)
aligned_labels = [-100 if i is None else example[f"label"][i] for i in word_ids]
print(example["label"])
print(len(word_ids))
print(len(aligned_labels))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TOUS', 'I-TOUS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ATOT', 'I-ATOT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [45]:
list(zip(tokens_mod, aligned_labels))

NameError: name 'tokens_mod' is not defined

## others

Load a dataset for training from Huggingface

In [246]:
# If you like it you can download it
data = import_label_studio_data("../../data/raw/data449.json")

df = pd.DataFrame(data, columns = ['text', 'label'])

In [None]:
df.head()

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

In [243]:
def turn_sentence_to_list(sentence):
    """
    Turn a sentence into a list of tokens

    Args:
        sentence (str): sentence to tokenize

    Returns:
        list: list of tokens
    """
    return [token for token in sentence.split(" ")]

In [247]:
for i in range(len(df["text"])):
    df["text"][i] = turn_sentence_to_list(df["text"][i])

df.head()

Unnamed: 0,text,label
0,"[evolution, des, salaires, de, base, :, envelo...","{'entities': [(322, 326, 'ATOT'), (161, 179, '..."
1,"[l’enveloppe, globale, d’augmentation, des, ré...","{'entities': [(229, 237, 'OUV'), (239, 247, 'O..."
2,"[dispositions, au, regard, de, l’implication, ...","{'entities': [(101, 105, 'SYND'), (110, 122, '..."
3,"[nous, travaillons, sur, une, politique, de, r...","{'entities': [(165, 172, 'SYND'), (364, 371, '..."
4,"[protocole, d’accord, négociation, annuelle, o...","{'entities': [(73, 82, 'DIR'), (108, 119, 'ENT..."


In [None]:
length = []

for i in range(len(df["text"])):
    element = df["text"][i]
    length.append(len(element))
    # print(len(df["text"][i])) 

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors

plt.hist(length, bins=20, color = "lightgreen", edgecolor='black') 

# Add labels and title
plt.xlabel('Longeur (élément)')
plt.ylabel('Fréquence')
plt.title('Histogram de la longueurs des phrases en élément')

# Show the histogram
plt.show()


In [None]:
import pandas as pd

# Sample DataFrame
data = {
    'text': ["evolution des salaires de base : enveloppe"],
    'label': [{'entities': [(22, 31, 'ATOT')]}]
}

df = pd.DataFrame(data)

# Function to tokenize text while preserving whitespace
def tokenize_text(row):
    text = row['text']
    tokens = []
    start = 0

    for start, end, entity in row['label']['entities']:
        # Add non-entity text
        tokens.extend(text[start:end].split())
        start = end

    # Add any remaining text after the last entity
    tokens.extend(text[start:].split())
    
    return tokens

# Apply the function to create a new column
df['tokenized_text'] = df.apply(tokenize_text, axis=1)

# Display the DataFrame with tokenized text while preserving whitespace
print(df)


In [None]:
for i in range(len(df["label"])):
    print(df["label"][i]["entities"])
    


In [None]:
text = df['text'][0]
text = turn_sentence_to_list(text)
# print(type(text))
tokens = tokenizer(text, is_split_into_words=True)
# print(tokens)

tokens_mod = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens_mod)


In [None]:
word_ids = tokens.word_ids()
# aligned_labels = [-100 if i is None else text["label"][i] for i in word_ids]
for i in word_ids:
    # print(i)
    if i is None:
        print(i)
    else:
        print(text["label"][i])

In [None]:
import torch
from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)