# Fine-tuning BERT for the NER task

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import random
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.utils.data import DataLoader


## Preprocessing Data

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the model
model_checkpoint = "Jean-Baptiste/camembert-ner"

# The tokenizer is used as a global by tokenize_and_align_data
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Model loaded with the checkpoint's own configuration (no extra arguments here)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [None]:
def charge_conll(path:str):
    """
    Load a CoNLL file into a dataframe with one row per sentence

    A sentence is a run of consecutive non-blank lines. For every line, the first
    whitespace-separated field is kept as the word and the last one as its label.
    Blank or whitespace-only lines (any number of them) only separate sentences,
    and an empty file gives an empty dataframe.

    Args:
        path (str): path to the conll file, read as UTF-8

    Returns:
        pd.DataFrame: dataframe with a 'text' column (words joined by a space) and
        a 'label' column (labels joined by a space)
    """
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.read().split('\n')

    # Initialize empty lists to store text and labels
    text_list = []
    labels_list = []

    # Words and labels of the sentence currently being read
    words = []
    tags = []

    # The trailing empty line guarantees that the last sentence is flushed
    for line in lines + ['']:
        fields = line.split()
        if fields:
            # NOTE(review): a label that itself contains a space is reduced to its
            # last field here - confirm the annotation format of the data.
            words.append(fields[0])
            tags.append(fields[-1])
        elif words:
            # A blank line closes the current sentence
            text_list.append(" ".join(words))
            labels_list.append(" ".join(tags))
            words = []
            tags = []

    # Create a dataframe
    return pd.DataFrame({'text': text_list, 'label': labels_list})

# Split the data into train, test, and validation sets

def split_data(data_dict, train_size=0.8, test_size=0.1, val_size=0.1, random_seed=None):
    """
    Shuffle the samples and split them into train, test and validation sets

    The size of each set is the truncated product of its fraction and the number
    of samples, so a few samples can be left out of all three sets. A set whose
    computed size is 0 is returned with empty lists.

    Args:
        data_dict (dict): dictionary with 'input_ids', 'attention_mask' and 'labels' lists
        train_size (float, optional): fraction of samples for the train set. Defaults to 0.8.
        test_size (float, optional): fraction of samples for the test set. Defaults to 0.1.
        val_size (float, optional): fraction of samples for the validation set. Defaults to 0.1.
        random_seed (int, optional): seed given to `random` and `numpy.random`. Defaults to None.

    Returns:
        tuple: (train_set, test_set, val_set), each a dictionary with the same three keys
    """
    # Set a random seed for reproducibility
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    # Keep the three fields of each sample together while shuffling
    combined_data = list(zip(data_dict['input_ids'], data_dict['attention_mask'], data_dict['labels']))
    random.shuffle(combined_data)

    # Calculate the sizes of each set
    total_size = len(combined_data)
    n_train = int(train_size * total_size)
    n_test = int(test_size * total_size)
    n_val = int(val_size * total_size)

    def build_set(rows):
        # Built column by column so that an empty partition gives empty lists
        # instead of failing when unpacking zip(*rows)
        return {
            'input_ids': [row[0] for row in rows],
            'attention_mask': [row[1] for row in rows],
            'labels': [row[2] for row in rows],
        }

    train_set = build_set(combined_data[:n_train])
    test_set = build_set(combined_data[n_train:n_train + n_test])
    val_set = build_set(combined_data[n_train + n_test:n_train + n_test + n_val])

    return train_set, test_set, val_set

def turn_sentence_to_list(sentence):
    """
    Split a sentence on whitespace

    Args:
        sentence (str): sentence to split

    Returns:
        list: the whitespace-separated words, in their original order
    """
    words = sentence.split()
    return words

def format_text(df):
    """
    Add a 'formated_text' column holding each sentence of 'text' as a list of words

    Args:
        df (pd.DataFrame): dataframe with a 'text' column of sentences

    Returns:
        pd.DataFrame: the same dataframe, with the new column added in place
    """
    df["formated_text"] = [turn_sentence_to_list(sentence) for sentence in df["text"]]
    return df

def format_labels(df):
    """
    Add a 'formated_labels' column holding each entry of 'label' as a list of labels

    Args:
        df (pd.DataFrame): dataframe with a 'label' column of space-separated labels

    Returns:
        pd.DataFrame: the same dataframe, with the new column added in place
    """
    raw_labels = df["label"].tolist()
    df["formated_labels"] = [turn_sentence_to_list(raw) for raw in raw_labels]
    return df

def change_ids(list_ids:list, new_id:dict):
    """
    Translate every element of a list through a mapping

    Args:
        list_ids (list): values to translate
        new_id (dict): mapping from old value to new value

    Returns:
        list: translated values, with None for any value missing from the mapping
    """
    converted = []
    for old_value in list_ids:
        converted.append(new_id.get(old_value))
    return converted

def align_labels(word_ids:list, tag_list:list):
    """
    Give each token the tag of the word it belongs to

    Args:
        word_ids (list): for each token, the index of its word, or None
        tag_list (list): one tag per word

    Returns:
        list: one label per token, with -100 wherever the word index is None
    """
    return [-100 if word_index is None else tag_list[word_index] for word_index in word_ids]

def check_labels(list_labels:list):
    """
    Tell whether every element of a list is exactly of type int

    The first element of another type is printed before returning.

    Args:
        list_labels (list): labels to check

    Returns:
        bool: True if all elements are of type int, False otherwise
    """
    for candidate in list_labels:
        if type(candidate) is int:
            continue
        print(candidate)
        return False
    return True

def count_error(df, column:str):
    """
    Count the cells of a column for which check_labels returns False

    Args:
        df (pd.DataFrame): dataframe to check
        column (str): column to check, each cell being a list of labels

    Returns:
        int: number of cells rejected by check_labels
    """
    cells = df[column]
    valid_cells = sum(cells.apply(check_labels))
    return len(cells) - valid_cells

def create_iids_am(df):
    """
    Copy the input ids and the attention mask out of the 'ids' column

    Args:
        df (pd.DataFrame): dataframe whose 'ids' cells can be indexed with
            'input_ids' and 'attention_mask'

    Returns:
        pd.DataFrame: the same dataframe, with 'input_ids' and 'attention_mask'
        columns added in place
    """
    encodings = list(df["ids"])
    df["input_ids"] = [encoding["input_ids"] for encoding in encodings]
    df["attention_mask"] = [encoding["attention_mask"] for encoding in encodings]
    return df

def select_columns(df, columns:list):
    """
    Return a copy of a dataframe restricted to the given columns

    Args:
        df (pd.DataFrame): source dataframe, left untouched
        columns (list): names of the columns to keep, in the wanted order

    Returns:
        pd.DataFrame: independent copy holding only the selected columns
    """
    subset = df[columns]
    return subset.copy()

def tokenize_and_align_data(path_conll:str, new_id:dict):
    """
    Tokenize the text and align the labels

    Uses the module-level `tokenizer`. Each sentence is tokenized once; the word
    ids are read from the encoding stored in the 'ids' column.

    Args:
        path_conll (str): path to the conll file
        new_id (dict): dictionary mapping each label to its new id

    Returns:
        pd.DataFrame: dataframe with, on top of the columns created by charge_conll,
        format_text and format_labels, the columns 'new_labels', 'ids', 'input_ids',
        'attention_mask', 'word_ids' and 'aligned_labels'
    """
    # load the file and create the df
    df = charge_conll(path_conll)
    df = format_text(df)
    df = format_labels(df)
    # labels not found in new_id become None (see change_ids)
    df["new_labels"] = df["label"].apply(lambda x: change_ids(x.split(" "), new_id))

    # tokenize the pre-split words
    df["ids"] = df["formated_text"].apply(lambda x: tokenizer(x, truncation=True, is_split_into_words=True))
    df = create_iids_am(df)
    # reuse the stored encoding instead of tokenizing every sentence a second time
    df["word_ids"] = df["ids"].apply(lambda encoding: encoding.word_ids())
    # one label per token, -100 where the token has no word id (see align_labels)
    df["aligned_labels"] = df.apply(lambda x: align_labels(x["word_ids"], x["new_labels"]), axis=1)
    return df

def apply_tokenization(conll_path:str, new_id:dict, columns:list, new_names:dict, save_path=False, output_save_csv=None):
    """
    Tokenize and align a conll file, then keep and rename the wanted columns

    Also prints the number of rows whose 'aligned_labels' cell is rejected by
    count_error.

    Args:
        conll_path (str): path to the conll file
        new_id (dict): dictionary mapping each label to its new id
        columns (list): list of columns to keep
        new_names (dict): dictionary mapping current column names to new ones
        save_path (bool, optional): if True, write the dataframe to a csv file. Defaults to False.
        output_save_csv (str, optional): path to the csv file. Defaults to None.

    Returns:
        pd.DataFrame: dataframe containing the selected, renamed columns
    """
    tokenized = tokenize_and_align_data(conll_path, new_id=new_id)
    df = select_columns(tokenized, columns).rename(columns=new_names)

    # report how many rows hold an 'aligned_labels' list with a non-integer entry
    invalid_count = count_error(df, 'aligned_labels')
    print(f"nombre de 'aligned_labels' faux: {invalid_count}")

    if not save_path:
        return df
    df.to_csv(output_save_csv, index=False)
    return df

def final_formating(df, start:int, end:int):
    """
    Collect the data of a range of sentences from a dataframe into a dictionary

    Rows are selected by position, so the result does not depend on the index
    labels of the dataframe. As with list slicing, a range running past the end
    of the dataframe is clipped.

    Args:
        df (pd.DataFrame): dataframe with 'input_ids', 'attention_mask' and
            'aligned_labels' columns
        start (int): position of the first sentence to collect
        end (int): position just after the last sentence to collect (exclusive)

    Returns:
        dict: dictionary with the keys 'input_ids', 'attention_mask' and 'labels'
    """
    # Positional slice: df[col][i] would look rows up by index label instead
    selected = df.iloc[start:end]

    return {
        "input_ids": selected["input_ids"].tolist(),
        "attention_mask": selected["attention_mask"].tolist(),
        "labels": selected["aligned_labels"].tolist(),
    }

def download_and_treat_data(conll_path:str, new_id:dict, columns:list, new_names:dict, start:int, end:int):
    """
    Convenience wrapper chaining apply_tokenization and final_formating

    It turns a conll file into the dictionary of a given range of sentences with a
    single call. The same result can be obtained by calling the two functions one
    after the other.

    Args:
        conll_path (str): path to the conll file
        new_id (dict): dictionary mapping each label to its new id
        columns (list): columns to keep
        new_names (dict): new names of the columns
        start (int): first bound of the sentence range, passed to final_formating
        end (int): second bound of the sentence range, passed to final_formating

    Returns:
        dict: dictionary containing the data
    """
    tokenized_df = apply_tokenization(conll_path, new_id=new_id, columns=columns, new_names=new_names)
    return final_formating(tokenized_df, start, end)


In [None]:
new_id = {"O":0,"B-SYND":1, "I-SYND":1, "SYND":1, "B-DIR":2, "I-DIR":2, "DIR":2, "B-DATE":4, "I-DATE":4, "DATE":4, "B-ENT":3, "I-ENT":3, "ENT":3, "B-CAD":5, "I-CAD":5, "CAD":5, "B-INT":6, "I-INT":6, "INT":6,"B-OUV":7, "I-OUV":7, "OUV":7, "B-NCAD":8, "I-NCAD":8, "NCAD":8,"B-NOUV":9, "I-NOUV":9, "NOUV":9, "B-TOUS":10,"I-TOUS":10, "TOUS":10, "B-AG CAD":11,"I-AG CAD":11, "AG CAD":11, "B-AG INT":12, "I-AG INT":12, "AG INT":12, "B-AG OUV":13, "I-AG OUV":13, "AG OUV":13,"B-AG NCAD":14, "I-AG NCAD":14, "AG NCAD":14, "B-AG NOUV":15, "I-AG NOUV":15, "AG NOUV":15,"B-AI CAD":16, "I-AI CAD":16,"AI CAD":16,"B-AI INT":17, "I-AI INT":17, "AI INT":17,"B-AI OUV":18, "I-AI OUV":18,"AI OUV":18,"B-AI NCAD":19, "I-AI NCAD":19,"AI NCAD":19,"B-AI NOUV":20, "I-AI NOUV":20, "AI NOUV":20, "B-AG":21, "I-AG":21, "AG":21,"B-AI":22, "I-AI":22,"AI":22,"B-ATOT":23,"I-ATOT":23, "ATOT":23,"B-ATOT CAD":24, "I-ATOT CAD":24, "ATOT CAD":24,"B-ATOT INT":25, "I-ATOT INT":25,"ATOT INT":25,"B-ATOT OUV":26, "I-ATOT OUV":26, "ATOT OUV":26,"B-ATOT NCAD":27, "I-ATOT NCAD":27, "ATOT NCAD":27,"B-ATOT NOUV":28, "I-ATOT NOUV":28, "ATOT NOUV":28,"B-PPV":29, "I-PPV":29, "PPV":29,"B-PPVm":30, "I-PPVm":30, "PPVm":30}
columns = ["formated_text", "new_labels", "word_ids", "aligned_labels"]
new_names = {"formated_text": "text", "new_labels": "label", "word_ids": "word_ids", "aligned_labels": "aligned_labels"}
output_save_csv = r"../../data/intermediate/data499_token.csv"

# Run the whole preprocessing pipeline on sentences 0 to 498 of the corpus
data = download_and_treat_data(
    conll_path=r"../../data/raw/data499.conll",
    new_id=new_id,
    columns=columns,
    new_names=new_names,
    start=0,
    end=499,
)

# Shuffle and partition the samples (no seed is given, so the split varies between runs)
train_data, test_data, val_data = split_data(data)

# Wrap each partition in a Dataset object
train_dataset, test_dataset, val_dataset = (
    Dataset.from_dict(partition) for partition in (train_data, test_data, val_data)
)

# One tag name per class id: when several tags share an id, the last one listed in new_id wins
reverse_id = {class_id: tag for tag, class_id in new_id.items()}
second_elements = list(reverse_id.values())
label_list = second_elements

# Reload the tokenizer and the model, this time with one output class per entry of label_list
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# ignore_mismatched_sizes=True lets the checkpoint load even when its classification
# head does not have num_labels outputs; a mismatched head is then newly initialized
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True,
)

# Collator that batches the samples and returns PyTorch tensors
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='pt')

# Dataloaders with batches of 8; only the training set is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=8)
test_dataloader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=8)
val_dataloader = DataLoader(val_dataset, collate_fn=data_collator, batch_size=8)

# Sanity check: draw one training batch
next(iter(train_dataloader))

# Sanity check: forward pass of the model on a newly drawn training batch
model(**next(iter(train_dataloader)))

## Training Arguments