# Finetuning Spacy

We fine-tune a spaCy model because spaCy already provides a pipeline designed for named-entity recognition (NER). This notebook is based on this tutorial: https://www.freecodecamp.org/news/how-to-fine-tune-spacy-for-nlp-use-cases/

In [None]:
import json
import numpy as np
import os

In [None]:
# Root folder that holds the data, model and text files used by this notebook.
# Set the SPACY_FINETUNING_ROOT environment variable to run the notebook from another
# location; the original local folder is kept as the default value.
ROOT_PATH = os.environ.get(
    "SPACY_FINETUNING_ROOT",
    r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files",
)

In [None]:
# https://www.kaggle.com/code/kiruthigaa/ner-model-train-test-using-spacy-label-studio

def import_label_studio_data(filename):
    """
    Import a Label Studio JSON export and convert it into the format required by spaCy.

    Every task of the export is read as a dict with a 'text' entry and an optional
    'label' entry: a list of annotations carrying 'start', 'end' and 'labels'.

    An annotation is discarded when its start offset lies inside the
    [start, end] range of another annotation of the same task (two annotations
    sharing the same start are therefore both discarded). Annotations without
    any 'labels' value are ignored.

    Args:
        filename: Path of the Label Studio JSON file.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        task. Tasks without usable annotations are kept with an empty entity list.
    """
    with open(filename, 'rb') as fp:
        tasks = json.load(fp)

    train_data = []
    for task in tasks:
        # `or []` covers a missing 'label' key as well as a null or empty list,
        # which previously crashed while indexing an empty array.
        annotations = task.get('label') or []
        spans = [(annotation.get('start'), annotation.get('end')) for annotation in annotations]

        entities = []
        for index, annotation in enumerate(annotations):
            start, end = spans[index]
            # Does this annotation start inside any *other* annotation of the task?
            starts_inside_other = any(
                other_start <= start <= other_end
                for other_index, (other_start, other_end) in enumerate(spans)
                if other_index != index
            )
            if starts_inside_other:
                continue

            labels = annotation.get('labels')
            if labels:
                # Only the first label of the annotation is kept.
                entities.append((start, end, labels[0]))

        train_data.append((task.get('text'), {"entities": entities}))
    return train_data

In [None]:
# Convert the whole export, then preview the first ten converted examples.
data = import_label_studio_data(
    os.path.join(ROOT_PATH, r"data/training_json/data449.json")
)
data[:10]

****
This alternative function is here to select only a specific label

In [None]:
import json
import numpy as np

def import_label_studio_data(filename, target_label):
    """
    Import a Label Studio JSON export and return it in the format required for training,
    keeping only the annotations whose label list is exactly ``[target_label]``.

    An annotation is discarded when its start offset lies inside the
    [start, end] range of another annotation of the same task, whatever the
    label of that other annotation (two annotations sharing the same start are
    therefore both discarded).

    Args:
        filename: Path of the Label Studio JSON file.
        target_label: The single label to keep.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        task that has at least one kept annotation; other tasks are left out.
    """
    with open(filename, 'rb') as fp:
        tasks = json.load(fp)

    train_data = []
    for task in tasks:
        # `or []` covers a missing 'label' key as well as a null or empty list,
        # which previously crashed while indexing an empty array.
        annotations = task.get('label') or []
        spans = [(annotation.get('start'), annotation.get('end')) for annotation in annotations]

        entities = []
        for index, annotation in enumerate(annotations):
            start, end = spans[index]
            # Does this annotation start inside any *other* annotation of the task?
            starts_inside_other = any(
                other_start <= start <= other_end
                for other_index, (other_start, other_end) in enumerate(spans)
                if other_index != index
            )
            if starts_inside_other:
                continue

            labels = annotation.get('labels')
            if labels == [target_label]:
                entities.append((start, end, labels[0]))

        if entities:  # Keep only the tasks that still have entities
            train_data.append((task.get('text'), {"entities": entities}))

    return train_data

# Call the function with the filename and the single label to keep.
# The relative part must not start with a backslash: os.path.join would otherwise
# drop the folders of ROOT_PATH and look for the file from the drive root.
data = import_label_studio_data(os.path.join(ROOT_PATH, r"data\training_json\data360.json"), "OUV")
data

In [None]:
import json
import numpy as np

def import_label_studio_data(filename, target_labels):
    """
    Import a Label Studio JSON export and return it in the format required for training,
    keeping only the annotations that carry at least one of the ``target_labels``.

    An annotation is discarded when its start offset lies inside the
    [start, end] range of another annotation of the same task, whatever the
    label of that other annotation (two annotations sharing the same start are
    therefore both discarded). Annotations without 'labels' are ignored.

    Args:
        filename: Path of the Label Studio JSON file.
        target_labels (list): Labels to keep.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        task that has at least one kept annotation; other tasks are left out.

    Raises:
        ValueError: If ``target_labels`` is not a list.
    """
    if not isinstance(target_labels, list):
        raise ValueError("The 'target_labels' argument must be a list of strings.")

    with open(filename, 'rb') as fp:
        tasks = json.load(fp)

    train_data = []
    for task in tasks:
        # `or []` covers a missing 'label' key as well as a null or empty list,
        # which previously crashed while indexing an empty array.
        annotations = task.get('label') or []
        spans = [(annotation.get('start'), annotation.get('end')) for annotation in annotations]

        entities = []
        for index, annotation in enumerate(annotations):
            start, end = spans[index]
            # Does this annotation start inside any *other* annotation of the task?
            starts_inside_other = any(
                other_start <= start <= other_end
                for other_index, (other_start, other_end) in enumerate(spans)
                if other_index != index
            )
            if starts_inside_other:
                continue

            # An annotation without 'labels' used to raise a TypeError here.
            labels = annotation.get('labels') or []
            if any(target in labels for target in target_labels):
                # NOTE(review): the first label of the annotation is stored, even when
                # the matching target is a later element — confirm this is intended.
                entities.append((start, end, labels[0]))

        if entities:  # Keep only the tasks that still have entities
            train_data.append((task.get('text'), {"entities": entities}))

    return train_data

# Labels to keep when building the training set (edit this list as needed).
target_labels = ["PPV"]

# Every label available in the annotations, for reference:
# all = ['OUV', 'INT', 'CAD', 'NOUV', 'NCAD', 'AG', 'AI', 'TOUS', 'AG OUV', 'AG INT', 'AG CAD', 'AI OUV', 'AI INT', 'AI CAD', 'NOUV AG', 'NCAD AG', 'NOUV AI', 'NCAD AI', 'ATOT',\
#        'ATOT OUV', 'ATOT INT', 'ATOT CAD', 'ATOT NOUV', 'PPV', 'PPVm', 'DATE']

data = import_label_studio_data(
    os.path.join(ROOT_PATH, r"data\training_json\data449.json"),
    target_labels,
)
data


****

### Finetune of SpaCy

In the first code block, we import the library needed to fine-tune spaCy and load a blank pipeline, which will be trained on our data.

In [None]:
import spacy

# Load a blank French ("fr") spaCy pipeline; it is used below to tokenize the texts: https://spacy.io/api/top-level
nlp = spacy.blank("fr")

In [None]:
# found on https://stackoverflow.com/questions/56642816/valueerror-e024-could-not-find-an-optimal-move-to-supervise-the-parser

import re

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Spans that contain nothing but white space (or are otherwise empty once
    trimmed) are dropped, since they would end up with inverted offsets.
    An end offset that runs past the text is clamped to the text length.
    The input data is not modified.

    Args:
        data (list): The data to be cleaned, as ``(text, {'entities': [(start, end, label), ...]})`` items.

    Returns:
        list: The cleaned data, as ``[text, {'entities': [[start, end, label], ...]}]`` items.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # An end offset past the text would make the indexing below fail.
            valid_end = min(end, len(text))
            # Both loops stop when the two offsets meet, so a whitespace-only span
            # collapses to an empty span instead of inverting its offsets.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

# Trim the white space around every annotated span of the dataset loaded earlier.
TRAIN_DATAS = trim_entity_spans(data)

In [None]:
import random

def split_dataset(dataset, train_percentage, val_percentage, seed=None):
    """
    Split a dataset into training, validation, and test sets based on the provided percentages.

    The dataset is shuffled on a copy, so the list passed by the caller keeps its
    order and calling the function again with the same seed returns the same split.

    Args:
    dataset (list): A list of tuples, where each tuple contains a text and its annotations.
    train_percentage (float): The fraction (between 0 and 1) of data to be allocated for training.
    val_percentage (float): The fraction (between 0 and 1) of data to be allocated for validation.
    seed (int) [Default: None]: Seed value for randomization. When None, the
        module-level random generator is used.

    Returns:
    tuple: A tuple containing the training dataset, validation dataset, and test dataset.
        The test dataset receives whatever remains after the two other sets.

    Raises:
    ValueError: If a fraction is negative or if the two fractions add up to more than 1.
    """
    # Absorbs float representation error, e.g. 0.7 + 0.2 == 0.8999999999999999.
    tolerance = 1e-9

    if (train_percentage < 0 or val_percentage < 0
            or train_percentage + val_percentage > 1 + tolerance):
        raise ValueError(
            "train_percentage and val_percentage must be non-negative fractions "
            "whose sum does not exceed 1."
        )

    # A dedicated generator gives the same permutation as seeding the global one,
    # without changing the global random state.
    rng = random if seed is None else random.Random(seed)

    shuffled = list(dataset)
    rng.shuffle(shuffled)

    # The tolerance stops a product such as 26.999999999999996 from being truncated
    # to 26, which would move one example from the validation set to the test set.
    train_index = int(len(shuffled) * train_percentage + tolerance)
    val_index = int(len(shuffled) * (train_percentage + val_percentage) + tolerance)

    train_data = shuffled[:train_index]
    val_data = shuffled[train_index:val_index]
    test_data = shuffled[val_index:]

    return train_data, val_data, test_data

# Example usage: 70% training, 20% validation, the remaining 10% for testing.
# A fixed seed makes the split reproducible after a kernel restart, so the
# training, validation and test sets always contain the same examples.
SPLIT_SEED = 42
train_data, val_data, test_data = split_dataset(TRAIN_DATAS, 0.7, 0.2, seed=SPLIT_SEED)

print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

Now we convert the annotated examples into spaCy's binary training format (`.spacy` files), which the training command uses to learn our labels.

In [None]:
from spacy.tokens import DocBin
from spacy.util import filter_spans

# Create the .spacy documents used to fine-tune the spaCy model.
# The training command reads train.spacy and val.spacy and the evaluation command reads
# test.spacy, so the three splits are all serialized here instead of the validation set only.

def build_doc_bin(examples):
    """Convert ``(text, {"entities": [(start, end, label), ...]})`` examples into a DocBin.

    Entities whose character offsets cannot be aligned with token boundaries are
    skipped (a message is printed), and overlapping spans are reduced with
    ``filter_spans`` before being assigned to the document.
    """
    doc_bin = DocBin()
    for text, annotations in examples:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            # "contract" shrinks the span to the tokens that lie fully inside the offsets.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)
    return doc_bin


output_dir = os.path.join(ROOT_PATH, r"model\unilabel\ATOT")

for split_name, split_examples in (("train", train_data), ("val", val_data), ("test", test_data)):
    doc_bin = build_doc_bin(split_examples)
    doc_bin.to_disk(os.path.join(output_dir, f"{split_name}.spacy"))

We now create the config file to train the model

In [None]:
# Complete the base config (first path) and write the full training config (second path).
# NOTE(review): the config is written under train_spacy\multilabel\nlp_V8, while the training
# cell reads model\unilabel\ATOT\config.cfg — confirm which location is intended.
!python -m spacy init fill-config C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\train_spacy\multilabel\nlp_V8\base_config.cfg C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\train_spacy\multilabel\nlp_V8\config.cfg

We can now train the model on our data. This operation can take a long time depending on the power of the computer (ideally, run it on your GPU if you possess a good one).

In [None]:
!python -m spacy train "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT\config.cfg" --output "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT" --paths.train "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT\train.spacy" --paths.dev "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT\val.spacy

We can evaluate the model

In [None]:
# Evaluate the "model-best" pipeline of the ATOT folder on the test.spacy file.
!python -m spacy evaluate "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT\model-best" "C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\unilabel\ATOT\test.spacy"

Normally, two folders have appeared: "model-best" and "model-last". Select "model-best" to test it on your dataset.

In [None]:
import spacy
import glob
from spacy import displacy

# Cleaned text files to annotate with the trained model.
# The relative part must not start with a backslash: os.path.join would otherwise
# drop the folders of ROOT_PATH and search from the drive root.
directory = glob.glob(os.path.join(ROOT_PATH, r"txt\txt_cleaned\*.txt"))

nlp_ner = spacy.load(os.path.join(ROOT_PATH, r"model\unilabel\PPVm\model-best"))  # select another model to test it

# Render the entities predicted for each file directly in the notebook.
for item in directory:
    with open(item, "r", encoding="utf-8") as file:
        text = file.read()

    doc = nlp_ner(text)
    displacy.render(doc, style="ent", jupyter=True)

In [None]:
import pandas as pd
import os

#function to write txt files from a list
def write_txt(liste, path):
    """Write each string of ``liste`` to its own UTF-8 text file inside ``path``.

    The files are named ``1.txt``, ``2.txt``, ... following the order of the list.
    The folder is created when it does not exist yet.

    Args:
        liste: Iterable of strings to write.
        path: Destination folder.
    """
    if path:
        os.makedirs(path, exist_ok=True)
    for i, item in enumerate(liste, start=1):
        # os.path.join replaces the hand-written "\" separator, which was an invalid
        # escape sequence inside the f-string and only worked on Windows.
        with open(os.path.join(path, f"{i}.txt"), "w", encoding="utf-8") as file:
            file.write(item)

def extract_txt_from_csv(path_csv, path_txt):
    """Read the "text" column of a CSV file and write each value to its own text file.

    Args:
        path_csv: Path of the CSV file; it must contain a "text" column.
        path_txt: Folder passed to ``write_txt`` to receive the generated files.
    """
    texts = pd.read_csv(path_csv)["text"].tolist()
    write_txt(texts, path_txt)

# Export the "text" column of data449_cats.csv as individual text files in txt\draft.
extract_txt_from_csv(os.path.join(ROOT_PATH, r"data\training_csv\data449_cats.csv"), os.path.join(ROOT_PATH,r"txt\draft"))

In [None]:
import spacy
import glob
import json


# Draft text files to annotate, and the trained pipeline used to predict their entities.
directory = glob.glob(os.path.join(ROOT_PATH, r"txt\draft\*.txt"))
nlp_ner = spacy.load(os.path.join(ROOT_PATH, r"unilabel\PPV_V2\model-best"))  # Select another model to test it

# One (text, {"label": [...]}) pair is collected per input file.
output_data = []

for item in directory:
    with open(item, "r", encoding="utf-8") as file:
        text = file.read()

    doc = nlp_ner(text)

    # Character offsets and label name of every predicted entity.
    labeled_data = []
    for ent in doc.ents:
        labeled_data.append(
            {"start": ent.start_char, "end": ent.end_char, "labels": ent.label_}
        )

    output_data.append((text, {"label": labeled_data}))

# Persist all predictions in a single JSON file.
predictions_path = os.path.join(ROOT_PATH, r"data\predicted_json\labeled_data_PPV.json")
with open(predictions_path, "w", encoding="utf-8") as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

In [None]:
# Reload the predictions JSON file (labeled_data_PPV.json) and display its content.
with open(os.path.join(ROOT_PATH,r"data\predicted_json\labeled_data_PPV.json"), "r", encoding="utf-8") as json_file:
    a= json.load(json_file)

a