# Finetuning Spacy

We fine-tune a spaCy model because spaCy already ships with a pipeline component for named-entity recognition (NER). This notebook is based on the following tutorial: https://www.freecodecamp.org/news/how-to-fine-tune-spacy-for-nlp-use-cases/

In [None]:
import json
import numpy as np
import os
import spacy
import re
import random
from spacy.tokens import DocBin
from spacy.util import filter_spans

In [None]:
# Project root, given relative to the current working directory.
# The data file and the per-label model directories are joined onto this path.
ROOT_PATH = r".."

In [None]:
# https://www.kaggle.com/code/kiruthigaa/ner-model-train-test-using-spacy-label-studio

def import_label_studio_data(filename):
    """
    Load a Label Studio JSON export and convert it to spaCy's training format.

    Each task in the export is expected to be a dict with a 'text' entry and an
    optional 'label' entry holding a list of annotations; every annotation
    carries 'start', 'end' and 'labels' (a list whose first item is used).

    An annotation is discarded when its start offset falls inside the
    [start, end] interval (end included) of any other annotation of the same
    task, so that the remaining entities do not overlap. Annotations without
    any label are ignored, but they still take part in the overlap check.

    Args:
        filename (str): Path to the Label Studio JSON export.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per task. Tasks with a missing or empty 'label' list yield an empty
        entity list.
    """
    # Binary mode lets json.load detect the UTF-8/16/32 encoding by itself.
    with open(filename, 'rb') as fp:
        tasks = json.load(fp)

    train_data = []
    for task in tasks:
        # A missing, null or empty 'label' list all mean "no annotations".
        annotations = task.get('label') or []
        spans = [(ann.get('start'), ann.get('end')) for ann in annotations]

        entities = []
        for idx, ann in enumerate(annotations):
            start, end = spans[idx]
            starts_inside_other = any(
                other_start <= start <= other_end
                for pos, (other_start, other_end) in enumerate(spans)
                if pos != idx
            )
            if starts_inside_other:
                continue

            labels = ann.get('labels')
            if not labels:
                # Nothing to tag this span with.
                continue
            entities.append((start, end, labels[0]))

        train_data.append((task.get('text'), {"entities": entities}))
    return train_data

# found on https://stackoverflow.com/questions/56642816/valueerror-e024-could-not-find-an-optimal-move-to-supervise-the-parser


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Each span is shrunk from both sides until it starts and ends on a
    non-whitespace character. A span that contains nothing but whitespace
    is dropped, because it would otherwise end up with an empty or inverted
    (start >= end) range.

    Args:
        data (list): The data to be cleaned, as ``(text, {'entities':
            [(start, end, label), ...]})`` pairs.

    Returns:
        list: The cleaned data, as ``[text, {'entities': [[start, end,
        label], ...]}]`` lists. The input is not modified.
    """
    whitespace = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Move the start forward past leading whitespace, without
            # running past the span's end or the end of the text.
            while (valid_start < valid_end and valid_start < len(text)
                   and whitespace.match(text[valid_start])):
                valid_start += 1
            # Move the end backward past trailing whitespace, but never
            # before the (already trimmed) start of this span.
            while valid_end > valid_start and whitespace.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_end <= valid_start:
                # Whitespace-only span: nothing left to annotate.
                continue
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

def split_dataset(dataset, train_percentage, val_percentage, seed=None):
    """
    Split a dataset into training, validation, and test sets based on the provided fractions.

    The dataset is shuffled on a copy with a private random generator, so
    neither the caller's list nor the global ``random`` state is modified.
    Whatever is left after the training and validation shares goes to the
    test set.

    Args:
    dataset (list): A list of examples, e.g. (text, annotations) pairs.
    train_percentage (float): Fraction (between 0 and 1) of the data allocated for training.
    val_percentage (float): Fraction (between 0 and 1) of the data allocated for validation.
    seed (int) [Default: None]: Seed value for a reproducible shuffle.

    Returns:
    tuple: A tuple containing the training dataset, validation dataset, and test dataset.
    """
    rng = random.Random(seed)
    shuffled = list(dataset)
    rng.shuffle(shuffled)

    total = len(shuffled)
    # The small epsilon absorbs binary floating-point error before truncating:
    # 0.7 + 0.2 evaluates to 0.8999999999999999, which would otherwise turn
    # 10 * 0.9 into 8 instead of 9.
    epsilon = 1e-9
    train_index = int(total * train_percentage + epsilon)
    val_index = int(total * (train_percentage + val_percentage) + epsilon)

    train_data = shuffled[:train_index]
    val_data = shuffled[train_index:val_index]
    test_data = shuffled[val_index:]

    return train_data, val_data, test_data

def create_spacy_model(data, name_file_model, output_dir=None):
    """Serialize annotated examples to a spaCy ``DocBin`` file on disk.

    Despite its name, this function does not build or train a model: it
    tokenizes every text with a blank French pipeline, attaches the entity
    spans that can be aligned to token boundaries, and writes the resulting
    documents to ``<output_dir>/<name_file_model>``.

    Args:
        data (list): Examples whose first item is the text and whose second
            item is a dict with an "entities" list of (start, end, label).
        name_file_model (str): File name of the output, e.g. "train.spacy".
        output_dir (str) [Default: None]: Directory to write into. When
            omitted, the notebook-level ``MODEL_PATH`` variable is used, so
            existing two-argument calls keep working.

    Returns:
        None
    """
    # Resolve the global lazily: it only has to exist when no directory is given.
    target_dir = MODEL_PATH if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Blank pipeline (tokenizer only), see https://spacy.io/api/top-level
    nlp = spacy.blank("fr")
    doc_bin = DocBin()

    for item in data:
        text = item[0]
        labels = item[1]["entities"]
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            # char_span returns None when the offsets cannot be mapped onto
            # tokens; such entities are skipped silently.
            if span is not None:
                ents.append(span)
        # doc.ents rejects overlapping spans, so overlaps are filtered out first.
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    doc_bin.to_disk(os.path.join(target_dir, name_file_model))

****

### Finetune of SpaCy

In the first code block, we import all the libraries needed to fine-tune spaCy, load a blank pipeline, and load our data.

In [None]:
all = ['OUV', 'INT', 'CAD', 'NOUV', 'NCAD', 'AG', 'AI', 'TOUS', 'AG OUV', 'AG INT', 'AG CAD', 'AI OUV', 'AI INT', 'AI CAD', 'NOUV AG', 'NCAD AG', 'NOUV AI', 'NCAD AI', 'ATOT',\
        'ATOT OUV', 'ATOT INT', 'ATOT CAD', 'PPV', 'PPVm', 'DATE']

for LABEL in all : 
    print(f"Training LABEL = {LABEL}")
    MODEL_PATH=os.path.join(ROOT_PATH,"models/NER",f"{LABEL.replace(' ','_')}/")
    !mkdir -p {MODEL_PATH} 2> /dev/null

    data = import_label_studio_data(os.path.join(ROOT_PATH, r"data/raw/data449.json"))
    TRAIN_DATAS = trim_entity_spans(data)
    train_data, val_data, test_data = split_dataset(TRAIN_DATAS, 0.7, 0.2)
    
    print(f"Training data size: {len(train_data)}")
    print(f"Validation data size: {len(val_data)}")
    print(f"Test data size: {len(test_data)}")
    
    # We create the document to finetune spacy model
    
    create_spacy_model(train_data,"train.spacy")
    create_spacy_model(val_data,"val.spacy")
    create_spacy_model(test_data,"test.spacy")

    !python -m spacy init config  --lang fr --pipeline ner {MODEL_PATH}\config.cfg
    !python -m spacy train {MODEL_PATH}\config.cfg --output {MODEL_PATH} --paths.train {MODEL_PATH}\train.spacy --paths.dev {MODEL_PATH}\val.spacy