# Spacy Text Categorizer

The objective here is to build a French-language text categorizer able to distinguish the different kinds of wage-agreement components.

Some inspiring sources:
- https://medium.com/@johnidouglasmarangon/building-a-text-classification-model-with-spacy-3-x-57e59fa50547
- https://www.machinelearningplus.com/nlp/custom-text-classification-spacy/
- https://www.width.ai/post/spacy-text-classification

We first need to convert the data exported from Label Studio into the format required by spaCy's TextCategorizer.

In [5]:
import json
import numpy as np

def import_label_studio_data(filename, target_labels):
    """
    Load a Label Studio JSON export and return spaCy-style span annotations.

    The file is expected to contain a list of tasks. Each task is a dict with
    a 'text' entry and an optional 'label' entry holding a list of spans; each
    span is a dict with 'start', 'end' and 'labels' entries.

    A span is discarded when its start offset falls inside another span of the
    same task (bounds inclusive). Among the remaining spans, only those
    carrying at least one label from ``target_labels`` are kept, and each is
    recorded under the first label of the span. Tasks that end up without any
    span are left out of the result.

    Args:
        filename: Path to the Label Studio JSON export.
        target_labels: List of label names to keep.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.

    Raises:
        ValueError: If ``target_labels`` is not a list.
    """
    if not isinstance(target_labels, list):
        raise ValueError("The 'target_labels' argument must be a list of strings.")

    with open(filename, 'rb') as fp:
        tasks = json.load(fp)

    train_data = []
    for task in tasks:
        # A missing or empty 'label' entry simply means there is nothing to
        # extract (an empty list must not be treated as an error).
        spans = task.get('label') or []
        bounds = [(span.get('start'), span.get('end')) for span in spans]

        entities = []
        for idx, span in enumerate(spans):
            start = bounds[idx][0]
            # Overlap rule: drop the span when its start lies within the
            # [start, end] range of any *other* span of the same task.
            starts_inside_other = any(
                low <= start <= high
                for other, (low, high) in enumerate(bounds)
                if other != idx
            )
            if starts_inside_other:
                continue

            span_labels = span.get('labels') or []
            if any(target in span_labels for target in target_labels):
                entities.append((start, span.get('end'), span_labels[0]))

        if entities:  # Keep only tasks with at least one retained span
            train_data.append((task.get('text'), {"entities": entities}))

    return train_data

# Full list of annotation labels. It is deliberately NOT bound to the name
# `all`: doing so would shadow the built-in all() for the rest of the session.
ALL_LABELS = [
    'OUV', 'INT', 'CAD', 'NOUV', 'NCAD', 'AG', 'AI', 'TOUS',
    'AG OUV', 'AG INT', 'AG CAD', 'AI OUV', 'AI INT', 'AI CAD',
    'NOUV AG', 'NCAD AG', 'NOUV AI', 'NCAD AI', 'ATOT',
    'ATOT OUV', 'ATOT INT', 'ATOT CAD', 'PPV', 'PPVm', 'DATE',
]

# Labels to keep when importing; by default every label is kept.
# Replace this with a subset of ALL_LABELS to restrict the import.
target_labels = list(ALL_LABELS)

# Call the function with the filename and a list of target labels
data = import_label_studio_data(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\data\training_json\data449.json", target_labels)
data

[('evolution des salaires de base : enveloppe budgétaire : il est convenu entre les parties que l’enveloppe budgétaire consacrée à l’évolution des salaires de base des collaborateurs répondant aux attentes du poste, et remplissant par ailleurs les autres conditions habituelles d’éligibilité à une augmentation, représentera 5,8% de la masse salariale. de fait, les primes exprimées en pourcentage du salaire de base augmenteront ainsi en même temps que le salaire des collaborateurs concernés. pour rappel, conformément à la politique salariale d’eli lilly & cie, le superviseur prend une décision quant à l’évolution du salaire de base de son collaborateur dans son échelle, en prenant en considération : sa performance sur l’année sa performance dans la durée* la performance des collègues situés au même niveau de poste le rapport entre sa position dans l’échelle de salaire et sa performance dans la durée (*à partir de 2023, un collaborateur ne répondant pas aux attentes ( non éligibles)  pour

In [6]:
import pandas as pd

def spacy_to_dataframe(data):
    """
    Turn a sequence of (text, annotation) pairs into a two-column DataFrame.

    Args:
        data: Iterable of 2-item records, in the format returned by the
            import_label_studio_data function.

    Returns:
        A pandas DataFrame with a "text" column (first item of each record)
        and a "label" column (second item of each record).
    """
    texts = []
    annotations = []
    for text, annotation in data:
        texts.append(text)
        annotations.append(annotation)

    return pd.DataFrame({'text': texts, 'label': annotations})

# Build the text/label DataFrame from the imported annotations and preview it
df = spacy_to_dataframe(data)
df.head()

Unnamed: 0,text,label
0,evolution des salaires de base : enveloppe bud...,"{'entities': [(322, 326, 'ATOT'), (161, 179, '..."
1,l’enveloppe globale d’augmentation des rémunér...,"{'entities': [(229, 237, 'OUV'), (239, 247, 'O..."
2,dispositions au regard de l’implication de tou...,"{'entities': [(549, 553, 'AI'), (1383, 1412, '..."
3,nous travaillons sur une politique de rémunéra...,"{'entities': [(835, 864, 'PPV'), (868, 872, 'P..."
4,protocole d’accord négociation annuelle obliga...,"{'entities': [(1120, 1142, 'TOUS'), (1269, 127..."


In [10]:
def dummy_label(df, target_label="OUV"):
    """
    Add a binary "label_dummy" column flagging rows that contain a given entity label.

    A row gets 1 when at least one of its entities carries exactly
    ``target_label`` (a composite label such as "AG OUV" does not match
    "OUV"), and 0 otherwise. The DataFrame is modified in place and also
    returned. The value counts of the new column are printed.

    Args:
        df (DataFrame): Frame whose "label" column holds dicts of the form
            {"entities": [(start, end, label), ...]}.
        target_label (str): Entity label to look for. Defaults to "OUV".

    Returns:
        DataFrame: The same ``df`` object, with the "label_dummy" column added.
    """
    def has_target(annotation):
        # The entity label is the third item of each (start, end, label) tuple.
        return any(entity[2] == target_label for entity in annotation["entities"])

    df["label_dummy"] = df["label"].map(has_target).astype("int64")

    # Print the class balance to verify the changes
    print(df["label_dummy"].value_counts())
    return df

# Add the binary "label_dummy" column to the DataFrame
df = dummy_label(df)

[(322, 326, 'ATOT'), (161, 179, 'TOUS'), (1820, 1833, 'DATE')]
[(229, 237, 'OUV'), (239, 247, 'OUV'), (205, 210, 'AI OUV'), (355, 361, 'CAD'), (330, 336, 'AI CAD'), (413, 421, 'OUV'), (423, 431, 'OUV'), (519, 524, 'AG OUV'), (251, 277, 'OUV'), (280, 285, 'AI INT'), (304, 327, 'INT'), (435, 461, 'OUV'), (839, 862, 'INT'), (920, 925, 'AG INT'), (1240, 1246, 'CAD'), (1304, 1310, 'AG CAD'), (1699, 1707, 'OUV'), (1689, 1697, 'OUV'), (1709, 1735, 'OUV'), (1739, 1762, 'INT'), (1905, 1911, 'CAD'), (2102, 2107, 'AG'), (172, 188, 'DATE')]
[(549, 553, 'AI'), (1383, 1412, 'PPV'), (228, 238, 'DATE')]
[(835, 864, 'PPV'), (868, 872, 'PPVm'), (1105, 1115, 'DATE'), (717, 719, 'AG'), (1787, 1796, 'NCAD')]
[(1120, 1142, 'TOUS'), (1269, 1273, 'AG'), (1278, 1290, 'DATE'), (1331, 1335, 'AG')]
[(708, 710, 'NCAD'), (842, 844, 'NCAD AG'), (1002, 1004, 'AG CAD')]
[(38, 67, 'PPV'), (458, 484, 'PPV'), (520, 525, 'PPVm'), (674, 679, 'NCAD AG'), (769, 785, 'DATE'), (1553, 1555, 'AI CAD'), (830, 833, 'NCAD AG'), (10

In [22]:
# Cleaning step
def clean_dataset(data):
    """
    Drop incomplete rows and, when present, promote "label_dummy" to "label".

    Rows containing any missing value are removed. If the frame has a
    "label_dummy" column, the existing "label" column is dropped and
    "label_dummy" takes over its name. All changes are made in place; the
    first rows are printed and the same frame is returned.

    Args:
        data (DataFrame): The DataFrame containing the text, label and
            (optionally) label_dummy columns.

    Returns:
        DataFrame: The same ``data`` object after cleaning.
    """
    data.dropna(axis="index", how="any", inplace=True)

    has_dummy = "label_dummy" in data.columns
    if has_dummy:
        # Replace the raw annotations with the binary target
        data.drop(columns=["label"], inplace=True)
        data.rename(columns={"label_dummy": "label"}, inplace=True)

    print(data.head())
    return data

# Clean the DataFrame, then export it to CSV without the index column
df = clean_dataset(df)
df.to_csv(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\data\training_csv\data449_cleaned.csv", index=False)

                                                text  label
0  l’enveloppe globale d’augmentation des rémunér...      0
1  salariés cadres  il est rappelé que les salari...      0
2  nao. informations relatives à la négociation  ...      0
3  une prime de salissure pour les soudeurs et mé...      0
4  negociation annuelle obligatoire. titre i - re...      0


In [23]:
# Invert the binary label (originally meant to give 0 for PPV and 1 for the rest)
def inverse_label(data):
    """
    Flip the values of the "label" column in place.

    A value equal to 1 becomes 0; every other value becomes 1. The value
    counts of the flipped column are printed.

    Args:
        data (DataFrame): The DataFrame containing the text and label columns.

    Returns:
        DataFrame: The same ``data`` object with its "label" column flipped.
    """
    def flip(value):
        if value == 1:
            return 0
        return 1

    flipped = data["label"].apply(flip)
    data["label"] = flipped
    print(data["label"].value_counts())
    return data

# df = inverse_label(df)
# df.head()

In [24]:
# Create the dataset
# Rows are shuffled (sample(frac=1), no random_state given) and converted to
# plain (text, label) tuples.
dataset = list(df[["text", "label"]].sample(frac=1).itertuples(index=False, name=None))
dataset

[("rémunérations pour le personnel ouvrier  dans les mêmes dispositions que la nao nationale, concernant l’ensemble des salariés de la sade, la prise en charge de la mutuelle “mieux etre” par l’employeur passe de 35€ à 65€ par mois pour les salariés du régime général, de tel sorte que la cotisation d’un salarié isolé soit égale à 0. cela représente en moyenne une augmentation annuelle de 1.5% pour un ouvrier. l’enveloppe globale allouée aux augmentations des ouvriers (hors alternants) et entrés à la sade normandie avant le 1er juillet 2022, pour l’année 2023, hors promotions individuelles impliquant un changement significatif de responsabilités, se décompose de la façon suivante\xa0: pour le personnel ouvrier\xa0: une augmentation générale de 3.20% une enveloppe d'augmentations individuelles de 0.80%  les augmentations individuelles sont octroyées, sur proposition hiérarchique, en fonction des compétences individuelles du salarié, de son implication et de sa performance personnelle. le

Now that our dataset is cleaned, we can split it into three parts: the training, validation, and test sets.

In [25]:
import numpy as np

def split_data(data, train_ratio=0.75, val_ratio=0.15, test_ratio=0.10, random_seed=None):
    """
    Shuffle a dataset of (text, label) records and split it into three sets.

    Parameters:
    - data: Sequence of 2-item records (text, label). It must support len()
      and integer indexing.
    - train_ratio: The ratio of data to be allocated to the training set (default: 0.75).
    - val_ratio: The ratio of data to be allocated to the validation set (default: 0.15).
    - test_ratio: Kept for interface compatibility; it is not used in the
      computation. The test set always receives whatever remains after the
      training and validation sets have been taken.
    - random_seed: Seed for the random shuffling (default: None, which results in
      non-reproducible shuffling). When given, it seeds NumPy's global RNG.

    Returns:
    - A tuple containing three lists: (train_set, val_set, test_set). Every
      record is returned as a (str, int) tuple.
    """
    # Calculate the total size of the dataset
    total_size = len(data)

    # Sizes are truncated; the remainder goes to the test set
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)

    # Set the random seed if provided
    if random_seed is not None:
        np.random.seed(random_seed)

    # Shuffle positions rather than the records themselves: feeding the
    # records to NumPy would turn them into a fixed-width string array, which
    # stringifies the labels and pads every text to the longest one.
    order = np.random.permutation(total_size)
    shuffled_data = [(str(data[i][0]), int(data[i][1])) for i in order]

    # Split the data into three sets
    val_end = train_size + val_size
    train_set = shuffled_data[:train_size]
    val_set = shuffled_data[train_size:val_end]
    test_set = shuffled_data[val_end:]

    # Print the size of each set
    print("Training set size:", len(train_set))
    print("Validation set size:", len(val_set))
    print("Test set size:", len(test_set))

    return train_set, val_set, test_set

# Example usage:
# No random_seed is passed, so the split differs from one run to the next.
train_data, val_data, test_data = split_data(dataset)

Training set size: 114
Validation set size: 22
Test set size: 17


spaCy needs binary `.spacy` files (serialized `Doc` objects) for training, so we need to convert the data to this format.

We build the model to obtain a classifier

In [26]:
import spacy

# We create an empty model
nlp = spacy.blank("fr")

# We add the text classifier to the pipeline
# (left commented out: the blank pipeline below is only used to create Docs;
# no component is added here, so pipe_names prints an empty list)
# textcat = nlp.add_pipe("tok2vec")
# textcat = nlp.add_pipe("textcat")
# textcat.add_label("PPV")
# textcat.add_label("NPPV")

print(nlp.pipe_names)

[]


In [27]:
from spacy.tokens import DocBin

def convert(data, outfile):
    """
    Serialize (text, label) records into a spaCy DocBin file.

    Each text is run through the module-level ``nlp`` pipeline and the
    resulting Doc receives two categories: "PPV" is True when the label
    equals 0 and "NPPV" is True when the label equals 1.

    NOTE(review): elsewhere in this notebook the label 1 marks rows containing
    an "OUV" entity, so "NPPV" is set for those rows - confirm that these
    category names are the intended ones.

    Args:
        data: Iterable of (text, label) tuples.
        outfile: Path of the DocBin file to write.
    """
    doc_bin = DocBin()

    annotated = nlp.pipe(data, as_tuples=True)
    for parsed, target in annotated:
        is_ppv = target == 0
        is_nppv = target == 1
        parsed.cats["PPV"] = is_ppv
        parsed.cats["NPPV"] = is_nppv
        doc_bin.add(parsed)

    doc_bin.to_disk(outfile)
    print("Data saved to:", outfile)

# Write the three splits as .spacy files in the OUV classifier folder
convert(train_data, r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\train.spacy")
convert(val_data, r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\val.spacy")
convert(test_data, r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\test.spacy")


Data saved to: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\train.spacy
Data saved to: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\val.spacy
Data saved to: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\test.spacy


In [28]:
# Generate a French textcat config optimized for efficiency.
# NOTE(review): the config is written to the ...\classifyer\PPV\ folder, whereas the
# training command in this notebook reads ...\classifyer\OUV\config.cfg - confirm which folder is intended.
!python -m spacy init config --lang fr --pipeline textcat --optimize efficiency --force C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\PPV\config.cfg 

⚠ To generate a more effective transformer-based config (GPU-only), install the
spacy-transformers package and re-run this command. The config generated now
does not use transformers.
ℹ Generated config template specific for your use case
- Language: fr
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\PPV\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [29]:
# Train with the OUV config on train.spacy, validating on val.spacy; the pipeline is saved to the OUV folder.
!python -m spacy train C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\config.cfg --paths.train C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\train.spacy  --paths.dev C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\val.spacy --output C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV --verbose

ℹ Saving to output directory:
C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['textcat']
ℹ Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       50.00    0.50
  1     200          0.13       50.00    0.50
  3     400          0.00       50.00    0.50
  5     600          0.00       50.00    0.50
  7     800          0.00       50.00    0.50
  8    1000          0.00       50.00    0.50
 10    1200          0.00       50.00    0.50
 12    1400          0.00       50.00    0.50
 14    1600          0.00       50.00    0.50
✔ Saved pipeline to output directory
C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\model-last


[2023-10-02 14:37:54,802] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2023-10-02 14:38:16,224] [INFO] Set up nlp object from config
[2023-10-02 14:38:16,270] [DEBUG] Loading corpus from path: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\val.spacy
[2023-10-02 14:38:16,279] [DEBUG] Loading corpus from path: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\train.spacy
[2023-10-02 14:38:16,279] [INFO] Pipeline: ['textcat']
[2023-10-02 14:38:16,293] [INFO] Created vocabulary
[2023-10-02 14:38:16,294] [INFO] Finished initializing nlp object
[2023-10-02 14:38:23,044] [INFO] Initialized pipeline components: ['textcat']
[2023-10-02 14:38:23,090] [DEBUG] Loading corpus from path: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\val.spacy
[2023-10-02 14:38:23,097] [DEBUG] Loading corpus from path: C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\train.spacy
[2023-10-02 

In [30]:
# Evaluate OUV\model-best on the test.spacy file
!python -m spacy evaluate C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\model-best C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\OUV\test.spacy

ℹ Using CPU
[1m

TOK                 100.00
TEXTCAT (macro F)   50.00 
SPEED               47770 

[1m

            P        R        F
PPV    100.00   100.00   100.00
NPPV     0.00     0.00     0.00

[1m

       ROC AUC
PPV       None
NPPV      None



Test

In [31]:
# One hand-written French sentence used as a quick manual check
texts = ["Le contrat ne contient pas de prime de partage de la valeur ajoutée"]

In [32]:
# NOTE(review): the model is loaded from the ...\classifyer\PPV\ folder, while the
# training and evaluation commands in this notebook use ...\classifyer\OUV\ - confirm the path.
nlp = spacy.load(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\PPV\model-best")

# Print the predicted category scores next to each text
for text in texts:
    doc = nlp(text)
    print(doc.cats,  "-",  text)

{'PPV': 0.8197599053382874, 'NPPV': 0.18024009466171265} - Le contrat ne contient pas de prime de partage de la valeur ajoutée


## Demo

Demonstration: we test the model's ability to sort a set of documents.

In [33]:
import pandas as pd
import spacy

# Load the model
# NOTE(review): loaded from the ...\classifyer\PPV\ folder, while training in this
# notebook writes to ...\classifyer\OUV\ - confirm the path.
nlp = spacy.load(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\model\classifyer\PPV\model-best")

# Load the data
# This is the cleaned CSV exported earlier in the notebook; `data` is rebound to this DataFrame.
data = pd.read_csv(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\data\training_csv\data449_cleaned.csv")
data.head()

Unnamed: 0,text,label
0,l’enveloppe globale d’augmentation des rémunér...,0
1,salariés cadres il est rappelé que les salari...,0
2,nao. informations relatives à la négociation ...,0
3,une prime de salissure pour les soudeurs et mé...,0
4,negociation annuelle obligatoire. titre i - re...,0


In [34]:
# Score every text of the CSV and print the category scores next to it
texts = data["text"].tolist()

for text in texts:
    doc = nlp(text)
    print(doc.cats,  "-",  text)

{'PPV': 1.5559260646114126e-05, 'NPPV': 0.9999843835830688} - l’enveloppe globale d’augmentation des rémunérations 2023 est fixée à 5,6 %.article i - : augmentations individuelles l’enveloppe des augmentations individuelles, fixée au 1er janvier 2023, correspond à : 0,5 % pour les salariés employés, ouvriers et maîtrises de qualification ; 1,1 % pour les salariés maîtrises d’encadrement ; 1,35 % pour les salariés cadres. article ii – : augmentations collectives ii – a : employés, ouvriers et maîtrises de qualification les augmentations collectives, au titre de 2023, sont de 5,1 %, décomposées ainsi : 0,35 % au titre de l’ancienneté, résultant des dispositions de l’article 45 de la convention collective inter-entreprises du 1er juin 1979 ; 3,9 % au titre de l’augmentation de la valeur du point, qui sera portée à 6,9157 € ; 0,85 % au titre de la revalorisation de l’indice de base. ii – b : maîtrises d’encadrement les augmentations collectives, au titre de 2023, sont de 4,5 %, décomposées

In [35]:
# Sort the texts into two buckets according to the higher of the two scores;
# a missing score counts as 0.0 and ties go to the NPPV bucket.
ppv_list, nppv_list = [], []

for text in texts:
    scores = nlp(text).cats
    ppv_wins = scores.get("PPV", 0.0) > scores.get("NPPV", 0.0)
    bucket = ppv_list if ppv_wins else nppv_list
    bucket.append(text)

In [36]:
# One single-column frame per bucket
df_ppv = pd.DataFrame(ppv_list, columns=["PPV"])
df_nppv = pd.DataFrame(nppv_list, columns=["NPPV"])

# Side-by-side concatenation aligns the two frames on their row index only:
# texts on the same row are unrelated, and the shorter column is padded with NaN.
df_data = pd.concat([df_ppv, df_nppv], axis=1)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PPV     21 non-null     object
 1   NPPV    132 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB


In [37]:
# Preview the first rows of the two buckets
df_data.head()

Unnamed: 0,PPV,NPPV
0,accord. article 3 – dispositions salariales le...,l’enveloppe globale d’augmentation des rémunér...
1,negociation annuelle obligatoire. 1. remunerat...,salariés cadres il est rappelé que les salari...
2,negociation annuelle 2023. il a été convenu et...,nao. informations relatives à la négociation ...
3,negociation annuelle obligatoire 2023 2. augm...,une prime de salissure pour les soudeurs et mé...
4,il est convenu ce qui suit avec les organisati...,negociation annuelle obligatoire. titre i - re...


In [15]:
import sys
# Make the helper script folder importable
sys.path.append(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\script")
# NOTE(review): import_label_studio_data, spacy_to_dataframe and clean_dataset imported here
# replace the functions of the same names defined earlier in this notebook - confirm they behave the same.
from confusion_matrix_spacy_def import print_statistics, import_label_studio_data, spacy_to_dataframe, dummy_label_true, clean_dataset, dummy_label_pred

# Ground-truth side: annotations from the Label Studio export
true_data = import_label_studio_data(r"C:\Users\garsonj\Desktop\spacy_finetuning\spacy_files\data\training_json\data449.json", target_labels)
df_true = spacy_to_dataframe(true_data)
df_true = dummy_label_true(df_true)
df_true = clean_dataset(df_true)

# for the predicted json
# NOTE(review): the path below is an empty string, so open() cannot succeed as written -
# fill in the path of the predictions JSON file.
with open(r"", "r", encoding="utf-8") as f:
    data_pred = json.load(f)

df_pred = spacy_to_dataframe(data_pred)
df_pred = dummy_label_pred(df_pred)
df_pred = clean_dataset(df_pred)


