# Confusion Matrix Code

In [18]:
import spacy
import json
import numpy as np
import pandas as pd
import os

## True dataset

We build the ground-truth dataset from the Label Studio annotation export.

In [19]:
# thanks to https://www.kaggle.com/code/kiruthigaa/ner-model-train-test-using-spacy-label-studio#Convert-Label-studio-data-to-spacy-NER-format

def import_label_studio_data(filename):
    """
    Convert a Label Studio JSON export into spaCy-style NER tuples.

    Each record of the export contributes one ``(text, {"entities": [...]})``
    tuple, where every entity is ``(start, end, first_label)``. An annotation
    is skipped when its start offset falls inside another annotation of the
    same record (bounds inclusive), or when it carries no ``labels`` value.

    Args:
        filename: Path to a JSON file holding a list of records; each record
            is read through its ``text`` and (optional) ``label`` keys.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record, in file order.
    """
    with open(filename, 'rb') as fp:
        records = json.load(fp)

    train_data = []
    for record in records:
        entities = []
        # `or []` covers a missing "label" key, an explicit null and an empty
        # list alike, so records without annotations yield no entities.
        annotations = record.get('label') or []
        spans = [(ann.get('start'), ann.get('end')) for ann in annotations]

        for idx, ann in enumerate(annotations):
            start = spans[idx][0]
            # The upper bound is inclusive (end + 1): an annotation that starts
            # exactly where another one ends is treated as overlapping.
            starts_inside_other = any(
                start in range(other_start, other_end + 1)
                for other_idx, (other_start, other_end) in enumerate(spans)
                if other_idx != idx
            )
            labels = ann.get('labels')
            # Skip overlapping annotations and annotations without any label.
            if starts_inside_other or not labels:
                continue
            entities.append((ann.get('start'), ann.get('end'), labels[0]))

        train_data.append((record.get('text'), {"entities": entities}))
    return train_data

In [20]:
# Ground-truth annotations, converted to spaCy-style (text, entities) tuples.
true_data_path = r"../data/raw/data449.json"
data_true = import_label_studio_data(true_data_path)

In [21]:
# We turn the data into a DataFrame with one row per text; the 0/1 indicator
# (1 when the target label is present, 0 otherwise) is added in a later cell.

def spacy_to_dataframe(data):
    """
    Turn a sequence of ``(text, label)`` pairs into a two-column DataFrame.

    Args:
        data: Iterable of 2-item pairs, such as the output of
            import_label_studio_data. The first item of each pair goes to the
            "text" column and the second to the "label" column.

    Returns:
        A pandas DataFrame with the columns "text" and "label", one row per
        pair, in input order.
    """
    texts = []
    annotations = []
    for text, annotation in data:
        texts.append(text)
        annotations.append(annotation)

    return pd.DataFrame({'text': texts, 'label': annotations})

# Ground-truth frame: one row per annotated text, with its raw entities dict.
df_true = spacy_to_dataframe(data_true)
df_true.tail(n=5)

Unnamed: 0,text,label
444,negociation annuelle 2022. il a été convenu et...,"{'entities': [(98, 106, 'OUV'), (325, 329, 'AG..."
445,négociations annuelles obligatoires. ii- dispo...,"{'entities': [(217, 221, 'AG'), (258, 287, 'TO..."
446,"accord collectif 2022 sur les salaires, la dur...","{'entities': [(192, 198, 'ENT'), (271, 280, 'D..."
447,damart sa etablissement. article i : augmentat...,"{'entities': [(0, 6, 'ENT'), (114, 122, 'OUV')..."
448,"entre l’ues kiabi, représentée par, directeur...","{'entities': [(13, 18, 'ENT'), (37, 70, 'DIR')..."


In [22]:
def dummy_label(df, target_label="PPV"):
    """
    Add a 0/1 "label_dummy" column flagging rows that contain the target label.

    A row is flagged with 1 when at least one entity in its
    ``label["entities"]`` list has ``target_label`` as third element (the
    entity tuples are ``(start, end, label)``), and with 0 otherwise.

    Args:
        df (DataFrame): Frame with a "label" column whose values are dicts
            holding an "entities" list. It is modified in place.
        target_label (str): Entity label to look for. Defaults to "PPV".

    Returns:
        DataFrame: The same ``df`` object, with the "label_dummy" column added
        (or overwritten).
    """
    def has_target(annotation):
        # `annotation` is the dict stored in the "label" column.
        return any(entity[2] == target_label for entity in annotation["entities"])

    # Computed row by row on the column values rather than through index
    # labels, so rows that share an index label are still flagged individually.
    df["label_dummy"] = df["label"].map(has_target).astype("int64")

    # Print the class balance to verify the result
    print(df["label_dummy"].value_counts())
    return df

def clean_dataset(data):
    """
    Drop incomplete rows and promote "label_dummy" to be the "label" column.

    Rows containing any missing value are removed. When a "label_dummy" column
    exists, the original "label" column is dropped and "label_dummy" is renamed
    to "label"; otherwise the columns are left untouched. The first rows of the
    result are printed.

    Args:
        data (DataFrame): Frame with the text, label and (optionally)
            label_dummy columns. It is modified in place.

    Returns:
        DataFrame: The same ``data`` object after cleaning.
    """
    data.dropna(axis=0, how='any', inplace=True)

    has_dummy_column = 'label_dummy' in data.columns
    if has_dummy_column:
        # Swap the raw annotations for the 0/1 indicator under the same name.
        data.drop(columns="label", inplace=True)
        data.rename(columns={"label_dummy": "label"}, inplace=True)

    print(data.head())
    return data

# Flag the rows containing the target label, then keep only text + 0/1 label.
df_true = clean_dataset(dummy_label(df_true))

label_dummy
0    355
1     94
Name: count, dtype: int64
                                                text  label
0  evolution des salaires de base : enveloppe bud...      0
1  l’enveloppe globale d’augmentation des rémunér...      0
2  dispositions au regard de l’implication de tou...      1
3  nous travaillons sur une politique de rémunéra...      1
4  protocole d’accord négociation annuelle obliga...      0


## Predicted dataset

Now we build the dataset holding the predicted labels.

In [None]:
# NOTE(review): ROOT_PATH is not defined in the visible part of this notebook.
# Fall back to the parent directory, which matches the "../data/..." relative
# path used for the ground-truth file — confirm this is the intended root.
ROOT_PATH = globals().get("ROOT_PATH", "..")

# Path components are passed separately so the separator is right on every OS
# (a backslash-joined literal only resolves on Windows).
pred_path = os.path.join(ROOT_PATH, "data", "predicted_json", "labeled_data_PPV.json")
with open(pred_path, "r", encoding="utf-8") as f:
    data_pred = json.load(f)

In [None]:
def dummy_label_pred(df):
    """
    Add a 0/1 "label_dummy" column telling whether a prediction was made.

    A row gets 0 when its "label" value equals ``{'label': []}`` (no predicted
    span) and 1 for any other value.

    Args:
        df (DataFrame): Frame with a "label" column holding the predicted
            annotations. It is modified in place.

    Returns:
        DataFrame: The same ``df`` object, with the "label_dummy" column added
        (or overwritten).
    """
    no_prediction = {'label': []}

    # Work on the frame passed in (not on a notebook-level global), so the
    # function gives the right answer for whichever frame it is called with.
    df["label_dummy"] = (
        df["label"]
        .map(lambda label_data: label_data != no_prediction)
        .astype("int64")
    )

    # Print the class balance to verify the result
    print(df["label_dummy"].value_counts())
    return df



# Build the prediction frame step by step. Each call receives the frame bound
# to df_pred by the previous line, so the order of these statements matters.
df_pred = spacy_to_dataframe(data_pred)
# Add the 0/1 "label_dummy" column (1 when something was predicted).
df_pred = dummy_label_pred(df_pred)
# Drop incomplete rows and keep only the text and the 0/1 label.
df_pred = clean_dataset(df_pred)
df_pred.head()

We align the two datasets by merging them on their text column.

In [None]:
# Inner join on the text: only texts present in both frames are kept. The two
# "label" columns receive pandas' default suffixes, label_x (left frame) and
# label_y (right frame).
df_whole = df_true.merge(df_pred, on='text', how="inner")

## Plotting the confusion matrix

In [None]:
# label_x comes from df_true (left side of the merge), label_y from df_pred.
true_label_data, pred_label_data = (
    df_whole[column].tolist() for column in ('label_x', 'label_y')
)

In [None]:
# Both lists come from the same merged frame, so the two lengths should match.
print(len(true_label_data), len(pred_label_data), sep="\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so the matrix is always 2x2 (rows = true,
# columns = predicted, in the order [0, 1]), even if one class is absent from
# the data. The tick labels and the later [i, j] indexing rely on that layout.
confusion = confusion_matrix(true_label_data, pred_label_data, labels=[0, 1])

# Draw the matrix as an annotated heatmap
sns.set(font_scale=1.2)  # Adjust font size if needed
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['0', 'PPV'],
    yticklabels=['0', 'PPV'],
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()

# Print the raw counts
print(confusion)


In [None]:
# Sanity check: display the type of the object returned by confusion_matrix.
type(confusion)

We compute and display the key statistics: precision, recall and F1 score.

In [None]:
# Unpack the 2x2 confusion matrix (rows = true class, columns = predicted class):
# True Negatives (TN), False Positives (FP), False Negatives (FN), True Positives (TP)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# Calculate precision, recall and F1. Each ratio falls back to 0.0 when its
# denominator is zero (no predicted positives / no actual positives), instead
# of producing nan together with a division warning.
predicted_positives = TP + FP
actual_positives = TP + FN

precision = TP / predicted_positives if predicted_positives > 0 else 0.0
recall = TP / actual_positives if actual_positives > 0 else 0.0
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if (precision + recall) > 0
    else 0.0
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)