In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import json
from collections import Counter
import ast

import numpy as np
from numpy.typing import NDArray
from typing import List, Tuple

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
def compute_tp_fp(true_labels, predicted_labels):
    """Compute per-label hit and wrong-prediction percentages for one run.

    Parameters
    ----------
    true_labels : str, bytes, or iterable
        Ground-truth labels. Either the text form of a Python literal
        (e.g. ``"[1, 2, 2]"``, as stored in a CSV cell), which is parsed with
        ``ast.literal_eval``, or an already-parsed iterable of labels.
    predicted_labels : str, bytes, or iterable
        Predicted labels, position-aligned with ``true_labels``; same
        accepted forms.

    Returns
    -------
    list of tuple
        One ``(label, tp_rate, fp_rate)`` per distinct label found in
        ``true_labels``, in order of first appearance.

        * ``tp_rate``: percentage of the positions whose true label is
          ``label`` that were predicted as ``label``.
        * ``fp_rate``: percentage of the predictions equal to ``label`` whose
          true label was something else; ``0`` when ``label`` was never
          predicted.

    Notes
    -----
    * Labels that occur only in ``predicted_labels`` are not reported.
    * Pairs are formed with ``zip``, so if the two sequences differ in length
      the surplus tail of the longer one is ignored when matching (it is still
      included in the per-label totals used as denominators).
    """
    # Text cells are parsed; anything else is taken as already-parsed labels
    # and materialised because it is traversed more than once below.
    if isinstance(true_labels, (str, bytes)):
        true_labels = ast.literal_eval(true_labels)
    else:
        true_labels = list(true_labels)
    if isinstance(predicted_labels, (str, bytes)):
        predicted_labels = ast.literal_eval(predicted_labels)
    else:
        predicted_labels = list(predicted_labels)

    true_counts = Counter(true_labels)
    pred_counts = Counter(predicted_labels)

    # Single pass: a match is credited to the true label, a mismatch is
    # charged to the (wrongly) predicted label.
    tp_counts = Counter()
    fp_counts = Counter()
    for t, p in zip(true_labels, predicted_labels):
        if t == p:
            tp_counts[t] += 1
        else:
            fp_counts[p] += 1

    results = []
    for label in true_counts:
        tp_rate = (tp_counts[label] / true_counts[label]) * 100 if true_counts[label] else 0
        # Counter returns 0 for missing keys, so a never-predicted label
        # falls through to the 0 branch instead of dividing by zero.
        fp_rate = (fp_counts[label] / pred_counts[label]) * 100 if pred_counts[label] else 0
        results.append((label, tp_rate, fp_rate))

    return results

In [3]:

# Load the label table that describes each credential.
name = "../library_test/credentials/identity_attributes_driving_license.csv"
df_labels = pd.read_csv(name)

# The "JSON information" column holds JSON text in the CSV; decode each cell.
df_labels["JSON information"] = df_labels["JSON information"].apply(json.loads)

# Total attribute count per row:
#   constant 12 + general additional infos + 3 per category
#   + the sum of the comma-separated per-category additional-info counts.
# NOTE(review): presumably 12 is the number of fixed base attributes and 3 the
# number of fixed attributes per category — TODO confirm against the generator.
df_labels["number_attributes"] = (
    12
    + df_labels["Number of general additional informations"]
    + 3 * df_labels["Number of categories"]
    + df_labels["Number of additional inforations of each categories"].apply(
        lambda counts: sum(int(part) for part in counts.split(','))
    )
)

In [6]:
# Collect per-label TP/FP percentages and per-model scores from the run CSVs.
tp_fp_results = []
score_results = []
name_folder = "model trial 4 100 redo"
# NOTE(review): `runs` is not read anywhere in this cell; the loop below uses
# range(1) and therefore only loads model_run1_simplified.csv — confirm whether
# iterating over `runs` was intended.
runs = [2,3,4,5,12,13,14,15,16,17,18]

for i in range(1):
    name_csv = f"model_run{i+1}_simplified.csv"
    df = pd.read_csv(f"{name_folder}/{name_csv}", index_col=0)

    for j, row in df.iterrows():
        # One record per label in this row's test set, tagged with the row's
        # model type and regex.
        row_results = compute_tp_fp(row["test_labels"], row["predictions"])
        tp_fp_results.extend(
            {
                "row": j,
                "type_model": row["type_model"],
                "regex": row["regex"],
                "Personal Number": label,
                "TP%": tp_rate,
                "FP%": fp_rate,
            }
            for label, tp_rate, fp_rate in row_results
        )

        # One record per row carrying the model's three score columns.
        score_results.append(
            {key: row[key] for key in ("type_model", "regex", "score", "score2", "score10")}
        )

# Build the result frames; the left merge attaches `number_attributes` from
# the label table to each TP/FP record via "Personal Number".
df_score_results = pd.DataFrame(score_results)
df_tp_fp_results = pd.DataFrame(tp_fp_results).merge(
    df_labels[["Personal Number", "number_attributes"]],
    on="Personal Number",
    how="left",
)

# Mean and variance of TP% / FP%, shared by both TP/FP groupings below.
# pandas' "var" is the sample variance, so a group with a single observation
# yields NaN.
tp_fp_agg_spec = {
    "TP_Mean": ("TP%", "mean"),
    "TP_Var": ("TP%", "var"),
    "FP_Mean": ("FP%", "mean"),
    "FP_Var": ("FP%", "var"),
}

# Grouped by type_model, regex and Personal Number.
df_tp_fp_results_groupby = (
    df_tp_fp_results
    .groupby(["type_model", "regex", "Personal Number"])
    .agg(**tp_fp_agg_spec)
    .reset_index()
)

# Grouped by type_model, regex and number_attributes.
df_tp_fp_results_groupby_number_attribute_only = (
    df_tp_fp_results
    .groupby(["type_model", "regex", "number_attributes"])
    .agg(**tp_fp_agg_spec)
    .reset_index()
)

# Scores are grouped by type_model and regex only, then summarised by mean
# and variance.
df_score_results_groupby = (
    df_score_results
    .groupby(["type_model", "regex"])
    .agg(
        Score_Mean=("score", "mean"),
        Score_Var=("score", "var"),
        Score2_Mean=("score2", "mean"),
        Score2_Var=("score2", "var"),
        Score10_Mean=("score10", "mean"),
        Score10_Var=("score10", "var"),
    )
    .reset_index()
)

# Persist the three summaries next to the run CSVs (default to_csv settings,
# so the positional index is written as the first column).
df_tp_fp_results_groupby_number_attribute_only.to_csv(f"{name_folder}/df_tp_fp_results_groupby_number_attribute_only.csv")
df_tp_fp_results_groupby.to_csv(f"{name_folder}/df_tp_fp_results_groupby.csv")
df_score_results_groupby.to_csv(f"{name_folder}/df_score_results_groupby.csv")