In [None]:
import pandas as pd
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Load the pickled predictions. Later cells read this object as
# {table_name: {(main_col, col): {relation_name: info_dict}}}.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open("path to Artifacts/Experiments and Results/CPA_all.pkl", "rb") as f:
    predictions = pickle.load(f)  

In [None]:
# Number of top-level entries in the predictions object
# (later cells iterate over it as table_name -> annotations).
len(predictions)

In [None]:
# Display the raw predictions object.
# NOTE(review): this renders the whole object; consider inspecting a small sample instead.
predictions

In [None]:
# Ground-truth CPA annotations. Later cells read the columns "table_name",
# "main_column_index", "column_index" and "label".
# Forward slashes keep the path valid on every OS and match the other paths
# in this notebook (the backslash form only resolves on Windows).
ground_truth = pd.read_csv("path to Benchmark/CPA_Test/CPA_test_gt.csv")


In [None]:
# Number of distinct ground-truth labels. The frame is named `ground_truth`
# and its label column is "label" (the names used by every other cell).
len(ground_truth['label'].unique())

In [None]:
# Display the ground-truth frame (pandas truncates long frames in the rich output).
ground_truth

In [None]:
# List the distinct ground-truth labels.
ground_truth['label'].unique()

In [None]:
# Load the equivalence and subPropertyOf relations.
# The loop below reads the columns "cpa_label", "equivalentProperty" and "subPropertyOf".
import pandas as pd

mapping_df = pd.read_excel("path to Artifacts/Experiments and Results/cpa_to_schema_org_mapping.xlsx")  

def normalize(prop):
    """Return the lowercase local name of a property URI or plain label.

    Args:
        prop: Property identifier such as ``"https://schema.org/birthDate"``
            or ``"birthDate"``. Any non-string value (``None``, the NaN pandas
            yields for an empty cell, ...) is treated as missing.

    Returns:
        The text after the last ``/`` (surrounding whitespace and trailing
        slashes removed), lowercased; ``""`` when ``prop`` is not a string.
    """
    if not isinstance(prop, str):
        return ""
    # Drop trailing slashes first so "https://schema.org/name/" resolves to
    # "name" instead of an empty last segment (which callers read as "missing").
    return prop.strip().rstrip("/").split("/")[-1].lower()

# Lookup tables keyed by normalized property name.
equivalents = {}     # property -> set of equivalent properties (stored in both directions)
sub_properties = {}  # property -> set of the properties it is declared a sub-property of

for _, mapping_row in mapping_df.iterrows():
    source_label = normalize(mapping_row["cpa_label"])

    equivalent_label = normalize(mapping_row.get("equivalentProperty", ""))
    if equivalent_label:
        # Equivalence is symmetric, so register the pair both ways.
        for left, right in (
            (source_label, equivalent_label),
            (equivalent_label, source_label),
        ):
            equivalents.setdefault(left, set()).add(right)

    parent_label = normalize(mapping_row.get("subPropertyOf", ""))
    if parent_label:
        sub_properties.setdefault(source_label, set()).add(parent_label)



In [None]:

# Global evaluation: compare every ground-truth column pair with all relations
# predicted for it. Relies on `ground_truth`, `predictions`, `equivalents` and
# `sub_properties` built by the cells above.
detailed_results = []

true_positives = 0
total_gt = 0
total_predicted = 0
top1_hits = 0
top3_hits = 0

for _, row in ground_truth.iterrows():
    table_name = row["table_name"]
    main_col = int(row["main_column_index"])
    col = int(row["column_index"])
    label_gt = row["label"].strip().lower()

    # A missing table or column pair simply yields no predictions.
    table_annotations = predictions.get(table_name, {})
    relation_dict = table_annotations.get((main_col, col), {})

    predicted_relations = list(relation_dict.keys())
    predicted_relations_lower = [r.lower() for r in predicted_relations]

    # Rank relations by decreasing probability (a missing probability counts as 0).
    sorted_rels = sorted(
        relation_dict.items(),
        key=lambda x: x[1].get("probability", 0),
        reverse=True
    )
    sorted_relation_names = [r[0].lower() for r in sorted_rels]

    # Lenient match: a prediction counts when it equals the ground truth, is
    # registered as equivalent to it, or is a declared sub-property of it.
    equiv_set = equivalents.get(label_gt, set())
    match = any(
        pred == label_gt
        or pred in equiv_set
        or label_gt in sub_properties.get(pred, set())
        for pred in predicted_relations_lower
    )

    # Top-k hits are strict: exact label equality on the probability ranking.
    top1 = label_gt == sorted_relation_names[0] if sorted_relation_names else False
    top3 = label_gt in sorted_relation_names[:3]

    detailed_results.append({
        "table_name": table_name,
        "main_col": main_col,
        "col": col,
        "label_gt": label_gt,
        "predicted_relations": predicted_relations,
        "match": match,
        "top1_match": top1,
        "top3_match": top3,
    })

    total_gt += 1
    total_predicted += len(predicted_relations)
    if match:
        true_positives += 1
    if top1:
        top1_hits += 1
    if top3:
        top3_hits += 1

# --- Global scores ---
# Every ratio is guarded so an empty ground truth reports 0 instead of raising.
recall = true_positives / total_gt if total_gt else 0
precision = true_positives / total_predicted if total_predicted else 0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
top1_acc = top1_hits / total_gt if total_gt else 0
top3_acc = top3_hits / total_gt if total_gt else 0

# --- Display ---
print("== Résultats globaux ==")
print("true_positives :", true_positives)
print("total_gt :", total_gt)
print("total_predicted :", total_predicted)
print(f"Recall : {recall:.3f}")
print(f"Precision : {precision:.3f}")
print(f"F1-score : {f1:.3f}")
print(f"Top-1 Accuracy : {top1_acc:.3f}")
print(f"Top-3 Accuracy : {top3_acc:.3f}")

# # --- Sauvegarde optionnelle ---
# df_results = pd.DataFrame(detailed_results)
# df_results.to_csv("evaluation_détaillée_relations.csv", index=False)


In [None]:
# --- Top-1 scores comparable to Doduo ---
# Must run on the `detailed_results` produced by the global evaluation cell
# (one record per ground-truth column pair, carrying a "top1_match" flag).
tp_top1 = 0
fp_top1 = 0
fn_top1 = 0

for res in detailed_results:
    if not res["predicted_relations"]:
        # No prediction at all = false negative
        fn_top1 += 1
        continue

    # `predicted_relations` keeps the prediction dict's own key order, so its
    # first element is not necessarily the most probable relation. The
    # "top1_match" flag was computed on the probability-sorted ranking and is
    # therefore the correct Top-1 signal.
    if res["top1_match"]:
        tp_top1 += 1
    else:
        # A wrong Top-1 answer is both a spurious prediction and a missed label.
        fp_top1 += 1
        fn_top1 += 1

precision_top1 = tp_top1 / (tp_top1 + fp_top1) if (tp_top1 + fp_top1) else 0
recall_top1 = tp_top1 / (tp_top1 + fn_top1) if (tp_top1 + fn_top1) else 0
f1_top1 = 2 * precision_top1 * recall_top1 / (precision_top1 + recall_top1) if (precision_top1 + recall_top1) else 0

print("\n== Résultats comparables à Doduo (Top-1) ==")
print("TP :", tp_top1)
print("FP :", fp_top1)
print("FN :", fn_top1)
print(f"Precision Top-1 : {precision_top1:.3f}")
print(f"Recall Top-1    : {recall_top1:.3f}")
print(f"F1-score Top-1  : {f1_top1:.3f}")


In [None]:
from collections import defaultdict

# Per-source evaluation: the same scoring as the global evaluation, computed
# separately for relations proposed by the LM and by the KB.
# NOTE: this cell overwrites `detailed_results` and the metric variables of the
# global evaluation cell; cells that read the global results must run before it.
detailed_results = []
true_positives = {"LM": 0, "KB": 0}
total_gt = 0
total_predicted = {"LM": 0, "KB": 0}
top1_hits = {"LM": 0, "KB": 0}
top3_hits = {"LM": 0, "KB": 0}


def evaluate_source(source_name, rels):
    """Score one source's predictions for the ground-truth row being processed.

    Defined once, outside the loop. It reads the notebook-level loop variables
    ``label_gt``, ``table_name``, ``main_col`` and ``col`` at call time, so it
    must be called from inside the ground-truth loop below.

    Args:
        source_name: Key into the per-source counters ("LM" or "KB").
        rels: Mapping ``relation name -> info dict`` restricted to that source.

    Side effects:
        Updates ``total_predicted``, ``true_positives``, ``top1_hits`` and
        ``top3_hits`` for ``source_name`` and appends one record to
        ``detailed_results``.
    """
    predicted_relations = list(rels.keys())
    predicted_relations_lower = [r.lower() for r in predicted_relations]

    # Rank relations by decreasing probability (a missing probability counts as 0).
    sorted_rels = sorted(
        rels.items(),
        key=lambda x: x[1].get("probability", 0),
        reverse=True
    )
    sorted_relation_names = [r[0].lower() for r in sorted_rels]

    # Lenient match: exact label, registered equivalent, or a prediction that
    # is a declared sub-property of the ground-truth label.
    equiv_set = equivalents.get(label_gt, set())
    match = any(
        pred == label_gt
        or pred in equiv_set
        or label_gt in sub_properties.get(pred, set())
        for pred in predicted_relations_lower
    )

    # Top-k hits are strict: exact label equality on the probability ranking.
    top1 = label_gt == sorted_relation_names[0] if sorted_relation_names else False
    top3 = label_gt in sorted_relation_names[:3]

    total_predicted[source_name] += len(predicted_relations)
    if match:
        true_positives[source_name] += 1
    if top1:
        top1_hits[source_name] += 1
    if top3:
        top3_hits[source_name] += 1

    detailed_results.append({
        "table_name": table_name,
        "main_col": main_col,
        "col": col,
        "label_gt": label_gt,
        "predicted_relations": predicted_relations,
        "source": source_name,
        "match": match,
        "top1_match": top1,
        "top3_match": top3,
    })


for _, row in ground_truth.iterrows():
    table_name = row["table_name"]
    main_col = int(row["main_column_index"])
    col = int(row["column_index"])
    label_gt = row["label"].strip().lower()

    table_annotations = predictions.get(table_name, {})
    relation_dict = table_annotations.get((main_col, col), {})

    # --- Split relations by source ---
    # LM requires a source entry equal to "LM"; KB accepts any source entry
    # that contains the substring "KB".
    rels_LM = {r: info for r, info in relation_dict.items() if "LM" in info.get("sources", [])}
    rels_KB = {r: info for r, info in relation_dict.items() if any("KB" in s for s in info.get("sources", []))}

    # Evaluate separately for LM and KB
    evaluate_source("LM", rels_LM)
    evaluate_source("KB", rels_KB)

    total_gt += 1

# --- Compute metrics per source ---
# Every ratio is guarded so an empty ground truth reports 0 instead of raising.
for src in ["LM", "KB"]:
    recall = true_positives[src] / total_gt if total_gt else 0
    precision = true_positives[src] / total_predicted[src] if total_predicted[src] else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    top1_acc = top1_hits[src] / total_gt if total_gt else 0
    top3_acc = top3_hits[src] / total_gt if total_gt else 0

    print(f"== Résultats {src} ==")
    print(f"True positives : {true_positives[src]}")
    print(f"Total GT : {total_gt}")
    print(f"Total predicted : {total_predicted[src]}")
    print(f"Recall : {recall:.3f}")
    print(f"Precision : {precision:.3f}")
    print(f"F1-score : {f1:.3f}")
    print(f"Top-1 Accuracy : {top1_acc:.3f}")
    print(f"Top-3 Accuracy : {top3_acc:.3f}")
    print("-" * 40)


In [None]:
from collections import defaultdict

# kb_predictions_per_pair: stores what the KB predicts for each column pair
kb_predictions_per_pair = defaultdict(list)

# Index the ground truth once instead of filtering the whole frame for every
# column pair. drop_duplicates keeps the first row per key, matching the
# previous "first matching row" lookup.
pair_key_columns = ["table_name", "main_column_index", "column_index"]
gt_label_by_pair = (
    ground_truth
    .drop_duplicates(subset=pair_key_columns)
    .set_index(pair_key_columns)["label"]
    .to_dict()
)

for table_name, table_annotations in predictions.items():
    for (main_col, col), relation_dict in table_annotations.items():
        # Keep only relations with at least one source entry containing "KB"
        kb_rels = [r for r, info in relation_dict.items() if any("KB" in s for s in info.get("sources", []))]
        if not kb_rels:  # the KB produced nothing for this pair
            continue

        kb_predictions_per_pair[(table_name, main_col, col)].append({
            # None when the pair has no ground-truth row
            "ground_truth": gt_label_by_pair.get((table_name, main_col, col)),
            "kb_predictions": kb_rels
        })

# --- Display ---
for (table_name, main_col, col), infos in kb_predictions_per_pair.items():
    for info in infos:
        print(f"Table: {table_name}, Main_col: {main_col}, Col: {col}")
        print(f"  Ground truth: {info['ground_truth']}")
        print(f"  KB predictions: {info['kb_predictions']}")
        print("----------------------------------------------------")


In [None]:
from collections import defaultdict, Counter

# Quantify the KB predictions: how many annotated pairs contain the
# ground-truth label, and what the KB predicts when they do not.
total_pairs_kb = 0
correct_pairs = 0
pairs_outside_gt = 0
extra_when_correct = []

kb_extra_per_table = defaultdict(list)
kb_predictions_outside_gt = Counter()

# Index the ground truth once instead of filtering the whole frame for every
# column pair. drop_duplicates keeps the first row per key, matching the
# previous "first matching row" lookup.
pair_key_columns = ["table_name", "main_column_index", "column_index"]
gt_label_by_pair = (
    ground_truth
    .drop_duplicates(subset=pair_key_columns)
    .set_index(pair_key_columns)["label"]
    .to_dict()
)

for table_name, table_annotations in predictions.items():
    for (main_col, col), relation_dict in table_annotations.items():
        # Keep only relations with at least one source entry containing "KB"
        kb_rels = [r for r, info in relation_dict.items() if any("KB" in s for s in info.get("sources", []))]
        if not kb_rels:
            continue
        total_pairs_kb += 1

        # Normalise the label like the evaluation cells (strip + lowercase).
        # A missing or non-string label (e.g. NaN) can never match.
        gt_rel = gt_label_by_pair.get((table_name, main_col, col))
        gt_norm = gt_rel.strip().lower() if isinstance(gt_rel, str) else None
        kb_rels_lower = [r.lower() for r in kb_rels]

        # Check whether the KB predicted the ground-truth label
        if gt_norm and gt_norm in kb_rels_lower:
            correct_pairs += 1
            # Count the additional relations that differ from the ground truth
            extra_when_correct.append(sum(1 for r in kb_rels_lower if r != gt_norm))
        else:
            pairs_outside_gt += 1
            kb_predictions_outside_gt.update(kb_rels)
            kb_extra_per_table[table_name].append((main_col, col, kb_rels))

# --- Display the figures ---
print(f"Total paires annotées par KB : {total_pairs_kb}")
print(f"Paires correctes (match GT) : {correct_pairs}")
print(f"Paires hors ground truth : {pairs_outside_gt}")
# Guarded: with no KB-annotated pair the ratio is reported as 0.
local_precision = correct_pairs / total_pairs_kb * 100 if total_pairs_kb else 0
print(f"Précision locale KB : {local_precision:.2f}%")

# NOTE(review): each entry of kb_extra_per_table is one (main_col, col, kb_rels)
# tuple, so this averages the number of non-matching *pairs* per table.
# Guarded: when every KB pair is correct the mapping is empty.
mean_extra_per_table = (
    sum(len(r) for r in kb_extra_per_table.values()) / len(kb_extra_per_table)
    if kb_extra_per_table else 0
)
print(f"Nombre moyen de relations hors GT par table : {mean_extra_per_table:.2f}")

# Mean number of additional relations per correct pair
mean_extra_correct = sum(extra_when_correct) / len(extra_when_correct) if extra_when_correct else 0
print(f"Moyenne de relations supplémentaires KB par paire correcte : {mean_extra_correct:.2f}")

print("\nTop 20 relations KB hors GT :")
for rel, count in kb_predictions_outside_gt.most_common(20):
    print(f"{rel}: {count}")
