In [1]:
pip install pandas networkx matplotlib



In [2]:
pip install rapidfuzz



In [3]:
# Mapping-candidates generator for manual mapping
# Run `!pip install rapidfuzz` in Colab if you want fuzzy suggestions.

import pandas as pd
import unicodedata
from pathlib import Path

# rapidfuzz is optional: HAVE_RAPIDFUZZ gates the fuzzy-candidate step further
# down, so the cell still produces canonical/substring candidates without it.
try:
    from rapidfuzz import process, fuzz
    HAVE_RAPIDFUZZ = True
except ImportError:
    # Only an unavailable package should disable fuzzy matching; any other
    # failure is a genuine error and is allowed to surface.
    HAVE_RAPIDFUZZ = False

GLOTTO_PATH = "/content/unbounded_language_family_trees.csv"   # change if needed
EXTRACTED_PATH = "/content/language_relationships.csv"        # change if needed
TOP_K = 10            # number of fuzzy candidates to keep
FUZZY_THRESHOLD = 60  # include fuzzy candidates with score >= this (only used if rapidfuzz available)
OUT_CSV = "/content/mapping_candidates.csv"

def norm(s):
    """Return a comparison key for a name: stripped, accent-free and lowercase.

    Missing values (anything ``pd.isna`` reports as NA) become the empty string.
    """
    if pd.isna(s):
        return ""
    # NFKD splits accented characters into base + combining mark, so dropping
    # the combining marks removes the accents.
    decomposed = unicodedata.normalize("NFKD", str(s).strip())
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars).lower()

def canonicalize_text(s):
    """Strip punctuation and generic qualifier words from an already-normalized name.

    The punctuation characters ``( ) , / ; : _ -`` are treated as word
    separators, then every whole word that is a generic qualifier
    ("languages", "dialects", "old", "modern", ...) is dropped and the
    remaining words are re-joined with single spaces.

    Matching is done on whole words only. Removing the qualifiers as raw
    substrings corrupts names: " dialect" fires inside " dialects" and leaves
    a stray "s", and "text" cuts into words such as "context".

    Parameters
    ----------
    s : str or None
        Name to canonicalize. Matching is case-sensitive, so callers should
        pass lowercase text.

    Returns
    -------
    str or None
        The canonical form; falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not s:
        return s
    stop_words = {
        "dialect", "dialects", "languages", "language", "regional", "variety",
        "var", "old", "early", "late", "middle", "modern", "proto",
        "post", "pre", "epic", "stage", "text", "spoken", "historical",
    }
    spaced = s
    for ch in "(),/;:_-":
        spaced = spaced.replace(ch, " ")
    # split() also collapses any run of whitespace produced by the replacements
    kept = [word for word in spaced.split() if word not in stop_words]
    return " ".join(kept)

# Read the Glottolog tree and the extracted relationships
gdf = pd.read_csv(GLOTTO_PATH)
edf = pd.read_csv(EXTRACTED_PATH)

# Trim stray whitespace from the header names of both frames
for frame in (gdf, edf):
    frame.columns = [label.strip() for label in frame.columns]
def find_col(cols, candidates):
    """Return the first label in ``cols`` whose lowercase form is in ``candidates``.

    Labels are checked in the order they appear in ``cols``; ``None`` is
    returned when no label matches.
    """
    return next((label for label in cols if label.lower() in candidates), None)

# Locate the child/parent columns in each input (header spellings differ between files).
CHILD_COL_NAMES = ['child', 'child_name', 'childname']
PARENT_COL_NAMES = ['parent', 'parent_name', 'parentname']
g_child_c = find_col(gdf.columns, CHILD_COL_NAMES)
g_parent_c = find_col(gdf.columns, PARENT_COL_NAMES)
e_child_c = find_col(edf.columns, CHILD_COL_NAMES)
e_parent_c = find_col(edf.columns, PARENT_COL_NAMES)

if not g_child_c or not g_parent_c or not e_child_c or not e_parent_c:
    raise ValueError("Couldn't find child/parent columns in inputs. Glottolog cols: {}, Extracted cols: {}".format(gdf.columns, edf.columns))

# Rows missing either endpoint cannot describe an edge.
gdf = gdf.dropna(subset=[g_child_c, g_parent_c]).copy()
edf = edf.dropna(subset=[e_child_c, e_parent_c]).copy()

# Normalized node names on both sides
gdf['child_n'] = gdf[g_child_c].map(norm)
gdf['parent_n'] = gdf[g_parent_c].map(norm)
edf['child_n'] = edf[e_child_c].map(norm)
edf['parent_n'] = edf[e_parent_c].map(norm)

# Set of unique normalized Glottolog names, plus normalized -> original spellings.
# The already-normalized columns are reused instead of re-running norm() per row.
gl_nodes = set(gdf['child_n']).union(gdf['parent_n'])
orig_map = {}
for norm_col, raw_col in (('child_n', g_child_c), ('parent_n', g_parent_c)):
    for node, raw in zip(gdf[norm_col], gdf[raw_col]):
        orig_map.setdefault(node, set()).add(str(raw))
# Collapse originals to one display string (at most five examples per name)
orig_map_str = {k: " | ".join(sorted(v)[:5]) for k, v in orig_map.items()}

# Extracted node names in order of first appearance (children first, then
# parents), and how often each one occurs across both columns.
extracted_names = pd.concat([edf['child_n'], edf['parent_n']], ignore_index=True)
all_extracted_nodes = list(extracted_names.unique())
node_counts = extracted_names.value_counts()

# Names that already exist verbatim in Glottolog need no candidates
exact_mapped = {n for n in all_extracted_nodes if n in gl_nodes}
unmapped_nodes = [n for n in all_extracted_nodes if n not in exact_mapped and n != ""]

# Longest names first, so the TOP_K cap on substring matches keeps the most specific ones
gl_list = sorted(gl_nodes, key=lambda x: -len(x))

CANDIDATE_COLUMNS = ['extracted_node', 'count', 'candidate', 'candidate_orig_examples',
                     'match_type', 'score']

rows = []
for en in unmapped_nodes:
    # count: how often this name appears in the extracted data (used to prioritize)
    count = int(node_counts[en])
    cand_set = []
    # canonical exact: the name matches once generic qualifiers are stripped
    en_can = canonicalize_text(en)
    if en_can and en_can in gl_nodes:
        cand_set.append((en_can, "canonical_exact", 95, orig_map_str.get(en_can, en_can)))
    # substring matches (Glottolog name contains the extracted name, or vice versa)
    substr_matches = []
    for g in gl_list:
        # An empty Glottolog name is a substring of everything; skip it.
        if g and (en in g or g in en):
            substr_matches.append((g, "substring", 90, orig_map_str.get(g, g)))
            if len(substr_matches) >= TOP_K:
                break
    cand_set.extend(substr_matches)
    # fuzzy matches (only when rapidfuzz could be imported)
    if HAVE_RAPIDFUZZ:
        fuzzy = process.extract(en, gl_list, scorer=fuzz.ratio, limit=TOP_K)
        for cand, score, _ in fuzzy:
            if score >= FUZZY_THRESHOLD:
                cand_set.append((cand, "fuzzy", int(score), orig_map_str.get(cand, cand)))
    # Deduplicate by candidate name, keeping the first (highest-priority) match type
    seen = set()
    cands = []
    for gname, mtype, score, orig in cand_set:
        if gname not in seen:
            cands.append({'extracted_node': en, 'count': count, 'candidate': gname, 'candidate_orig_examples': orig,
                          'match_type': mtype, 'score': score})
            seen.add(gname)
    # A name without any candidate still gets a placeholder row so it shows up in the CSV
    if not cands:
        rows.append({'extracted_node': en, 'count': count, 'candidate': '', 'candidate_orig_examples': '',
                     'match_type': 'none', 'score': 0})
    else:
        rows.extend(cands)

# Explicit columns keep the sort below working when there are no unmapped nodes
# (a frame built from an empty list would otherwise have no 'count'/'score').
candidates_df = pd.DataFrame(rows, columns=CANDIDATE_COLUMNS)
# Sort so the most frequent unmapped nodes appear first
candidates_df = candidates_df.sort_values(by=['count', 'score'], ascending=[False, False])

# save
candidates_df.to_csv(OUT_CSV, index=False)
print("Saved mapping candidates for unmapped extracted nodes to:", OUT_CSV)
print("Unmapped node count:", len(unmapped_nodes))
display(candidates_df.head(200))


Saved mapping candidates for unmapped extracted nodes to: /content/mapping_candidates.csv
Unmapped node count: 228


Unnamed: 0,extracted_node,count,candidate,candidate_orig_examples,match_type,score
1488,new indo-aryan languages,13,indo-aryan,Indo-Aryan,substring,90
1489,new indo-aryan languages,13,indo-aryan kinnauri,Indo-Aryan Kinnauri,fuzzy,60
319,south american spanish,9,spanish,Spanish,substring,90
320,south american spanish,9,southern mexican spanish,Southern Mexican Spanish,fuzzy,86
321,south american spanish,9,latin american spanish,Latin American Spanish,fuzzy,81
...,...,...,...,...,...,...
589,modern irish english,3,southern american english,Southern American English,fuzzy,66
949,rarhi,3,churahi,Churahi,fuzzy,66
1370,early middle japanese,3,old japanese,Old Japanese,fuzzy,66
7,west germanic languages,3,western aragonese,Western Aragonese,fuzzy,65


In [4]:
import pandas as pd

# Load the mapping candidates file
try:
    df = pd.read_csv('mapping_candidates.csv')
    print("✅ Successfully loaded mapping_candidates.csv")
except FileNotFoundError:
    print("❌ Error: mapping_candidates.csv not found.")
    # Re-raise so this cell stops here with a traceback. exit() is meant for
    # interactive shells and would end the kernel session rather than just
    # abort the cell.
    raise

# Apply the selection logic to filter for high-confidence mappings
# 1. All canonical_exact matches
exact_matches = df[df['match_type'] == 'canonical_exact']

# 2. Fuzzy matches with a score >= 90
fuzzy_matches = df[(df['match_type'] == 'fuzzy') & (df['score'] >= 90)]

# 3. Substring matches whose extracted node occurs more than once (higher confidence)
substring_matches = df[(df['match_type'] == 'substring') & (df['count'] > 1)]

# Combine the filtered dataframes. drop_duplicates keeps the first row per
# extracted node, so the concat order doubles as the priority order:
# canonical_exact, then fuzzy, then substring.
final_mappings_df = pd.concat([exact_matches, fuzzy_matches, substring_matches]).drop_duplicates(subset=['extracted_node'])

# Create the Python dictionary from the filtered dataframe
# It maps the 'extracted_node' (from your data) to the 'candidate' (from Glottolog)
mapping_dict = dict(zip(final_mappings_df['extracted_node'], final_mappings_df['candidate']))

# --- Display the generated dictionary code ---
print("\n# --- High-Confidence Mapping Dictionary ---")
print("# This dictionary maps your extracted names to their suggested Glottolog counterparts.")
print("# It has been filtered to include only high-quality matches.")
print("mapping_dict = {")
for key, value in mapping_dict.items():
    # repr() quotes and escapes each name, so names containing apostrophes
    # still print as valid Python literals.
    print(f"    {key!r}: {value!r},")
print("}")

print(f"\n✅ Generated a dictionary with {len(mapping_dict)} high-confidence mappings.")

✅ Successfully loaded mapping_candidates.csv
\n# --- High-Confidence Mapping Dictionary ---
# This dictionary maps your extracted names to their suggested Glottolog counterparts.
# It has been filtered to include only high-quality matches.
mapping_dict = {
    'modern russian': 'russian',
    'germanic languages': 'germanic',
    'early middle english': 'english',
    'old east slavic': 'east slavic',
    'west germanic languages': 'west germanic',
    'romance languages': 'romance',
    'western romance languages': 'western romance',
    'modern irish english': 'irish english',
    'early middle japanese': 'japanese',
    'italic languages': 'italic',
    'castilian languages': 'castilian',
    'anglic languages': 'anglic',
    'old english': 'english',
    'northern middle english': 'northern english',
    'early modern english': 'english',
    'old french': 'french',
    'slavic languages': 'slavic',
    'east slavic languages': 'east slavic',
    'middle russian': 'russian',
    'l

In [5]:
# Row-by-row validator (clean notebook cell)
# Paste into Colab / Jupyter and run. No fuzzy matching used.
import pandas as pd
import networkx as nx
import unicodedata
from pathlib import Path

# ---------- CONFIG ----------
GLOTTO_PATH = "/content/unbounded_language_family_trees.csv"   # Glottolog CSV
EXTRACTED_PATH = "/content/language_relationships.csv"        # Your extracted relationships CSV
MANUAL_MAP_CSV = "/content/final_manual_map.csv"              # optional CSV with columns: extracted_node,candidate
OUT_ROWS_CSV = "/content/validation_row_by_row.csv"
OUT_SUMMARY_CSV = "/content/validation_summary_row_by_row.csv"
# ----------------------------

def norm(s):
    """Normalize a name for matching: trim, remove accents, lowercase.

    Values that ``pd.isna`` reports as missing are mapped to ``""``.
    """
    if pd.isna(s):
        return ""
    text = unicodedata.normalize("NFKD", str(s).strip())
    # After NFKD, accents are separate combining characters; keep everything else.
    without_marks = "".join(filter(lambda ch: not unicodedata.combining(ch), text))
    return without_marks.lower()

def normalize_cols(df):
    """Return a copy of ``df`` whose headers use the canonical column names.

    Headers are stripped, then renamed: the child/parent spellings become
    ``'child'`` / ``'parent'``, any header containing "relationship" becomes
    ``'relationship'``, ``'language'`` stays ``'language'``, and every other
    header is lowercased. The input frame is left untouched.
    """
    def canonical(label):
        key = label.lower().strip()
        if key in ('child_name', 'childname', 'child'):
            return 'child'
        if key in ('parent_name', 'parentname', 'parent'):
            return 'parent'
        if 'relationship' in key:
            return 'relationship'
        if key == 'language':
            return 'language'
        return key

    renamed = df.copy()
    renamed.columns = [label.strip() for label in renamed.columns]
    return renamed.rename(columns={label: canonical(label) for label in renamed.columns})

# Read both CSVs and bring their headers to the canonical child/parent naming
gdf = normalize_cols(pd.read_csv(GLOTTO_PATH))
edf = normalize_cols(pd.read_csv(EXTRACTED_PATH))

# Both inputs must expose 'child' and 'parent' after renaming
if not {'child', 'parent'}.issubset(gdf.columns):
    raise ValueError("Glottolog CSV missing child/parent columns. Found: " + str(list(gdf.columns)))
if not {'child', 'parent'}.issubset(edf.columns):
    raise ValueError("Extracted CSV missing child/parent columns. Found: " + str(list(edf.columns)))

# Glottolog as a directed graph whose edges point from child to parent,
# so "is X an ancestor of Y" becomes "is there a path from Y to X".
gdf = gdf.dropna(subset=['child', 'parent']).copy()
gdf['child_n'] = gdf['child'].map(norm)
gdf['parent_n'] = gdf['parent'].map(norm)
G_gl = nx.DiGraph()
G_gl.add_edges_from(zip(gdf['child_n'], gdf['parent_n']))
glotto_nodes = set(G_gl.nodes)

# Manual mapping (normalized extracted name -> normalized Glottolog name), taken from:
# 1) a mapping_dict variable already defined in the notebook, else
# 2) the CSV at MANUAL_MAP_CSV (columns: extracted_node,candidate), else
# 3) nothing (empty dict).
manual_map = {}
if 'mapping_dict' in globals() and isinstance(mapping_dict, dict):
    # Keys and values are raw strings, so both sides are normalized here
    manual_map.update(
        (norm(raw_key), norm(raw_val))
        for raw_key, raw_val in mapping_dict.items()
        if not (pd.isna(raw_key) or pd.isna(raw_val))
    )
    print("Using mapping_dict from notebook ({} entries)".format(len(manual_map)))
else:
    try:
        mm_df = pd.read_csv(MANUAL_MAP_CSV)
    except FileNotFoundError:
        print("No mapping_dict in notebook and no manual_map CSV found. Continuing with empty manual_map.")
    else:
        if {'extracted_node', 'candidate'}.issubset(mm_df.columns):
            for _, map_row in mm_df.iterrows():
                source_name, target_name = map_row['extracted_node'], map_row['candidate']
                if pd.isna(source_name) or pd.isna(target_name):
                    continue
                manual_map[norm(source_name)] = norm(target_name)
            print("Loaded manual_map from", MANUAL_MAP_CSV, "({} entries)".format(len(manual_map)))
        else:
            print("MANUAL_MAP_CSV found but doesn't have columns 'extracted_node' and 'candidate'. Ignoring.")

# Extracted rows: keep the original spelling next to the normalized key
edf = edf.dropna(subset=['child', 'parent']).copy()
for side in ('child', 'parent'):
    edf[side + '_orig'] = edf[side].astype(str)
for side in ('child', 'parent'):
    edf[side + '_n'] = edf[side + '_orig'].map(norm)

def map_name(name_n):
    """Resolve a normalized extracted name to a normalized Glottolog node name.

    Lookup order: exact membership in ``glotto_nodes``, then ``manual_map``.

    Returns a ``(mapped_name, map_type)`` tuple where ``map_type`` is one of:
      - ``"exact"``: the name itself is a Glottolog node
      - ``"manual"``: the manual mapping points to a Glottolog node
      - ``"manual_not_in_glottolog"``: the manual mapping exists but its
        target is not a Glottolog node (the target is still returned)
      - ``"none"``: empty input or no mapping; ``mapped_name`` is ``""``
    """
    unresolved = ("", "none")
    if not name_n:
        return unresolved
    if name_n in glotto_nodes:
        return (name_n, "exact")
    if name_n not in manual_map:
        return unresolved
    target = manual_map[name_n]
    map_type = "manual" if target in glotto_nodes else "manual_not_in_glottolog"
    return (target, map_type)

# Validate row-by-row
RESULT_COLUMNS = [
    'child_orig', 'parent_orig', 'child_norm', 'parent_norm',
    'child_mapped', 'child_map_type', 'parent_mapped', 'parent_map_type',
    'found_child', 'found_parent', 'correct', 'path', 'note',
]

rows = []
for r in edf.itertuples(index=False):
    c_orig = r.child_orig
    p_orig = r.parent_orig
    c_n = r.child_n
    p_n = r.parent_n

    c_mapped, c_map_type = map_name(c_n)
    p_mapped, p_map_type = map_name(p_n)

    # A mapped name only counts as found if it is an actual Glottolog node
    # (map_name may return a manual target that is not in Glottolog).
    found_child = (c_mapped != "" and c_mapped in glotto_nodes)
    found_parent = (p_mapped != "" and p_mapped in glotto_nodes)
    note_items = []

    if not found_child:
        note_items.append("child_not_found")
    if not found_parent:
        note_items.append("parent_not_found")

    correct = False
    path = None
    if found_child and found_parent:
        if c_mapped == p_mapped:
            # Both names collapsed to the same Glottolog node. A node is
            # trivially reachable from itself, but that zero-length path says
            # nothing about ancestry, so the row is not counted as correct.
            note_items.append("same_node_after_mapping")
        else:
            # Walk upward (child -> parent edges); one traversal both tests
            # reachability and yields the path.
            try:
                path = nx.shortest_path(G_gl, c_mapped, p_mapped)
            except nx.NetworkXNoPath:
                note_items.append("mapped_but_not_ancestor")
            else:
                correct = True
                note_items.append("correct")

    rows.append({
        'child_orig': c_orig,
        'parent_orig': p_orig,
        'child_norm': c_n,
        'parent_norm': p_n,
        'child_mapped': c_mapped,
        'child_map_type': c_map_type,
        'parent_mapped': p_mapped,
        'parent_map_type': p_map_type,
        'found_child': found_child,
        'found_parent': found_parent,
        'correct': correct,
        'path': " > ".join(path) if path else "",
        'note': "; ".join(note_items)
    })

# Explicit columns and boolean dtypes keep the summary below working even
# when there are no extracted rows.
res_df = pd.DataFrame(rows, columns=RESULT_COLUMNS).astype(
    {'found_child': bool, 'found_parent': bool, 'correct': bool}
)
res_df.to_csv(OUT_ROWS_CSV, index=False)

# Summary
total = len(res_df)
both_found = res_df['found_child'] & res_df['found_parent']
found_both = int(both_found.sum())
correct_count = int(res_df['correct'].sum())
not_found_count = int((~both_found).sum())
summary = {
    'total_extracted_rows': total,
    'rows_with_both_endpoints_found_in_glottolog': found_both,
    'correct_relationships (ancestor found upward)': correct_count,
    'rows_with_missing_endpoint': not_found_count,
}
pd.DataFrame([summary]).to_csv(OUT_SUMMARY_CSV, index=False)

print("Saved per-row diagnostics to:", OUT_ROWS_CSV)
print("Saved summary to:", OUT_SUMMARY_CSV)
print()
print("SUMMARY:")
for k,v in summary.items():
    print(f"{k}: {v}")

# show first 30 problematic rows for quick inspection
print("\nSample problematic rows (first 30):")
display(res_df[~res_df['correct']].head(30))


Using mapping_dict from notebook (77 entries)
Saved per-row diagnostics to: /content/validation_row_by_row.csv
Saved summary to: /content/validation_summary_row_by_row.csv

SUMMARY:
total_extracted_rows: 295
rows_with_both_endpoints_found_in_glottolog: 132
correct_relationships (ancestor found upward): 89
rows_with_missing_endpoint: 163

Sample problematic rows (first 30):


Unnamed: 0,child_orig,parent_orig,child_norm,parent_norm,child_mapped,child_map_type,parent_mapped,parent_map_type,found_child,found_parent,correct,path,note
2,Weser–Rhine Germanic,West Germanic languages,weser–rhine germanic,west germanic languages,germanic,manual,west germanic,manual,True,True,False,,mapped_but_not_ancestor
3,Low Franconian languages,Weser–Rhine Germanic,low franconian languages,weser–rhine germanic,,none,germanic,manual,False,True,False,,child_not_found
4,Old Franconian,Low Franconian languages,old franconian,low franconian languages,,none,,none,False,False,False,,child_not_found; parent_not_found
5,Old Dutch,Old Franconian,old dutch,old franconian,old dutch,exact,,none,True,False,False,,parent_not_found
6,Limburgish,Old Franconian,limburgish,old franconian,,none,,none,False,False,False,,child_not_found; parent_not_found
7,Middle Dutch,Old Dutch,middle dutch,old dutch,middle dutch,exact,old dutch,exact,True,True,False,,mapped_but_not_ancestor
8,Brabantian,Middle Dutch,brabantian,middle dutch,,none,middle dutch,exact,False,True,False,,child_not_found
9,East Flemish,Middle Dutch,east flemish,middle dutch,,none,middle dutch,exact,False,True,False,,child_not_found
10,Hollandic,Middle Dutch,hollandic,middle dutch,,none,middle dutch,exact,False,True,False,,child_not_found
11,Dutch,Middle Dutch,dutch,middle dutch,dutch,exact,middle dutch,exact,True,True,False,,mapped_but_not_ancestor


In [7]:
# Clean row-by-row validator with scoring (paste into Colab/Jupyter)
# - Expects Glottolog CSV and extracted CSV to be present (edit paths below).
# - Uses mapping_dict if present in the notebook, else tries MANUAL_MAP_CSV.
# - No fuzzy matching: manual map is authoritative.

import pandas as pd
import networkx as nx
import unicodedata
from pathlib import Path
from collections import defaultdict

# ---------- USER CONFIG ----------
GLOTTO_PATH = "/content/unbounded_language_family_trees.csv"   # Glottolog CSV (child, parent columns)
EXTRACTED_PATH = "/content/language_relationships.csv"        # your extracted relationships CSV
MANUAL_MAP_CSV = "/content/final_manual_map.csv"              # optional CSV (extracted_node,candidate)
OUT_ROWS_CSV = "/content/validation_row_by_row.csv"
OUT_SUMMARY_CSV = "/content/validation_summary_row_by_row.csv"
# Languages to scope ground-truth components (use same list you tested earlier)
SCOPE_LANGUAGES = ["Dutch","Spanish","English","French","Bengali","Portuguese","Russian","Japanese","Sanskrit","Marathi"]
# -------------------------------

def norm(s):
    """Build the matching key for a name: stripped, accents removed, lowercase.

    Returns ``""`` for values that ``pd.isna`` treats as missing.
    """
    if pd.isna(s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s).strip())
    # NFKD leaves accents as standalone combining characters, which are skipped here.
    kept = []
    for ch in decomposed:
        if not unicodedata.combining(ch):
            kept.append(ch)
    return "".join(kept).lower()

def normalize_cols(df):
    """Return a copy of ``df`` with stripped headers renamed to canonical names.

    Child/parent header spellings map to ``'child'`` / ``'parent'``, a header
    containing "relationship" maps to ``'relationship'``, and all remaining
    headers (including ``'language'``) are lowercased. ``df`` itself is not
    modified.
    """
    exact_names = {
        'child_name': 'child', 'childname': 'child', 'child': 'child',
        'parent_name': 'parent', 'parentname': 'parent', 'parent': 'parent',
    }
    out = df.copy()
    out.columns = [header.strip() for header in out.columns]
    renames = {}
    for header in out.columns:
        lowered = header.lower().strip()
        if lowered in exact_names:
            renames[header] = exact_names[lowered]
        elif 'relationship' in lowered:
            renames[header] = 'relationship'
        else:
            # also covers 'language', whose canonical name is its lowercase form
            renames[header] = lowered
    return out.rename(columns=renames)

# Read both CSVs and rename their headers to the canonical child/parent naming
gdf = normalize_cols(pd.read_csv(GLOTTO_PATH))
edf = normalize_cols(pd.read_csv(EXTRACTED_PATH))

# Sanity checks: both inputs need 'child' and 'parent'
if not {'child', 'parent'}.issubset(gdf.columns):
    raise ValueError("Glottolog CSV missing child/parent columns. Found: " + str(list(gdf.columns)))
if not {'child', 'parent'}.issubset(edf.columns):
    raise ValueError("Extracted CSV missing child/parent columns. Found: " + str(list(edf.columns)))

# Glottolog as a directed graph with edges pointing from child to parent
gdf = gdf.dropna(subset=['child', 'parent']).copy()
gdf['child_n'] = gdf['child'].map(norm)
gdf['parent_n'] = gdf['parent'].map(norm)
G_gl = nx.DiGraph()
G_gl.add_edges_from(zip(gdf['child_n'], gdf['parent_n']))
glotto_nodes = set(G_gl.nodes)

# Manual map: prefer a mapping_dict defined in the notebook, else the optional CSV
manual_map = {}
if 'mapping_dict' in globals() and isinstance(mapping_dict, dict):
    manual_map.update(
        (norm(raw_key), norm(raw_val))
        for raw_key, raw_val in mapping_dict.items()
        if not (pd.isna(raw_key) or pd.isna(raw_val))
    )
    print(f"Using mapping_dict from notebook ({len(manual_map)} entries).")
else:
    try:
        mm = pd.read_csv(MANUAL_MAP_CSV)
    except FileNotFoundError:
        print("No mapping_dict variable and no manual_map CSV found. Using empty manual_map.")
    else:
        if {'extracted_node', 'candidate'}.issubset(mm.columns):
            for _, map_row in mm.iterrows():
                source_key = norm(map_row['extracted_node'])
                target_key = norm(map_row['candidate'])
                # rows without an extracted name are skipped
                if source_key:
                    manual_map[source_key] = target_key
            print(f"Loaded manual_map from {MANUAL_MAP_CSV} ({len(manual_map)} entries).")
        else:
            print("Manual map CSV present but missing expected columns 'extracted_node' and 'candidate'. Using empty manual_map.")

# Extracted rows: keep the original spelling alongside the normalized key
edf = edf.dropna(subset=['child', 'parent']).copy()
for side in ('child', 'parent'):
    edf[side + '_orig'] = edf[side].astype(str)
for side in ('child', 'parent'):
    edf[side + '_n'] = edf[side + '_orig'].map(norm)

def map_name(name_n):
    """Map a normalized name to a normalized Glottolog node.

    An exact hit in ``glotto_nodes`` wins; otherwise ``manual_map`` is
    consulted. Returns ``(mapped_name, map_type)`` with ``map_type`` in
    ``{"exact", "manual", "manual_not_in_glottolog", "none"}``;
    ``mapped_name`` is ``""`` when nothing matched.
    """
    if name_n and name_n in glotto_nodes:
        return name_n, "exact"
    if name_n and name_n in manual_map:
        target = manual_map[name_n]
        # The manual target is returned even when Glottolog does not know it,
        # with the map_type recording that fact.
        map_type = "manual" if target in glotto_nodes else "manual_not_in_glottolog"
        return target, map_type
    return "", "none"

# Row-by-row validation
RESULT_COLUMNS = [
    'child_orig', 'parent_orig', 'child_norm', 'parent_norm',
    'child_mapped', 'child_map_type', 'parent_mapped', 'parent_map_type',
    'found_child', 'found_parent', 'correct', 'path_len', 'path', 'note',
]

rows = []
for r in edf.itertuples(index=False):
    child_orig = r.child_orig
    parent_orig = r.parent_orig
    child_n = r.child_n
    parent_n = r.parent_n

    child_m, child_map_type = map_name(child_n)
    parent_m, parent_map_type = map_name(parent_n)

    # A mapped name only counts as found if it is an actual Glottolog node
    found_child = (child_m != "" and child_m in glotto_nodes)
    found_parent = (parent_m != "" and parent_m in glotto_nodes)
    note_parts = []

    if not found_child:
        note_parts.append("child_not_found")
    if not found_parent:
        note_parts.append("parent_not_found")

    correct = False
    path = []
    if found_child and found_parent:
        if child_m == parent_m:
            # Both names collapsed to the same Glottolog node. A node is
            # trivially reachable from itself, but that zero-length path is
            # not evidence of ancestry, so the row is not counted as correct.
            note_parts.append("same_node_after_mapping")
        else:
            # Upward reachability child -> ... -> parent; one traversal both
            # tests reachability and yields the path.
            try:
                path = nx.shortest_path(G_gl, child_m, parent_m)
            except nx.NetworkXNoPath:
                note_parts.append("mapped_but_not_ancestor")
            else:
                correct = True
                note_parts.append("correct")

    rows.append({
        'child_orig': child_orig,
        'parent_orig': parent_orig,
        'child_norm': child_n,
        'parent_norm': parent_n,
        'child_mapped': child_m,
        'child_map_type': child_map_type,
        'parent_mapped': parent_m,
        'parent_map_type': parent_map_type,
        'found_child': found_child,
        'found_parent': found_parent,
        'correct': bool(correct),
        'path_len': len(path)-1 if path else None,
        'path': " > ".join(path) if path else "",
        'note': "; ".join(note_parts)
    })

# Explicit columns and boolean dtypes keep downstream boolean masks valid
# even when there are no extracted rows.
res_df = pd.DataFrame(rows, columns=RESULT_COLUMNS).astype(
    {'found_child': bool, 'found_parent': bool, 'correct': bool}
)
res_df.to_csv(OUT_ROWS_CSV, index=False)
print("Per-row diagnostics written to:", OUT_ROWS_CSV)

# ---------- Metrics ----------
# Row-level counts: every extracted row counts once, duplicates included.
total_extracted = len(res_df)
both_found = res_df['found_child'] & res_df['found_parent']
tp = int(res_df['correct'].sum())
# FP = rows where both endpoints were found but the row was not validated as correct
fp = int((both_found & ~res_df['correct']).sum())
not_found = int((~both_found).sum())

# Ground-truth scope: union of the connected components (in the undirected
# view of the Glottolog graph) that contain each of SCOPE_LANGUAGES.
G_und = G_gl.to_undirected()
gt_edge_set = set(zip(gdf['child_n'], gdf['parent_n']))
scope_components = {}
for lang in SCOPE_LANGUAGES:
    ln = norm(lang)
    if ln in G_und:
        # Direct lookup of the node's component instead of enumerating every
        # component of the graph for each language.
        scope_components[lang] = nx.node_connected_component(G_und, ln)
comp_union = set().union(*scope_components.values())
# ground-truth direct edges inside the union
gt_edges_in_scope = {(c, p) for c, p in gt_edge_set if c in comp_union and p in comp_union}
gt_count = len(gt_edges_in_scope)

# Micro metrics.
# NOTE(review): tp counts rows validated by a path of any length, while
# gt_count counts direct Glottolog edges, so recall compares two different
# kinds of relation — confirm this is the intended definition.
micro_precision = tp / (tp + fp) if (tp + fp) > 0 else None
micro_recall = tp / gt_count if gt_count > 0 else None
micro_f1 = (2*micro_precision*micro_recall/(micro_precision+micro_recall)) if (micro_precision and micro_recall and (micro_precision+micro_recall)>0) else None

# Unique mapped (child, parent) pairs and whether each one was validated.
# The per-language numbers below are all computed over these unique pairs, so
# 'extracted' and 'tp' share one basis and precision cannot exceed 1.
pair_correct = {}
for child_m, parent_m, ok in zip(res_df['child_mapped'], res_df['parent_mapped'], res_df['correct']):
    pair = (child_m, parent_m)
    pair_correct[pair] = pair_correct.get(pair, False) or bool(ok)

# Per-language breakdown (precision/recall/F1).
# Languages that sit in the same connected component get identical rows, so
# the macro averages below are weighted by how many scope languages share a
# component.
PER_LANG_COLUMNS = ['language', 'gt_edges', 'extracted', 'tp', 'precision', 'recall', 'f1']
per_lang_rows = []
for lang in SCOPE_LANGUAGES:
    comp = scope_components.get(lang)
    if comp is None:
        # language not present in the Glottolog graph
        per_lang_rows.append({'language': lang, 'gt_edges': 0, 'extracted': 0, 'tp': 0, 'precision': None, 'recall': None, 'f1': None})
        continue
    gt_count_here = sum(1 for c, p in gt_edge_set if c in comp and p in comp)
    # extracted pairs whose two mapped endpoints both lie in this component
    pairs_here = [pair for pair in pair_correct if pair[0] in comp and pair[1] in comp]
    ext_count = len(pairs_here)
    tp_here = sum(1 for pair in pairs_here if pair_correct[pair])
    precision_here = tp_here / ext_count if ext_count > 0 else None
    recall_here = tp_here / gt_count_here if gt_count_here > 0 else None
    f1_here = (2*precision_here*recall_here/(precision_here+recall_here)) if (precision_here and recall_here and (precision_here+recall_here)>0) else None
    per_lang_rows.append({'language': lang, 'gt_edges': gt_count_here, 'extracted': ext_count, 'tp': tp_here, 'precision': precision_here, 'recall': recall_here, 'f1': f1_here})

# Explicit columns keep the macro averages working when SCOPE_LANGUAGES is empty
per_lang_df = pd.DataFrame(per_lang_rows, columns=PER_LANG_COLUMNS)

# Macro averages (ignore None)
macro_precision = per_lang_df['precision'].dropna().mean() if per_lang_df['precision'].dropna().shape[0]>0 else None
macro_recall = per_lang_df['recall'].dropna().mean() if per_lang_df['recall'].dropna().shape[0]>0 else None
macro_f1 = per_lang_df['f1'].dropna().mean() if per_lang_df['f1'].dropna().shape[0]>0 else None

summary = {
    'total_extracted_rows': total_extracted,
    'tp': tp,
    'fp': fp,
    'not_found_rows': not_found,
    'gt_edges_in_scope': gt_count,
    'micro_precision': micro_precision,
    'micro_recall': micro_recall,
    'micro_f1': micro_f1,
    'macro_precision': macro_precision,
    'macro_recall': macro_recall,
    'macro_f1': macro_f1
}

pd.DataFrame([summary]).to_csv(OUT_SUMMARY_CSV, index=False)
print("Saved summary to:", OUT_SUMMARY_CSV)
print("\nSUMMARY:")
for k,v in summary.items():
    print(f"{k}: {v}")

print("\nPer-language breakdown (first rows):")
display(per_lang_df)



Using mapping_dict from notebook (77 entries).
Per-row diagnostics written to: /content/validation_row_by_row.csv
Saved summary to: /content/validation_summary_row_by_row.csv

SUMMARY:
total_extracted_rows: 295
tp: 89
fp: 43
not_found_rows: 163
gt_edges_in_scope: 3357
micro_precision: 0.6742424242424242
micro_recall: 0.026511766458147155
micro_f1: 0.05101748351963313
macro_precision: 0.8009615384615385
macro_recall: 0.02742612156320799
macro_f1: 0.053012750598554294

Per-language breakdown (first rows):


Unnamed: 0,language,gt_edges,extracted,tp,precision,recall,f1
0,Dutch,3179,104,81,0.778846,0.02548,0.049345
1,Spanish,3179,104,81,0.778846,0.02548,0.049345
2,English,3179,104,81,0.778846,0.02548,0.049345
3,French,3179,104,81,0.778846,0.02548,0.049345
4,Bengali,3179,104,81,0.778846,0.02548,0.049345
5,Portuguese,3179,104,81,0.778846,0.02548,0.049345
6,Russian,3179,104,81,0.778846,0.02548,0.049345
7,Japanese,178,8,8,1.0,0.044944,0.086022
8,Sanskrit,3179,104,81,0.778846,0.02548,0.049345
9,Marathi,3179,104,81,0.778846,0.02548,0.049345
