In [10]:
import pandas as pd
from nameparser import HumanName
from unidecode import unidecode
from rapidfuzz import fuzz, process

# 1. Read the combined input CSV and preview its first rows.
df = pd.read_csv(
    "/share/yin/schmidt/intermediate/20251017_combined_grid_and_metascience_fixed.csv"
)
df.head()

Unnamed: 0,Name,ORCID_list,Institution,Institution_df2,Year_Start,Year_End,Doctorate Completion Date,Doctorate Granting Institution,Cohort_Group,Cohort_Group_df2,Cohort_Type,Alumni,Google Scholars,LinkedIn,Additional Professional Profiles (from Surveys),LinkedIn (from Surveys)
0,Adam Roy,[],,"""University of California, San Diego""",,,,,,4.0,Cohort,,,,,
1,Adam Fouda,['https://orcid.org/0000-0002-9445-5537'],Chicago,The University of Chicago,2023.0,2026.0,4/5/20,"University of Nottingham,",2.0,2.0,Cohort,,scholar.google.com/citations?view_op=view_cita...,,,
2,Aditya Nandy,['https://orcid.org/0000-0001-7137-5449'],Chicago,The University of Chicago,2023.0,2025.0,,"Massachusetts Institute of Technology,",2.0,2.0,Cohort,checked,scholar.google.com/citations?user=gbTUmC8AAAAJ...,www.linkedin.com/in/adityanandy/,,
3,Aidan Crilly,['https://orcid.org/0000-0002-0429-9332'],Imperial,Imperial College London,2023.0,2025.0,6/1/20,"Imperial College London,",1.0,1.0,Cohort,,scholar.google.co.uk/citations?user=ILkvB-EAAA...,www.linkedin.com/in/aidan-crilly-22572ba0/,,
4,Akshay Ajagekar,['https://orcid.org/0000-0001-9493-6050'],Cornell,,2024.0,2025.0,,,3.0,,,,scholar.google.com/citations?user=JCRfLuIAAAAJ,www.linkedin.com/in/akshayajagekar,,


In [11]:
# === 1. Input table and the location of the match report written in step 6 ===
input_path = "/share/yin/schmidt/intermediate/20251017_combined_grid_and_metascience_fixed.csv"
output_path = "/share/yin/nz268/name_matching/name_matching_results.csv"

# Load the input table from disk.
df = pd.read_csv(filepath_or_buffer=input_path)

# === 2. Normalize and parse names ===
def clean_name(name):
    """Normalize a raw name and split it into lowercase components.

    The value is converted with ``str()``, run through ``unidecode``,
    lowercased, stripped, and parsed with ``HumanName``.

    Parameters
    ----------
    name : Any
        Raw name value. Missing values (anything ``pd.isna`` flags) are accepted.

    Returns
    -------
    dict
        Keys ``"first"``, ``"middle"``, ``"last"`` and ``"full"`` (first and
        last name joined by a space). All four are empty strings for a missing
        name, so callers can always index the result by key.
    """
    if pd.isna(name):
        # Same shape as the parsed case. Returning "" here made
        # replace_nicknames fail with a TypeError on name_dict["first"].
        return {"first": "", "middle": "", "last": "", "full": ""}
    name = unidecode(str(name)).lower().strip()
    parsed = HumanName(name)
    return {
        "first": parsed.first.strip(),
        "middle": parsed.middle.strip(),
        "last": parsed.last.strip(),
        "full": f"{parsed.first} {parsed.last}".strip()
    }

# Parse every raw name into its component dictionary (one dict per row).
df["Name_parsed"] = df["Name"].map(clean_name)

# Short or variant first names, mapped to the form used for matching.
nickname_map = dict(
    bob="robert",
    rob="robert",
    liz="elizabeth",
    bill="william",
    katie="katherine",
    ajay="ajai",
    vijay="vijai",
)


def replace_nicknames(name_dict):
    """Swap a known nickname in ``name_dict["first"]`` for its mapped form.

    The dictionary is updated in place and also returned. ``"full"`` is
    rebuilt from the (possibly replaced) first name and the last name.
    """
    mapped_first = nickname_map.get(name_dict["first"], name_dict["first"])
    name_dict["first"] = mapped_first
    name_dict["full"] = "{} {}".format(mapped_first, name_dict["last"]).strip()
    return name_dict

# Replace known nicknames in each parsed name and rebuild its "full" field.
df["Name_parsed"] = df["Name_parsed"].map(replace_nicknames)

# === 3. Similarity functions ===
def name_similarity(name1, name2):
    """Score how alike two parsed names are.

    Both arguments are indexed by ``"first"``, ``"last"`` and ``"full"``.
    Returns 0 when either argument is falsy; otherwise the weighted sum
    0.5 * last + 0.3 * first + 0.2 * full of ``fuzz.token_sort_ratio`` scores.
    """
    if not (name1 and name2):
        return 0
    first_score, last_score, full_score = (
        fuzz.token_sort_ratio(name1[part], name2[part])
        for part in ("first", "last", "full")
    )
    # The last name weighs most, then the first name, then the full name.
    return 0.5 * last_score + 0.3 * first_score + 0.2 * full_score

def institution_similarity(inst1, inst2):
    """Fuzzy-compare two institution names; returns 0 if either is missing.

    Each value is converted with ``str()``, lowercased and passed through
    ``unidecode`` before ``fuzz.token_sort_ratio`` is applied.
    """
    if any(pd.isna(value) for value in (inst1, inst2)):
        return 0
    normalized = [unidecode(str(value).lower()) for value in (inst1, inst2)]
    return fuzz.token_sort_ratio(*normalized)

# === 4. Prepare database side ===
# Candidate names and institutions, positionally aligned (both come from df).
db_names = df["Name_parsed"].tolist()
db_insts = df["Institution_df2"].fillna("").tolist()

# === 5. Match loop ===
# For every row, score all candidates and keep the highest combined score
# (0.8 * name similarity + 0.2 * institution similarity).
# NOTE(review): candidates are taken from the same DataFrame as the rows being
# matched, so each row is also scored against itself -- confirm this is intended.
matches = []

for _, row in df.iterrows():
    fellowship_name = row["Name_parsed"]
    fellowship_inst = row.get("Institution", "")

    best_match = None
    best_score = 0
    best_inst = ""

    for db_name, db_inst in zip(db_names, db_insts):
        score_name = name_similarity(fellowship_name, db_name)
        score_inst = institution_similarity(fellowship_inst, db_inst)
        combined = 0.8 * score_name + 0.2 * score_inst

        # Strict ">" keeps the earliest candidate when scores tie.
        if combined > best_score:
            best_score = combined
            best_match = db_name
            best_inst = db_inst

    # Label confidence
    if best_score >= 90:
        note = "High confidence"
    elif best_score >= 75:
        note = "Medium confidence"
    else:
        note = "Low confidence"

    # best_match is still None when no candidate scored above 0 (and may be an
    # empty value for an unparsed name); report an empty matched name instead
    # of raising TypeError on best_match['first'].
    if not best_match:
        best_matched_name = ""
    else:
        best_matched_name = f"{best_match['first'].title()} {best_match['last'].title()}"

    matches.append({
        "Name": row["Name"],
        "Institution": fellowship_inst,
        "Best_Matched_Name": best_matched_name,
        "Best_Matched_Institution": best_inst,
        "Match_Score": round(best_score, 2),
        "Notes": note
    })

# === 6. Save match CSV ===
# One row per input record, written without the index column.
match_df = pd.DataFrame(data=matches)
match_df.to_csv(path_or_buf=output_path, index=False)

# Confirmation message followed by a preview of the first rows.
print(f"✅ Match CSV created successfully at:\n{output_path}", match_df.head(), sep="\n")

✅ Match CSV created successfully at:
/share/yin/nz268/name_matching/name_matching_results.csv
              Name Institution Best_Matched_Name  \
0        Adam  Roy         NaN          Adam Roy   
1       Adam Fouda     Chicago        Adam Fouda   
2     Aditya Nandy     Chicago      Aditya Nandy   
3     Aidan Crilly    Imperial      Aidan Crilly   
4  Akshay Ajagekar     Cornell   Akshay Ajagekar   

                Best_Matched_Institution  Match_Score              Notes  
0  "University of California, San Diego"        80.00  Medium confidence  
1              The University of Chicago        88.75  Medium confidence  
2              The University of Chicago        88.75  Medium confidence  
3                Imperial College London        90.32    High confidence  
4                                               80.00  Medium confidence  


In [16]:
import pandas as pd
from nameparser import HumanName
from unidecode import unidecode
from rapidfuzz import fuzz
from itertools import product

# 1. File paths: input table, nickname pairs, and the match report to write
input_path = "/share/yin/schmidt/intermediate/20251017_combined_grid_and_metascience_fixed.csv"
nicknames_path = "/share/yin/bing/schmidt_sciences/stage_1/names.csv"
output_path = "/share/yin/nz268/name_matching/name_matching_results.csv"

# 2. Load fellowship/database data and report how many rows were read
df = pd.read_csv(filepath_or_buffer=input_path)
print(f"Loaded dataset with {df.shape[0]} rows.")

# 3. Normalize and parse names 
def clean_name(name):
    """Parse a raw name into lowercase ``first``/``middle``/``last``/``full`` parts.

    The value is converted with ``str()``, passed through ``unidecode``,
    lowercased, stripped and parsed with ``HumanName``. A missing value
    (per ``pd.isna``) yields the same four keys, all set to "".
    """
    parts = dict.fromkeys(("first", "middle", "last", "full"), "")
    if pd.isna(name):
        return parts

    parsed = HumanName(unidecode(str(name)).lower().strip())
    parts["first"] = parsed.first.strip()
    parts["middle"] = parsed.middle.strip()
    parts["last"] = parsed.last.strip()
    # "full" deliberately leaves out the middle name.
    parts["full"] = f"{parsed.first} {parsed.last}".strip()
    return parts

# Parse every raw name into its component dictionary (one dict per row).
df["Name_parsed"] = df["Name"].map(clean_name)

# 4. Load and construct nickname map
nicknames_map = pd.read_csv(nicknames_path)

# Forward lookup (name1 -> set of name2) and reverse lookup (name2 -> set of name1)
nicknames_per_name = nicknames_map.groupby("name1")["name2"].agg(set).to_dict()
names_per_nickname = nicknames_map.groupby("name2")["name1"].agg(set).to_dict()

# Fold the reverse lookup into a copy of the forward one, so a name can be
# looked up regardless of which column it appeared in.
final_nicknames_map = dict(nicknames_per_name)
for name, reverse_names in names_per_nickname.items():
    final_nicknames_map[name] = final_nicknames_map.get(name, set()) | reverse_names

print(f"Nickname map loaded with {len(final_nicknames_map)} entries.")

# 5. Similarity functions
def name_similarity_with_nicknames(name1, name2, nickname_map):
    """Weighted fuzzy score for two parsed names, with nickname-aware first names.

    Parameters
    ----------
    name1, name2 : dict
        Parsed names indexed by ``"first"``, ``"last"`` and ``"full"``.
    nickname_map : dict
        Maps a first name to a set of alternative first names.

    Returns
    -------
    int or float
        0 if either name is falsy; otherwise
        0.5 * last + 0.3 * best first + 0.2 * full, where each term is a
        ``fuzz.token_sort_ratio`` score and "best first" is the highest score
        over all pairs of first-name variants.
    """
    # NOTE(review): the dictionaries returned by clean_name always carry four
    # keys and are therefore never falsy, so this guard does not fire for
    # missing names -- confirm whether blank names should score 0.
    if not (name1 and name2):
        return 0

    def first_name_variants(parsed):
        # The first name itself plus every alternative listed for it.
        given = parsed["first"]
        return {given} | nickname_map.get(given, set())

    pair_scores = [
        fuzz.token_sort_ratio(left, right)
        for left, right in product(first_name_variants(name1), first_name_variants(name2))
    ]
    best_first_score = max(pair_scores)

    # Last and full names are compared directly, without variants.
    score_last, score_full = (
        fuzz.token_sort_ratio(name1[part], name2[part]) for part in ("last", "full")
    )

    return 0.5 * score_last + 0.3 * best_first_score + 0.2 * score_full


def institution_similarity(inst1, inst2):
    """Return the fuzzy similarity of two institution names, or 0 if one is missing.

    Both values are converted with ``str()``, lowercased and passed through
    ``unidecode`` before being compared with ``fuzz.token_sort_ratio``.
    """
    if pd.isna(inst1):
        return 0
    if pd.isna(inst2):
        return 0
    left = unidecode(str(inst1).lower())
    right = unidecode(str(inst2).lower())
    return fuzz.token_sort_ratio(left, right)

# 6. Prepare reference lists
# Candidate names and institutions, positionally aligned (both come from df).
db_names = df["Name_parsed"].tolist()
db_insts = df["Institution_df2"].fillna("").tolist()

# 7. Match loop
# For every row, score all candidates and keep the highest combined score
# (0.8 * nickname-aware name similarity + 0.2 * institution similarity).
# NOTE(review): candidates are taken from the same DataFrame as the rows being
# matched, so each row is also scored against itself -- confirm this is intended.
matches = []

for _, row in df.iterrows():
    fellowship_name = row["Name_parsed"]
    fellowship_inst = row.get("Institution", "")

    best_match = None
    best_score = 0
    best_inst = ""

    # Compare with all database names
    for db_name, db_inst in zip(db_names, db_insts):
        score_name = name_similarity_with_nicknames(fellowship_name, db_name, final_nicknames_map)
        score_inst = institution_similarity(fellowship_inst, db_inst)
        combined = 0.8 * score_name + 0.2 * score_inst

        # Strict ">" keeps the earliest candidate when scores tie.
        if combined > best_score:
            best_score = combined
            best_match = db_name
            best_inst = db_inst

    # Confidence labels
    if best_score >= 90:
        note = "High confidence"
    elif best_score >= 75:
        note = "Medium confidence"
    else:
        note = "Low confidence"

    # best_match is still None when no candidate scored above 0; report an
    # empty matched name instead of raising TypeError on best_match['first'].
    if not best_match:
        best_matched_name = ""
    else:
        best_matched_name = f"{best_match['first'].title()} {best_match['last'].title()}"

    matches.append({
        "Name": row["Name"],
        "Institution": fellowship_inst,
        "Best_Matched_Name": best_matched_name,
        "Best_Matched_Institution": best_inst,
        "Match_Score": round(best_score, 2),
        "Notes": note
    })

# 8. Save results
match_df = pd.DataFrame(data=matches)
match_df.to_csv(path_or_buf=output_path, index=False)

# Low-confidence rows also go to a sibling file for manual review
is_low_confidence = match_df["Notes"].eq("Low confidence")
low_conf_df = match_df.loc[is_low_confidence]
low_conf_path = output_path.replace(".csv", "_low_confidence.csv")
low_conf_df.to_csv(path_or_buf=low_conf_path, index=False)

# Report both output locations, then preview the first rows
for message in (
    f"\n✅ Match CSV created successfully at:\n{output_path}",
    f"⚠️  Low-confidence matches saved separately at:\n{low_conf_path}",
    "\nPreview:",
    match_df.head(),
):
    print(message)


Loaded dataset with 277 rows.
Nickname map loaded with 2163 entries.

✅ Match CSV created successfully at:
/share/yin/nz268/name_matching/name_matching_results.csv
⚠️  Low-confidence matches saved separately at:
/share/yin/nz268/name_matching/name_matching_results_low_confidence.csv

Preview:
              Name Institution Best_Matched_Name  \
0        Adam  Roy         NaN          Adam Roy   
1       Adam Fouda     Chicago        Adam Fouda   
2     Aditya Nandy     Chicago      Aditya Nandy   
3     Aidan Crilly    Imperial      Aidan Crilly   
4  Akshay Ajagekar     Cornell   Akshay Ajagekar   

                Best_Matched_Institution  Match_Score              Notes  
0  "University of California, San Diego"        80.00  Medium confidence  
1              The University of Chicago        88.75  Medium confidence  
2              The University of Chicago        88.75  Medium confidence  
3                Imperial College London        90.32    High confidence  
4                 

In [17]:
def _save_confidence_subset(label, suffix):
    """Write the rows of match_df whose Notes equal `label` next to output_path.

    The file name is output_path with ".csv" replaced by `suffix`.
    Returns the filtered frame and the path it was written to.
    """
    subset = match_df[match_df["Notes"] == label]
    subset_path = output_path.replace(".csv", suffix)
    subset.to_csv(subset_path, index=False)
    return subset, subset_path


# Save High-confidence matches
high_conf_df, high_conf_path = _save_confidence_subset(
    "High confidence", "_high_confidence.csv"
)
print(f"High-confidence matches saved at: {high_conf_path}")

# Save Medium-confidence matches
medium_conf_df, medium_conf_path = _save_confidence_subset(
    "Medium confidence", "_medium_confidence.csv"
)
print(f"Medium-confidence matches saved at: {medium_conf_path}")

High-confidence matches saved at: /share/yin/nz268/name_matching/name_matching_results_high_confidence.csv
Medium-confidence matches saved at: /share/yin/nz268/name_matching/name_matching_results_medium_confidence.csv


### Match_Score = weighted fuzzy similarity (0–100)

Fuzzy similarity (fuzz.token_sort_ratio) compares fellowship and database names.
Each component (first, last, full name) gives a score between 0 and 100.

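1) Name-level weighting:
- Name_Score = 0.5 * Last + 0.3 * First + 0.2 * Full
    - The last name carries the most weight, the first name less, and the full name the least.
    - In the nickname-aware version, First is the highest score over all pairs of first-name variants (each first name plus its entries in the nickname map).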

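2) Institution adjustment:
- Final_Score = 0.8 * Name_Score + 0.2 * Institution_Score
    - The institution adds a minor weight to refine the match.
    - A missing institution scores 0, so a perfect name match with no institution gives a Final_Score of 80 (Medium confidence).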

3) Confidence labels:
- Final_Score ≥ 90 → High confidence
- 75 ≤ Final_Score < 90 → Medium confidence
- Final_Score < 75 → Low confidence