# Notebook 1 – Data Cleaning, Feature Engineering, & Entity Resolution
**Project:** Judicial Vacancy → Nomination/Confirmation Pipeline

*Initial draft generated via ChatGPT model o3 on 2025-07-12T02:40:38.399372Z*

In [None]:

import json
import os
import re
import string
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
from loguru import logger
from rapidfuzz import fuzz, process

# Add the project root to the path so we can import our modules
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from nomination_predictor.config import INTERIM_DATA_DIR, RAW_DATA_DIR
from nomination_predictor.congress_api_utils import \
    enrich_congress_nominees_dataframe

# Setup logging
logger.remove()  # Remove default handler
logger.add(sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | <cyan>{function}</cyan> - <level>{message}</level>", level="INFO")

6

In [None]:

# ---------------------------------------------------------------------------
# Helper: clean / normalize names
def clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = str(name).upper()
    name = re.sub(r"[\.,]", "", name)          # drop punctuation
    name = re.sub(r"\s+", " ", name).strip()
    return name

def split_name(name: str):
    """
    Very naive splitter: returns first, middle (maybe empty), last
    """
    parts = clean_name(name).split()
    if not parts:
        return "", "", ""
    if len(parts) == 1:
        return parts[0], "", ""
    if len(parts) == 2:
        return parts[0], "", parts[1]
    return parts[0], " ".join(parts[1:-1]), parts[-1]

def normalised_court(text: str) -> str:
    if pd.isna(text):
        return ""
    text = text.upper().replace("UNITED STATES", "").replace("U.S.", "").strip()
    text = re.sub(r"\s+", " ", text)
    return text

In [None]:
from nomination_predictor.feature_engineering_helpers import (
    analyze_match_failures, load_and_prepare_dataframes)

# Load and prepare all dataframes
dfs = load_and_prepare_dataframes(RAW_DATA_DIR)
cong_nominees = dfs["cong_nominees"]  # This now has all the derived fields
fjc_judges    = dfs["fjc_judges"]   
fjc_service   = dfs["fjc_service"]
cong_noms     = dfs["cong_noms"]

print("Loaded:",
      len(fjc_judges), "judges",
      len(fjc_service), "service records",
      len(cong_nominees), "congress nominees",
      len(cong_noms), "nominations")

[32m2025-07-12 00:44:47[0m | [1mINFO[0m | [36mload_and_prepare_dataframes[0m - [1mLoaded 4022 judges, 4720 service records, 285 congress nominees, 285 nominations[0m


Loaded: 4022 judges 4720 service records 285 congress nominees 285 nominations


In [None]:
# --- Clean Congress nominees ------------------------------------------------
cong_nominees["full_name_clean"] = cong_nominees["full_name"].apply(clean_name)
cong_nominees[["first","middle","last"]] = cong_nominees["full_name_clean"].apply(
    lambda n: pd.Series(split_name(n)))

cong_nominees["court_clean"] = cong_nominees["organization"].apply(normalised_court)
cong_nominees["nomination_date"] = pd.to_datetime(cong_nominees["nomination_date"])

# --- Clean FJC judges -------------------------------------------------------
fjc_judges["full_name_clean"] = fjc_judges["name_full"].apply(clean_name)
fjc_judges[["first","middle","last"]] = fjc_judges["full_name_clean"].apply(
    lambda n: pd.Series(split_name(n)))

# We'll need a mapping from nid to service records for date & court validation
fjc_service["court_clean"] = fjc_service["court_name"].apply(normalised_court)
fjc_service["nomination_date"] = pd.to_datetime(fjc_service["nomination_date"], errors="coerce")
fjc_service["commission_date"] = pd.to_datetime(fjc_service["commission_date"], errors="coerce")

In [None]:

# Block by last name exact match
blocks = {}
for lname, group in fjc_judges.groupby("last"):
    blocks[lname] = group

def candidate_fjc_rows(row):
    return blocks.get(row["last"], pd.DataFrame())

In [None]:

def best_match(row):
    candidates = candidate_fjc_rows(row)
    if candidates.empty:
        return pd.NA, 0.0
    # Compute combined score: name similarity + court similarity + date proximity
    best_score = 0.0
    best_nid = pd.NA
    for _, cand in candidates.iterrows():
        name_score = fuzz.token_set_ratio(row["full_name_clean"], cand["full_name_clean"])
        # Use service records to find any matching nomination date
        entries = fjc_service[fjc_service["nid"] == cand["nid"]]
        date_score = 0
        court_score = 0
        if not entries.empty:
            # Smallest absolute diff in days
            diffs = (entries["nomination_date"] - row["nomination_date"]).abs().dt.days
            date_score = 100 - diffs.min() if diffs.notna().any() else 0
            # any court string overlap
            if row["court_clean"]:
                if any(row["court_clean"] in c for c in entries["court_clean"]):
                    court_score = 100
                else:
                    court_score = max(fuzz.partial_ratio(row["court_clean"], c) for c in entries["court_clean"])
        total = 0.6*name_score + 0.3*date_score + 0.1*court_score
        if total > best_score:
            best_score, best_nid = total, cand["nid"]
    return best_nid, round(best_score,1)

cong_nominees[["match_nid","match_score"]] = cong_nominees.apply(
    best_match, axis=1, result_type="expand")

In [None]:

THRESHOLD = 80
matches = cong_nominees[cong_nominees["match_score"] >= THRESHOLD].copy()
print(f"Matched {len(matches)}/{len(cong_nominees)} nominees with score ≥ {THRESHOLD}")
matches.to_csv(INTERIM_DATA_DIR / "congress_fjc_nominee_matches.csv", index=False)

# Save the cleaned interim datasets for downstream notebooks
cong_nominees.to_csv(INTERIM_DATA_DIR / "congress_nominees_cleaned.csv", index=False)
fjc_judges.to_csv(INTERIM_DATA_DIR / "fjc_judges_cleaned.csv", index=False)
fjc_service.to_csv(INTERIM_DATA_DIR / "fjc_service_cleaned.csv", index=False)

Matched 140/285 nominees with score ≥ 80


In [16]:
THRESHOLD = 80
matches = cong_nominees[cong_nominees["match_score"] >= THRESHOLD].copy()
print(f"Matched {len(matches)}/{len(cong_nominees)} nominees with score ≥ {THRESHOLD}")

# Analyze unmatched records to understand why they didn't match
unmatched_df, reason_summary, examples = analyze_match_failures(cong_nominees, THRESHOLD)

# Display summary of failure reasons
print("\nFailure Reason Summary:")
display(reason_summary)

# Display a few examples of each failure type
print("\nExample records for each failure type:")
for reason, example_df in examples.items():
    print(f"\n{reason}:")
    display(example_df)

# Save both matched and unmatched datasets for further analysis
matches.to_csv(INTERIM_DATA_DIR / "congress_fjc_nominee_matches.csv", index=False)
unmatched_df.to_csv(INTERIM_DATA_DIR / "congress_fjc_nominee_unmatched.csv", index=False)

# Save the cleaned interim datasets for downstream notebooks
cong_nominees.to_csv(INTERIM_DATA_DIR / "congress_nominees_cleaned.csv", index=False)
fjc_judges.to_csv(INTERIM_DATA_DIR / "fjc_judges_cleaned.csv", index=False)
fjc_service.to_csv(INTERIM_DATA_DIR / "fjc_service_cleaned.csv", index=False)

Matched 140/285 nominees with score ≥ 80

Failure Reason Summary:


Unnamed: 0,Failure Reason,Count
0,No potential match candidates found,121
1,Very low similarity - likely different person,5
2,Marginal match (score 76.5) - check name and c...,3
3,Marginal match (score 78.5) - check name and c...,2
4,Marginal match (score 66.6) - check name and c...,2
5,Marginal match (score 67.4) - check name and c...,1
6,Marginal match (score 51.7) - check name and c...,1
7,Marginal match (score 61.8) - check name and c...,1
8,Marginal match (score 77.3) - check name and c...,1
9,Marginal match (score 78.4) - check name and c...,1



Example records for each failure type:

No potential match candidates found:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
0,James Graham Lake,THE JUDICIARY,0.0,No potential match candidates found
1,Nicholas George Miranda,THE JUDICIARY,0.0,No potential match candidates found
5,Philip S. Hadji,THE JUDICIARY,0.0,No potential match candidates found



Very low similarity - likely different person:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
2,Lisa W. Wang,THE JUDICIARY,32.9,Very low similarity - likely different person
20,Joshua Paul Kolar,THE JUDICIARY,45.7,Very low similarity - likely different person
21,Eumi K. Lee,THE JUDICIARY,40.4,Very low similarity - likely different person



Marginal match (score 76.5) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
13,David Seymour Leibowitz,THE JUDICIARY,76.5,Marginal match (score 76.5) - check name and c...
24,Jacqueline Becerra,THE JUDICIARY,76.5,Marginal match (score 76.5) - check name and c...
26,Melissa Damian,THE JUDICIARY,76.5,Marginal match (score 76.5) - check name and c...



Marginal match (score 78.5) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
34,Gretchen S. Lund,THE JUDICIARY,78.5,Marginal match (score 78.5) - check name and c...
36,Nicole G. Berner,THE JUDICIARY,78.5,Marginal match (score 78.5) - check name and c...



Marginal match (score 66.6) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
23,Edward Sunyol Kiel,THE JUDICIARY,66.6,Marginal match (score 66.6) - check name and c...
25,Sarah French Russell,THE JUDICIARY,66.6,Marginal match (score 66.6) - check name and c...



Marginal match (score 67.4) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
14,Seth Robert Aframe,THE JUDICIARY,67.4,Marginal match (score 67.4) - check name and c...



Marginal match (score 51.7) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
6,Joseph Albert Laroski Jr.,THE JUDICIARY,51.7,Marginal match (score 51.7) - check name and c...



Marginal match (score 61.8) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
16,Mustafa Taher Kasubhai,THE JUDICIARY,61.8,Marginal match (score 61.8) - check name and c...



Marginal match (score 77.3) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
31,Amy M. Baggio,THE JUDICIARY,77.3,Marginal match (score 77.3) - check name and c...



Marginal match (score 78.4) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
35,Kirk Edward Sherriff,THE JUDICIARY,78.4,Marginal match (score 78.4) - check name and c...



Marginal match (score 75.7) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
37,Julie Simone Sneed,THE JUDICIARY,75.7,Marginal match (score 75.7) - check name and c...



Marginal match (score 58.5) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
51,Roy W. Minter Jr.,DEPARTMENT OF JUSTICE,58.5,Marginal match (score 58.5) - check name and c...



Marginal match (score 55.1) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
65,Carmen G. Iguina Gonzalez,THE JUDICIARY,55.1,Marginal match (score 55.1) - check name and c...



Marginal match (score 59.8) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
120,David O. Barnett Jr.,DEPARTMENT OF JUSTICE,59.8,Marginal match (score 59.8) - check name and c...



Marginal match (score 58.0) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
153,Charles J. Willoughby Jr.,THE JUDICIARY,58.0,Marginal match (score 58.0) - check name and c...



Marginal match (score 52.4) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
160,Charles J. Willoughby Jr.,THE JUDICIARY,52.4,Marginal match (score 52.4) - check name and c...



Marginal match (score 64.0) - check name and court:


Unnamed: 0,full_name,court_clean,match_score,failure_reason
212,Roy W. Minter Jr.,DEPARTMENT OF JUSTICE,64.0,Marginal match (score 64.0) - check name and c...
