# Notebook 1 – Data Cleaning, Feature Engineering, & Entity Resolution
**Project:** Judicial Vacancy → Nomination/Confirmation Pipeline

*Initial draft generated via ChatGPT model o3 on 2025-07-12T02:40:38.399372Z*

In [None]:

import sys
from pathlib import Path

import pandas as pd
from loguru import logger
from rapidfuzz import fuzz, process

# Make the project package importable: the notebook's parent directory is
# pushed to the front of the module search path.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))


# Replace loguru's default handler with a single stderr sink at INFO level.
LOG_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level}</level> | "
    "<cyan>{function}</cyan> - "
    "<level>{message}</level>"
)
logger.remove()  # drop the handler loguru installs by default
logger.add(sys.stderr, format=LOG_FORMAT, level="INFO")

1

## Load dataframes from Raw data folder

Start with loading simpler, non-JSON-containing CSV files

In [None]:
from nomination_predictor.config import INTERIM_DATA_DIR, RAW_DATA_DIR

# Read every FJC CSV (plus the derived seat timeline) from the raw data folder.
# Files are read in the order listed; each one is bound to its own variable.
(
    fjc_judges,
    fjc_federal_judicial_service,
    fjc_demographics,
    fjc_education,
    fjc_other_federal_judicial_service,
    fjc_other_nominations_recess,
    seat_timeline,
) = (
    pd.read_csv(RAW_DATA_DIR / f"{stem}.csv")
    for stem in (
        "judges",
        "federal_judicial_service",
        "demographics",
        "education",
        "other_federal_judicial_service",
        "other_nominations_recess",
        "seat_timeline",
    )
)

[32m2025-07-13 12:02:27.407[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m103[0m - [1mProject root: /home/wsl2ubuntuuser/nomination_predictor[0m
[32m2025-07-13 12:02:27.409[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m127[0m - [1mConfiguration loaded[0m


In [None]:
# Read the two Congress API exports from the raw data folder.
cong_nominations, cong_nominees = (
    pd.read_csv(RAW_DATA_DIR / f"{stem}.csv") for stem in ("nominations", "nominees")
)

Explode JSON-containing congress DataFrames into separate DataFrames

In [None]:
from nomination_predictor.features import (explode_nomination_json,
                                           explode_nominee_json)

# Explode nomination JSON data into a dict of DataFrames keyed by table name
exploded_nominations = explode_nomination_json(cong_nominations)

# Bind each table to its own variable for IDE inspection.
# The lookups must read from `exploded_nominations`, the name assigned above;
# reading from an unassigned name raises NameError and leaves every
# cong_nomination_* variable undefined for the cells that follow.
cong_nomination_core = exploded_nominations['nominations']
cong_nomination_nominees = exploded_nominations['nominees']
cong_nomination_actions = exploded_nominations['actions']
cong_nomination_committees = exploded_nominations['committees']
cong_nomination_hearings = exploded_nominations['hearings']

[32m2025-07-13 12:02:28.033[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m665[0m - [1mProcessing 5746 nomination records[0m


Extracting JSON data: 100%|██████████| 5746/5746 [00:13<00:00, 426.12it/s] 


[32m2025-07-13 12:02:41.585[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m834[0m - [1mExtracted 5746 nomination records[0m
[32m2025-07-13 12:02:41.586[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m835[0m - [1mExtracted 0 nominee records[0m
[32m2025-07-13 12:02:41.586[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m836[0m - [1mExtracted 5513 action records[0m
[32m2025-07-13 12:02:41.586[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m837[0m - [1mExtracted 5453 committee records[0m
[32m2025-07-13 12:02:41.587[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mexplode_nomination_json[0m:[36m838[0m - [1mExtracted 864 hearing records[0m


NameError: name 'exploded_nom' is not defined

In [None]:
# Explode nominee JSON data, then bind each resulting table to its own
# variable for IDE inspection.
exploded_nominee = explode_nominee_json(cong_nominees)
cong_nominee_core, cong_nominee_orgs, cong_nominee_edu = (
    exploded_nominee[table]
    for table in ("nominees", "organizations", "educational_history")
)

In [None]:
# Gather every DataFrame loaded so far into one name -> DataFrame mapping so
# later cells can apply bulk operations. Each key equals the variable name.
dfs = dict(
    # FJC dataframes
    fjc_judges=fjc_judges,
    fjc_federal_judicial_service=fjc_federal_judicial_service,
    fjc_demographics=fjc_demographics,
    fjc_education=fjc_education,
    fjc_other_federal_judicial_service=fjc_other_federal_judicial_service,
    fjc_other_nominations_recess=fjc_other_nominations_recess,
    seat_timeline=seat_timeline,
    # Congress raw dataframes
    cong_nominations=cong_nominations,
    cong_nominees=cong_nominees,
    # Exploded nomination dataframes
    cong_nomination_core=cong_nomination_core,
    cong_nomination_nominees=cong_nomination_nominees,
    cong_nomination_actions=cong_nomination_actions,
    cong_nomination_committees=cong_nomination_committees,
    cong_nomination_hearings=cong_nomination_hearings,
    # Exploded nominee dataframes
    cong_nominee_core=cong_nominee_core,
    cong_nominee_orgs=cong_nominee_orgs,
    cong_nominee_edu=cong_nominee_edu,
)

In [None]:
# List every collected DataFrame with its row and column counts
print("Available dataframes:")
for name, df in dfs.items():
    n_rows, n_cols = len(df), len(df.columns)
    print(f"- {name}: {n_rows} rows × {n_cols} columns")

The JSON-containing files can be exploded and/or flattened in several different ways; which one is best depends on the use case. Below is the method I have settled on so far:

In [None]:
# Verify that 'citation' identifies rows uniquely in each Congress-derived DataFrame
from nomination_predictor.dataset import check_id_uniqueness

print("Checking uniqueness of nomination/nominee identifiers...")

for name, df in dfs.items():
    # Only DataFrames whose key starts with "cong" are inspected
    if not name.startswith("cong"):
        continue

    print(f"\n- Checking {name}...")
    if 'citation' not in df.columns:
        print(f"  ⚠️ Skipped: 'citation' column not found in {name}")
        continue

    # Any exception raised by the check is reported as a failure, not re-raised
    try:
        check_id_uniqueness(df, id_field='citation')
        print(f"  ✅ Passed: 'citation' is unique in {name}")
    except Exception as e:
        print(f"  ❌ Failed: {e}")

In [None]:
# NOTE(review): the author considered this export premature at this point in
# the notebook and described the cell as commented out; the loop below is
# nevertheless active and writes one CSV per non-empty DataFrame — confirm
# which is intended.

## Save extracted tables to interim directory
for name, df in dfs.items():
    if len(df) == 0:
        continue  # nothing to write for an empty DataFrame
    output_path = INTERIM_DATA_DIR / f"{name}.csv"
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} records to {output_path}")

#### Quick peek at all loaded dataframes

In [None]:
# Print each DataFrame's shape followed by its first rows
logger.info("Checking for general shape and first handfuls of rows")
for name, df in dfs.items():
    shape_line = f"{name:<35} → {df.shape}"
    print(shape_line, df.head(), sep="\n")

In [None]:
# Report the number of null values per column for every DataFrame
logger.info("Checking for null values")

for name, df in dfs.items():
    # Label each report: without the name, the per-column counts of the
    # different DataFrames run together and cannot be attributed.
    print(f"\n=== {name} ===")
    print(df.isnull().sum())

## Data cleaning

## Normalize column names for DataFrames

In [None]:
# Snapshot of the column names prior to normalization
print("=== Column Names Before ===")

for name in dfs:
    column_names = dfs[name].columns.tolist()
    print(f"{name:<35} → {column_names}")

In [None]:
# Run features.py's normalize_dataframe_columns over every DataFrame in dfs,
# then trim leading/trailing whitespace from every string cell.
from nomination_predictor.features import normalize_dataframe_columns

for name, df in dfs.items():
    dfs[name] = df = normalize_dataframe_columns(df).map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

In [None]:
# Snapshot of the column names once normalization has run
print("=== Column Names After ===")

for name in dfs:
    column_names = dfs[name].columns.tolist()
    print(f"{name:<35} → {column_names}")

### Drop non-judge roles from nominations & nominees

In [None]:
# Filter out non-judicial nominations using the function from features.py
from nomination_predictor.features import filter_non_judicial_nominations

# Title fragments that mark a nomination as non-judicial
non_judicial_titles = [
    "Attorney",
    "Board",
    "Commission",
    "Director",
    "Marshal",
    "Assistant",
    "Representative",
    "Secretary of",
    "Member of",
]

# Apply the filter
# FIXME: uncomment when we've downloaded enough nominees to be able to operate on
# NOTE(review): the key "cong_nomination_nominations" is not among the keys
# placed in `dfs` earlier in this notebook (the exploded nominations table is
# stored as "cong_nomination_core") — confirm the key before re-enabling.
#dfs["cong_nomination_nominations"], dfs["cong_nominees"] = filter_non_judicial_nominations(
#    dfs["cong_nomination_nominations"],
#    dfs["cong_nominees"],
#    non_judicial_titles=non_judicial_titles
#)

### Convert date strings to datetime objects

In [None]:
# Convert string columns whose name mentions a date-like keyword to datetime.
# Values that cannot be parsed are coerced (errors="coerce") instead of raising.
datetime_related_keywords = ("date", "year", "month")

for name, df in dfs.items():
    for col in df.columns:
        # Guard 1: the column name must contain one of the keywords
        if not any(keyword in col for keyword in datetime_related_keywords):
            continue
        # Guard 2: only object-dtype columns are converted
        if df[col].dtype != "object":
            continue
        logger.info(f"Converting {col} to datetime for {name}")
        df[col] = pd.to_datetime(df[col], errors="coerce")

### Normalize court names

In [None]:
# For every object-dtype column whose name mentions a court-like keyword,
# casefold the values and collapse each run of whitespace into one underscore.
court_describing_keywords = ("court", "circuit", "district")


for name, df in dfs.items():
    for col in df.columns:
        is_court_column = any(
            keyword in col.casefold() for keyword in court_describing_keywords
        )
        if is_court_column and df[col].dtype == object:
            logger.info(f"Normalizing column named {col} in {name}")
            # regex=True is required: Series.str.replace treats the pattern as
            # a literal string by default in pandas >= 2.0, so without it the
            # text "\s+" is searched for verbatim and no whitespace is replaced.
            df[col] = df[col].str.casefold().str.replace(r"\s+", "_", regex=True)

### Count and display unique values under each column

In [None]:
# Display the number of unique values in every column, columns in sorted order
for name, df in dfs.items():
    for col in sorted(df.columns):
        unique_count = df[col].nunique()
        print(f"{name} - {col}: {unique_count} unique values")

### Set nid and citation as index for whichever dataframes intentionally use them uniquely

In [None]:
# Index the tables that are expected to hold one row per nid by that nid.
# The column is kept (drop=False) and verify_integrity=True makes pandas raise
# if the values turn out not to be unique.
for unique_nid_table in ("fjc_judges", "fjc_demographics"):
    dfs[unique_nid_table].set_index('nid', drop=False, inplace=True, verify_integrity=True)


# congress' nominee dataframes are more consistent than the nominations
# dataframes about the 'citation' field being unique.
# NOTE(review): none of the keys placed in `dfs` earlier in this notebook
# starts with "cong_nominees_" (the nominee tables are "cong_nominees",
# "cong_nominee_core", "cong_nominee_orgs", "cong_nominee_edu"), so this loop
# appears to select nothing — confirm the intended prefix.
for name, df in dfs.items():
    if not name.startswith("cong_nominees_"):
        continue
    logger.info(f"Setting index for {name}")
    df.set_index('citation', drop=False, inplace=True, verify_integrity=True)

## Fuzzy-matching FJC judges to Congress.gov nominees

In [None]:
# Enrich the nominees dataframe with name fields and court information from nominations for the sake of the fuzzy-matcher
from nomination_predictor.features import (enrich_congress_nominees_dataframe,
                                           enrich_fjc_judges)

# `dfs` is a plain dict, so its entries must be read and written with
# subscript syntax; attribute access (dfs.cong_nominees) raises AttributeError.
dfs["cong_nominees"] = enrich_congress_nominees_dataframe(
    dfs["cong_nominees"], dfs["cong_nominations"]
)

# Enrich the FJC judges dataframe with full name fields for the sake of the fuzzy-matcher
# NOTE(review): fjc_combined is only assigned in the later "Combining FJC data"
# cell, so this line raises NameError when the notebook is run top to bottom —
# confirm whether that cell should run first or a different frame is meant here.
fjc_combined = enrich_fjc_judges(fjc_combined)

In [None]:
# Blocking step: bucket the FJC judges by exact value of the "last" column so
# the matcher only compares a nominee against judges sharing that value.
blocks = {surname: same_surname for surname, same_surname in fjc_judges.groupby("last")}

def candidate_fjc_rows(row):
    """Return the block of FJC judges stored under ``row["last"]``.

    Args:
        row: Mapping-like record that exposes a "last" entry.

    Returns:
        The entry of the module-level ``blocks`` mapping for that value, or
        an empty DataFrame when there is no such entry.
    """
    no_candidates = pd.DataFrame()
    surname = row["last"]
    if surname in blocks:
        return blocks[surname]
    return no_candidates

In [None]:
# NOTE(review): clean_name, split_name and normalised_court are not defined or
# imported in any cell visible above, and fjc_service is first assigned in a
# later cell — confirm where they come from and the intended run order.
def _split_name_to_series(name):
    """Wrap split_name's result in a Series so apply() expands it into columns."""
    return pd.Series(split_name(name))


name_part_columns = ["first", "middle", "last"]

# --- Clean Congress nominees ------------------------------------------------
cong_nominees["full_name_clean"] = cong_nominees["full_name"].apply(clean_name)
cong_nominees[name_part_columns] = cong_nominees["full_name_clean"].apply(_split_name_to_series)

cong_nominees["court_clean"] = cong_nominees["organization"].apply(normalised_court)
# No errors="coerce" here: an unparseable nomination date raises
cong_nominees["nomination_date"] = pd.to_datetime(cong_nominees["nomination_date"])

# --- Clean FJC judges -------------------------------------------------------
fjc_judges["full_name_clean"] = fjc_judges["name_full"].apply(clean_name)
fjc_judges[name_part_columns] = fjc_judges["full_name_clean"].apply(_split_name_to_series)

# Service records map each nid to courts and dates used to validate a match
fjc_service["court_clean"] = fjc_service["court_name"].apply(normalised_court)
for date_column in ("nomination_date", "commission_date"):
    # Unparseable service dates are coerced rather than raising
    fjc_service[date_column] = pd.to_datetime(fjc_service[date_column], errors="coerce")

In [None]:

def best_match(row):
    """Pick the FJC judge that best matches one Congress nominee record.

    Candidates come from ``candidate_fjc_rows(row)``. Each candidate gets a
    combined score ``0.6 * name + 0.3 * date + 0.1 * court``, where every
    component lies between 0 and 100.

    Args:
        row: Nominee record providing "full_name_clean", "nomination_date"
            and "court_clean" (plus whatever ``candidate_fjc_rows`` reads).

    Returns:
        Tuple ``(nid, score)`` with the score rounded to one decimal.
        ``(pd.NA, 0.0)`` when there are no candidates or none scores above 0.
    """
    candidates = candidate_fjc_rows(row)
    if candidates.empty:
        return pd.NA, 0.0

    nominee_court = row["court_clean"]
    # A missing court can arrive as NaN, which is truthy and would raise
    # TypeError in the substring test below; only non-empty strings qualify.
    has_court = isinstance(nominee_court, str) and bool(nominee_court)

    # Compute combined score: name similarity + court similarity + date proximity
    best_score = 0.0
    best_nid = pd.NA
    for _, cand in candidates.iterrows():
        name_score = fuzz.token_set_ratio(row["full_name_clean"], cand["full_name_clean"])
        # Use service records to find any matching nomination date
        entries = fjc_service[fjc_service["nid"] == cand["nid"]]
        date_score = 0
        court_score = 0
        if not entries.empty:
            # Smallest absolute diff in days
            diffs = (entries["nomination_date"] - row["nomination_date"]).abs().dt.days
            if diffs.notna().any():
                # Clamp at 0 so a distant date contributes nothing instead of
                # a large negative value that swamps the name and court scores.
                date_score = max(0, 100 - diffs.min())
            if has_court:
                # Ignore missing (non-string) court values in the service records
                service_courts = [c for c in entries["court_clean"] if isinstance(c, str)]
                if any(nominee_court in c for c in service_courts):
                    court_score = 100
                else:
                    court_score = max(
                        (fuzz.partial_ratio(nominee_court, c) for c in service_courts),
                        default=0,
                    )
        total = 0.6*name_score + 0.3*date_score + 0.1*court_score
        if total > best_score:
            best_score, best_nid = total, cand["nid"]
    return best_nid, round(best_score, 1)

In [None]:
# Import the new filter_confirmed_nominees function
from nomination_predictor.features import (analyze_match_failures,
                                           filter_confirmed_nominees,
                                           load_simpler_dataframes)

# Reload the dataframes through the project helper (this rebinds `dfs`) and
# pull out the four tables the matcher works with.
dfs = load_simpler_dataframes(RAW_DATA_DIR)
cong_nominees, fjc_judges, fjc_service, cong_nominations = (
    dfs[key] for key in ("cong_nominees", "fjc_judges", "fjc_service", "cong_nominations")
)

# OPTIMIZATION: restrict matching to confirmed nominees, so best_match runs
# on fewer rows.
confirmed_nominees = filter_confirmed_nominees(cong_nominees, cong_nominations)
print(f"Focusing on {len(confirmed_nominees)} confirmed nominees out of {len(cong_nominees)} total nominees")

# best_match returns an (nid, score) pair per row; result_type="expand"
# spreads the pair over two columns.
match_columns = ["match_nid", "match_score"]
confirmed_nominees[match_columns] = confirmed_nominees.apply(
    best_match, axis=1, result_type="expand"
)

# Left-merge onto the full nominee table so every record is preserved;
# nominees that were not matched get missing values in the match columns.
cong_nominees = cong_nominees.merge(
    confirmed_nominees[["citation", *match_columns]],
    how="left",
    on="citation",
)

In [None]:

# Keep the nominees whose best match score reaches the threshold and save them
THRESHOLD = 80
meets_threshold = cong_nominees["match_score"] >= THRESHOLD
matches = cong_nominees.loc[meets_threshold].copy()
print(f"Matched {len(matches)}/{len(cong_nominees)} nominees with score ≥ {THRESHOLD}")
matches_path = INTERIM_DATA_DIR / "congress_fjc_nominee_matches.csv"
matches.to_csv(matches_path, index=False)

In [None]:
## FIXME: decide whether to save as separate vs. overwrite in interim folder
## Save the cleaned interim datasets for downstream notebooks
#cong_nominees.to_csv(INTERIM_DATA_DIR / "congress_nominees_cleaned.csv", index=False)
#fjc_judges.to_csv(INTERIM_DATA_DIR / "fjc_judges_cleaned.csv", index=False)
#fjc_service.to_csv(INTERIM_DATA_DIR / "fjc_service_cleaned.csv", index=False)

In [None]:
from nomination_predictor.features import analyze_match_failures

# Split nominees at the score threshold, then inspect why the rest failed
THRESHOLD = 80
above_threshold = cong_nominees["match_score"] >= THRESHOLD
matches = cong_nominees.loc[above_threshold].copy()
print(f"Matched {len(matches)}/{len(cong_nominees)} nominees with score ≥ {THRESHOLD}")

# analyze_match_failures yields the unmatched rows, a summary of failure
# reasons, and example records keyed by reason
unmatched_df, reason_summary, examples = analyze_match_failures(cong_nominees, THRESHOLD)

print("\nFailure Reason Summary:")
display(reason_summary)

print("\nExample records for each failure type:")
for reason, example_df in examples.items():
    print(f"\n{reason}:")
    display(example_df)

# Persist both halves for further analysis
for frame, filename in (
    (matches, "congress_fjc_nominee_matches.csv"),
    (unmatched_df, "congress_fjc_nominee_unmatched.csv"),
):
    frame.to_csv(INTERIM_DATA_DIR / filename, index=False)

In [None]:
## FIXME: decide whether to save as separate vs. overwrite in interim folder
## Save the cleaned interim datasets for downstream notebooks
#cong_nominees.to_csv(INTERIM_DATA_DIR / "congress_nominees_cleaned.csv", index=False)
#fjc_judges.to_csv(INTERIM_DATA_DIR / "fjc_judges_cleaned.csv", index=False)
#fjc_service.to_csv(INTERIM_DATA_DIR / "fjc_service_cleaned.csv", index=False)

## Combining FJC data

### Handling nominees' education and job history

Before we combine FJC data, we have to consider whether/how to handle judges' education, job history, age, ABA rating, etc., because the only other table in the FJC data which handles nid uniquely is "demographics," which are unchanging.
The simplest way to handle the non-unique-nid tables would be to left-merge on "nid" and keep only the most recently dated row.  In most cases this would likely keep the most prestigious degree or job.

However, it is entirely likely a judge's education or job history has changed substantially since their first nomination, and affected their qualifications for each later nomination.

All of these indicate to me that it's worth considering the judge's position, education, etc., not as of the most recent records available, but instead _as of when they were nominated._

That means we can't do a simple left-join of all of our FJC data.  Instead, we have to -- using a combination of names, court locations, and vacancy dates -- fuzzy-match to find which "nid" corresponds to each "citation" in the Congress data, as our way of bridging between FJC judges and Congress' nominee data. We then use the "received date" for that citation as a cutoff date when we look up education and job records by "nid" -- so we avoid mistakenly linking to a citation any education or job records dated after that cutoff date.

Thankfully we do have the school, degree, and degree_year in the education record, for both their bachelors and their masters and their associate degree(s) and LLB and J.D. etc., so we can look that up.  The education dataframe even comes with a "sequence" number for each education record, which is an even easier-to-use indicator of chronological order than the degree_year for any given "nid" lookup for a judge.

Job history is more challenging to deal with because literally every row entry in that dataframe lists it uniquely, but we do have the data available.  On early attempts, it may be simplest to ignore it; then feature-engineer basic booleans for whether they did/didn't have experience in common-phrase-identifiable positions such as "Private practice" or "Attorney general" or "Navy" or "Army" etc.; eventually a parser can look for the year spreads listed there as a rough indicator of amounts of experience gleaned from each professional role.

In [None]:
# Left-joins all dataframes whose names start with "fjc", joining them on their columns named "nid"
# Warns if any shared column names contain non-identical data

from loguru import logger

from nomination_predictor.features import left_join_fjc_dataframes

# Build the combined FJC dataframe from the tables collected in `dfs`.
# Any failure is logged and then re-raised so the cell still stops on error.
try:
    fjc_combined = left_join_fjc_dataframes(dfs)

    if fjc_combined is not None:
        logger.info(f"Successfully created combined FJC dataframe with {len(fjc_combined)} rows and {len(fjc_combined.columns)} columns")
        # Display the first few rows of the result. A bare `.head()` expression
        # is only rendered when it is the last top-level statement of a cell,
        # so inside this try/if it has to be passed to display() explicitly.
        display(fjc_combined.head())
    else:
        logger.error("Failed to create combined FJC dataframe")
except Exception as e:
    logger.error(f"Error joining FJC dataframes: {str(e)}")
    raise

### Build predecessor lookup table

In [None]:
# Create the predecessor lookup table from the seat timeline.
# `seat_timeline` is the frame read from seat_timeline.csv at the top of the
# notebook; no `seat_timeline_df` is ever assigned.
# NOTE(review): get_predecessor_info is not imported in any cell visible in
# this notebook — confirm its module and add the import. Also confirm whether
# it expects the raw frame used here or the column-normalized copy in `dfs`.
predecessor_lookup = get_predecessor_info(seat_timeline)
print(f"Created predecessor lookup: {len(predecessor_lookup)} records")

# Preview the predecessor lookup
print(predecessor_lookup.head())

# Register the table in `dfs`, the dataframe dictionary used throughout this
# notebook; no `all_dataframes` mapping is ever assigned.
dfs['predecessor_lookup'] = predecessor_lookup