# Notebook 1 – Data Cleaning, Feature Engineering, & Entity Resolution
**Project:** Judicial Vacancy → Nomination/Confirmation Pipeline

*Initial draft generated via ChatGPT model o3 on 2025-07-12T02:40:38.399372Z*

In [None]:

import sys
from pathlib import Path

import pandas as pd
from loguru import logger
from rapidfuzz import fuzz, process

# Make the project's own modules importable: the project root is taken to be
# the parent of the current working directory and is put first on sys.path.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Logging: drop loguru's default handler and log to stderr at INFO level
# using a timestamp | level | function - message layout.
LOG_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level}</level> | "
    "<cyan>{function}</cyan> - "
    "<level>{message}</level>"
)
logger.remove()
logger.add(sys.stderr, level="INFO", format=LOG_FORMAT)

## Load dataframes from Raw data folder

Start with loading simpler, non-JSON-containing CSV files

In [None]:
from nomination_predictor.config import INTERIM_DATA_DIR, RAW_DATA_DIR

# Load the FJC tables (and the derived seat timeline) from the raw-data folder.
def _read_raw_csv(stem):
    """Read ``RAW_DATA_DIR/<stem>.csv`` into a DataFrame."""
    return pd.read_csv(RAW_DATA_DIR / f"{stem}.csv")


fjc_judges = _read_raw_csv("judges")
fjc_federal_judicial_service = _read_raw_csv("federal_judicial_service")
fjc_demographics = _read_raw_csv("demographics")
fjc_education = _read_raw_csv("education")
fjc_other_federal_judicial_service = _read_raw_csv("other_federal_judicial_service")
fjc_other_nominations_recess = _read_raw_csv("other_nominations_recess")
fjc_professional_career = _read_raw_csv("professional_career")
seat_timeline = _read_raw_csv("seat_timeline")

Flatten JSON-containing congress DataFrames into separate DataFrames

In [None]:
from nomination_predictor.features import flatten_json_dataframe

# Read the two Congress API exports as-is; each carries its payload as JSON in
# a single column ("nomination" / "nominee").
cong_nominations_raw = pd.read_csv(RAW_DATA_DIR / "nominations.csv")
cong_nominees_raw = pd.read_csv(RAW_DATA_DIR / "nominees.csv")

# Flatten the nominations JSON: take up to 10 list items and join nested keys
# with "_".
nominations_flatten_options = {
    "json_col": "nomination",
    "max_list_index": 10,
    "separator": "_",
}
cong_nominations = flatten_json_dataframe(
    df=cong_nominations_raw,
    **nominations_flatten_options,
)

# Flatten the nominees JSON: take up to 5 list items; the separator is left at
# the helper's default.
nominees_flatten_options = {
    "json_col": "nominee",
    "max_list_index": 5,
}
cong_nominees = flatten_json_dataframe(
    df=cong_nominees_raw,
    **nominees_flatten_options,
)

In [None]:
# Registry of every loaded table, keyed by name, so that later cells can run
# bulk operations with one loop. Insertion order: FJC tables, then Congress.
fjc_tables = {
    "fjc_judges": fjc_judges,
    "fjc_federal_judicial_service": fjc_federal_judicial_service,
    "fjc_demographics": fjc_demographics,
    "fjc_education": fjc_education,
    "fjc_other_federal_judicial_service": fjc_other_federal_judicial_service,
    "fjc_other_nominations_recess": fjc_other_nominations_recess,
    "fjc_professional_career": fjc_professional_career,
    "seat_timeline": seat_timeline,
}
congress_tables = {
    "cong_nominations": cong_nominations,
    "cong_nominees": cong_nominees,
}
dfs = {**fjc_tables, **congress_tables}

In [None]:
# One line per registered table: its name and its row/column counts.
print("Available dataframes:")
for table_name, table in dfs.items():
    n_rows = len(table)
    n_cols = len(table.columns)
    print(f"- {table_name}: {n_rows} rows × {n_cols} columns")

#### Quick peek at all loaded dataframes

In [None]:
# Print each table's shape followed by its first rows.
logger.info("Checking for general shape and first handfuls of rows")
for table_name, table in dfs.items():
    shape = table.shape
    print(f"{table_name:<35} → {shape}")
    first_rows = table.head()
    print(first_rows)

In [None]:
# Report the number of nulls per column for every table.
logger.info("Checking for null values")

for table_name, table in dfs.items():
    # Label each report with the table it belongs to: the per-column counts
    # alone do not say which dataframe they came from.
    print(f"{table_name:<35} → null counts per column")
    print(table.isnull().sum())

## Data cleaning

### Normalize column names for DataFrames

In [None]:
# Snapshot of every table's column names before normalization.
print("=== Column Names Before ===")

for table_name, table in dfs.items():
    column_names = table.columns.tolist()
    print(f"{table_name:<35} → {column_names}")

In [None]:
# call features.py's normalize_dataframe_columns function on all DataFrames in dfs, and strip leading and trailing whitespace in all strings
from nomination_predictor.features import normalize_dataframe_columns

def _strip_if_str(value):
    """Return *value* with surrounding whitespace removed when it is a string."""
    return value.strip() if isinstance(value, str) else value


# Normalize each table's column names, trim every string cell, and store the
# result back under the same key (keys are only reassigned, never added).
for table_name, table in dfs.items():
    dfs[table_name] = normalize_dataframe_columns(table).map(_strip_if_str)

In [None]:
# Snapshot of every table's column names after normalization.
print("=== Column Names After ===")

for table_name, table in dfs.items():
    column_names = table.columns.tolist()
    print(f"{table_name:<35} → {column_names}")

### Left-merge nominees table onto nominations table
This will make the cong_noms dataframe we'll use for most of our congress data operations below in this notebook.

In [None]:
from nomination_predictor.features import merge_nominees_onto_nominations

# Merge the nominees table onto the nominations table and register the result
# as dfs["cong_noms"]. This stays best-effort: problems are logged, not raised.
try:
    # The inputs live in `dfs`, so a missing one surfaces as KeyError
    # (NameError only if `dfs` itself has not been created yet).
    nominations_df = dfs["cong_nominations"]
    nominees_df = dfs["cong_nominees"]
except (KeyError, NameError):
    logger.error("Required dataframes (cong_nominations, cong_nominees) are not defined")
else:
    try:
        cong_noms = merge_nominees_onto_nominations(nominations_df, nominees_df)

        # Register the result before any reporting, so a display/logging
        # failure cannot discard a successful merge.
        dfs["cong_noms"] = cong_noms

        # Show sample of the merged dataframe
        display(cong_noms.head())

        # Report on the merge results, using the frames that were actually merged
        logger.info(f"Original nominations shape: {nominations_df.shape}")
        logger.info(f"Original nominees shape: {nominees_df.shape}")
        logger.info(f"Merged dataframe shape: {cong_noms.shape}")
    except Exception as e:
        logger.error(f"Error in merge process: {e}")

### Drop rows whose congressional citations end in -0
All of these I've seen either:

- lack strictly-necessary information such as nomination & confirmation dates,
- lack helpful information such as the person's name, or 
- whatever little information they do have indicates it's not for a position as a judge (e.g. for secretary of defense, assistant secretary to something-or-other, etc.)

In [None]:
from nomination_predictor.features import filter_dash_zero_citations

# Replace the stored cong_noms frame with whatever filter_dash_zero_citations
# returns (presumably the same frame minus the rows whose citation ends in
# "-0"; the helper lives in features.py and is not visible here).
dfs["cong_noms"] = filter_dash_zero_citations(dfs["cong_noms"])

### Drop non-judge nominations based on position title

In [None]:
# Filter out non-judicial nominations using the function from features.py
from nomination_predictor.features import filter_non_judicial_nominations

# Title fragments treated as markers of a non-judicial position; they are
# handed to the filter helper, whose result replaces the stored cong_noms frame.
non_judicial_titles = [
    "Attorney",
    "Board",
    "Commission",
    "Director",
    "Marshal",
    "Assistant",
    "Representative",
    "Secretary of",
    "Member of",
]

dfs["cong_noms"] = filter_non_judicial_nominations(
    dfs["cong_noms"],
    non_judicial_titles=non_judicial_titles,
)

## A few new columns from straightforwardly parsable data

In [None]:
from nomination_predictor.name_matching import fill_vacancy_reason_column

# Store the output of fill_vacancy_reason_column back under "cong_noms"
# (presumably it adds or fills a vacancy-reason column; confirm against
# name_matching.py, which is not visible here).
dfs["cong_noms"] = fill_vacancy_reason_column(dfs["cong_noms"])

In [None]:
# fill missing values from predecessor column
from nomination_predictor.name_matching import fill_predecessor_column

# Store the output of fill_predecessor_column back under "cong_noms"
# (per the note at the top of this cell it fills missing predecessor values;
# the exact rules live in name_matching.py and are not visible here).
dfs["cong_noms"] = fill_predecessor_column(dfs["cong_noms"])

In [None]:
# Add an "fjc_biography_url" column to dfs["fjc_federal_judicial_service"]:
# the string "http://www.fjc.gov/node/" followed by the integer read from that
# row's "nid" column, or an empty string when "nid" is not numeric.
svc = dfs["fjc_federal_judicial_service"]

numeric_nid = pd.to_numeric(svc["nid"], errors="coerce")  # non-numeric nid -> NaN
has_numeric_nid = numeric_nid.notna()

# Build the URL for every row on the full index. The 0 is only a placeholder
# that lets the integer cast succeed; those rows are blanked by the `where`.
# Dropping the NaN rows instead would leave them as NaN after index alignment,
# not as the empty string described above.
candidate_urls = "http://www.fjc.gov/node/" + numeric_nid.fillna(0).astype(int).astype(str)
svc["fjc_biography_url"] = candidate_urls.where(has_numeric_nid, "")

dfs["fjc_federal_judicial_service"] = svc

## Not yet implemented: Merge only the useful columns of the "other federal judicial service" dataframe

In [None]:
# left-merges from dfs["fjc_other_federal_judicial_service"] onto dfs["fjc_federal_judicial_service"], but from the former's columns, doesn't left merge all and keep all of them; does the following:
# in the "other service" dataframe, prior to left-merging:
#   drop/ignore all of the columns whose column names start with "unnamed_"
#   ignore column "judge_name"
#   rename column named "type" to "other_federal_judicial_service_type"
#   rename column named "sequence" to "other_federal_judicial_service_sequence"
#   rename column named "other_federal_judicial_service" to "other_federal_judicial_service_description"

#   adds column named "other_federal_judicial_service_start_date" and "other_federal_judicial_service_end_date" from "other_federal_judicial_service_description"...
# by the time I started looking at how to handle year-parsing, and consider the year-spans in light of the sequence number column in this dataframe, I decided it was higher priority to work on other aspects of this notebook

#### Merge the "Other Nominations" dataframe
In theory this should get us the fjc's perspective on more nominees who didn't get confirmed.  

In practice my runs of name-matching didn't find any unambiguous matches from this dataframe to the congress one, so I put this on hold to try another day, maybe after seeing whether the additional data would help or whether something had to be fixed/improved about the name matcher.

In [None]:
# left-merges from dfs["fjc_other_nominations_recess"] onto dfs["fjc_federal_judicial_service"]

### Convert date strings to datetime objects

In [None]:
# Convert string-typed columns to datetime when their column name contains one
# of the keywords below; values that cannot be parsed become NaT.
datetime_related_keywords = ("date", "year", "month")

for table_name, table in dfs.items():
    for column in table.columns:
        name_has_keyword = any(keyword in column for keyword in datetime_related_keywords)
        if not name_has_keyword or table[column].dtype != "object":
            continue  # leave non-matching names and non-object columns untouched
        logger.info(f"Converting {column} to datetime for {table_name}")
        table[column] = pd.to_datetime(table[column], errors="coerce")

### Normalize all string values to make matching them later easier

In [None]:
from nomination_predictor.name_matching import normalize_text

# Column-name fragments (matched case-insensitively) that select the string
# columns to run through normalize_text.
keywords_which_denote_string_columns_to_normalize = ("court", "circuit", "district", "description", "name")

for table_name, table in dfs.items():
    for column in table.columns:
        folded_name = column.casefold()
        name_has_keyword = any(
            keyword in folded_name
            for keyword in keywords_which_denote_string_columns_to_normalize
        )
        if not name_has_keyword or table[column].dtype != object:
            continue  # only object-dtype columns with a matching name are normalized
        logger.info(f"Normalizing all values within column named {column} in {table_name}")
        table[column] = table[column].apply(normalize_text)

### Count and display unique values under each column

In [None]:
# For every table, list each column (alphabetically) with its number of
# distinct values.
for table_name, table in dfs.items():
    for column in sorted(table.columns):
        distinct_count = table[column].nunique()
        print(f"{table_name} - {column}: {distinct_count} unique values")

## Name-matching FJC judges to Congress.gov nominees

### For confirmed judges

#### Supplementing with additional columns to aid matching

In [None]:
# add a "full_name_concatenated" column to the fjc_federal_judicial_service dataframe which is composed by flipping its judge_name column values 
# from "lastname, firstname middleNameOrMiddleInitial (, optional comma and suffix)" to "firstname lastname middle suffix"
from nomination_predictor.features import \
    convert_judge_name_format_from_last_comma_first_to_first_then_last

# Build "full_name_concatenated" on the FJC service table by running every
# "judge_name" value through the last-comma-first -> first-then-last converter,
# then sanity-check the result. Failures are logged together with a small
# diagnostic dump of the table.
try:
    service_df = dfs["fjc_federal_judicial_service"]
    service_df["full_name_concatenated"] = service_df["judge_name"].apply(
        convert_judge_name_format_from_last_comma_first_to_first_then_last
    )

    # Show some examples to verify the conversion
    display(service_df[["judge_name", "full_name_concatenated"]].head(10))

    # Count null and empty results to check for any conversion failures
    null_count = service_df["full_name_concatenated"].isna().sum()
    empty_count = (service_df["full_name_concatenated"] == "").sum()

    if null_count > 0 or empty_count > 0:
        logger.warning(f"Found {null_count} null values and {empty_count} empty strings in the converted names.")

    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    entry_count = len(service_df)
    logger.info(f"Successfully added 'full_name_concatenated' column to fjc_federal_judicial_service dataframe with {entry_count} entries.")

except Exception as e:
    logger.error(f"Error creating full_name_concatenated column: {e}")
    # If there's an error, display the first few rows of fjc_federal_judicial_service to help diagnose
    logger.info("\nSample of fjc_federal_judicial_service dataframe:")
    service_df_for_diagnosis = dfs["fjc_federal_judicial_service"]
    display(service_df_for_diagnosis.head(3))
    available_columns = service_df_for_diagnosis.columns.tolist()
    logger.info(f"Columns available: {available_columns}")

In [None]:
# add a "full_name_from_description" and a "location_of_origin_from_description" columns to the dfs["cong_noms"] dataframe which regex-captures the first segments of the same dfs["cong_noms"] dataframe row's "description" string, 
# i.e. captures name before the first appearances of the phrases ", of " or ", of the "
# and captures location from the second segment of the same dfs["cong_noms"] dataframe row's "description" string
# i.e. captures between the above-seen phrase ", of " or ", of the " through to the phrase ", to be "
# examples: 
# melissa damian, of florida, to be ...  gets captured into those new columns as "melissa damian" and "florida"
# nicole g. berner, of maryland, to be united... gets captured into those new columns as "nicole g. berner" and "maryland"
# kirk edward sherriff, of california, to be united... gets captured into those new columns as "kirk edward sherriff" and "california"
# sherri malloy beatty-arthur, of the district of columbia, for... gets captured into those new columns as "sherri malloy beatty-arthur" and "district of columbia"

# Extract full_name_from_description and location_of_origin_from_description from description field
from nomination_predictor.features import extract_name_and_location_columns

# Derive the name and location columns from each nomination's description,
# then report how many rows received a value.
if 'cong_noms' not in dfs:
    logger.error("Error: 'cong_noms' dataframe not found in dfs dictionary.")
else:
    enriched_noms = extract_name_and_location_columns(dfs['cong_noms'])
    dfs['cong_noms'] = enriched_noms

    # Show the source text next to the two extracted columns for a visual check
    sample_cols = ['description', 'full_name_from_description', 'location_of_origin_from_description']
    display(enriched_noms[sample_cols].head(10))

    # Fill rate per extracted column
    total_rows = len(enriched_noms)
    extracted_columns = (
        ("names", 'full_name_from_description'),
        ("locations", 'location_of_origin_from_description'),
    )
    for what, column in extracted_columns:
        filled = enriched_noms[column].notna().sum()
        logger.info(f"Extracted {what} for {filled}/{total_rows} records ({filled/total_rows:.1%})")

#### Performing the matching operations

In [None]:
from nomination_predictor.name_matching import perform_exact_name_matching

# Exact-name match between the Congress nominations frame and the FJC service
# frame. Later cells read the result's "ambiguous", "congress_index" and "nid"
# columns to build the nid bridge.
# NOTE(review): fjc_name_col is "judge_name", not the "full_name_concatenated"
# column built earlier in this section — confirm which one the matcher expects.
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices= perform_exact_name_matching(
    congress_df=dfs["cong_noms"],
    fjc_df=dfs["fjc_federal_judicial_service"],
    congress_name_col="full_name_from_description",
    fjc_name_col="judge_name"
)

In [None]:
# Show results: preview the first rows of the match table as the cell's output
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices.head()

In [None]:
# Keep only the *unambiguous* matches and turn them into a lookup Series:
# index = "congress_index", value = "nid".
match_results = results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices
is_unambiguous = ~match_results["ambiguous"]
unambiguous_matches = match_results[is_unambiguous]
nid_map = unambiguous_matches.set_index("congress_index")["nid"]

In [None]:
# This is the bridge between the congress.gov data and the FJC data: each
# Congress row's index label is looked up in nid_map, and the matching FJC nid
# (NaN where there is no unambiguous match) is written to a new "nid" column.
cong_noms = dfs["cong_noms"]
row_labels = cong_noms.index.to_series()
cong_noms["nid"] = row_labels.map(nid_map)

### For unconfirmed judges

In practice, given the dataframes as I've got them as of typing this, this section doesn't find any remaining unconfirmed judges to match.

What this section _did_ accomplish was showing me that the presence of diacritical marks such as "ñ" or "é" in names was misleading the matching process.

Discovering and addressing that in much-earlier data-normalizing cells led to getting more matches in our confirmed-judges-matching notebook section.

#### Supplementing with additional columns to aid matching

In [None]:
from nomination_predictor.name_matching import prep_fjc_other

# Replace the stored "other nominations / recess" table with the output of
# prep_fjc_other (presumably adds the helper columns used for matching below;
# the helper lives in name_matching.py and is not visible here).
dfs["fjc_other_nominations_recess"] = prep_fjc_other(fjc_other_df=dfs["fjc_other_nominations_recess"])

In [None]:
# Convenience alias for the prepared table, handy for inspecting it in the notebook.
# NOTE(review): this name is not referenced again in the cells visible here.
fjc_other_supplemented =dfs["fjc_other_nominations_recess"]

#### Performing the matching operation

In [None]:
from nomination_predictor.features import link_unconfirmed_nominations

# Replace cong_noms with the output of link_unconfirmed_nominations, which is
# given the Congress frame and the prepared FJC "other nominations" table
# (presumably it fills in nids for unconfirmed nominees; verify in features.py).
dfs["cong_noms"] = link_unconfirmed_nominations(dfs["cong_noms"], dfs["fjc_other_nominations_recess"])

## Combining the rest of the FJC data now that our congress dataframe has been enriched with FJC nid

### Handling nominees' education and job history

Before we combine FJC data, we have to consider whether/how to handle judges' education, job history, age, ABA rating, etc.  Most/all of the data in the "demographics" dataframe is unchanging over time, but that's very much _not_ true of the other dataframes.

The simplest way to handle it would be to left-merge on "nid" and only take the most recently-dated row, or row with the highest sequence number.  In most cases this would likely land on keeping the most prestigious degree or job.

However, it is entirely likely a judge's education or job history has changed substantially since their first nomination, and affected their qualifications for each later nomination.

All of these indicate to me that it's worth merging onto each row that judge's position, education, etc., not as of the most recent records available, but instead _as of when they were nominated._

That means we can't do a too-simple left-join of all of our FJC data.  Instead, now that we've done the step of matching NIDs to congress' data on nominations, we can use the "received date" for each congress citation as a cutoff date for when we look up education and job records by "nid" -- so we can avoid mistakenly linking to a citation any education & job records dated after that cutoff date.

Thankfully we do have the school, degree, and degree_year in the education record, for both their bachelors and their masters and their associate degree(s) and LLB and J.D. etc., so we can look that up.  The education dataframe even comes with a "sequence" number for each education record, which is another indicator of chronological order in addition to degree_year for any given "nid" lookup for a judge.

Job history is more challenging to deal with because literally every row entry in that dataframe lists it uniquely, but we do have the data available.  My earliest attempts to feature-engineer with it include looking for keywords in it, then creating boolean features for whether they did/didn't have experience in common-phrase-identifiable positions such as "Private practice" or "Attorney general" or "Navy" or "Army" etc. Theoretically a parser can look for the year spreads listed there as a rough indicator of amounts of experience gleaned from each professional role & when, but that may be too complicated for me to accomplish by the time I'm first presenting this work.

In [None]:
import pandas as pd

from nomination_predictor.time_aware_analysis import merge_congress_fjc

# Time-aware merge of the nid-enriched Congress frame with the FJC tables.
# The tables are passed positionally, in the order the helper is called with:
# judges, demographics, education, professional career, federal judicial service.
fjc_tables_for_merge = (
    dfs["fjc_judges"],
    dfs["fjc_demographics"],
    dfs["fjc_education"],
    dfs["fjc_professional_career"],
    dfs["fjc_federal_judicial_service"],
)
time_aware_merged_df = merge_congress_fjc(dfs["cong_noms"], *fjc_tables_for_merge)
display(time_aware_merged_df.head(5))

### Feature engineering 

In [None]:
from datetime import date

from nomination_predictor.time_aware_analysis import (
    congress_number, congress_session, days_into_current_term,
    days_until_next_midterm_election, days_until_next_presidential_election,
    fill_missing_appointing_presidents, president_number,
    presidential_term_index)

# Work on a copy so the feature columns added in the following cells do not
# alter time_aware_merged_df, which is saved separately later on.
df = time_aware_merged_df.copy()
df["receiveddate"] = pd.to_datetime(df["receiveddate"])   # ensure datetime

In [None]:
# Presidency-, election- and Congress-timeline features. Each new column is
# computed by applying one helper to the nomination's received date.
received_date_features = {
    "pres_term_idx": presidential_term_index,
    "days_into_pres_term": days_into_current_term,
    "days_to_next_pres_election": days_until_next_presidential_election,
    "days_to_next_midterm_election": days_until_next_midterm_election,
    "congress_num": congress_number,
    "congress_session": congress_session,
}
for feature_name, derive_feature in received_date_features.items():
    df[feature_name] = df["receiveddate"].apply(derive_feature)

In [None]:
# Rebind df to the output of fill_missing_appointing_presidents (presumably it
# fills in the appointing president where that is missing; the helper lives in
# time_aware_analysis.py and is not visible here).
df = fill_missing_appointing_presidents(df)

One hypothesis is that older nominees tend to be confirmed faster, because there is less concern about how long they will serve and therefore about their total impact over their time in office.

In [None]:
# Age at nomination: assemble a birth date from the separate birth_year /
# birth_month / birth_day columns, discard implausible (future) dates, and
# store the nominee's age in whole years at the received date as 'age_at_nom'.
from datetime import datetime, timedelta

# Reference date for the "future" check: midnight at the start of yesterday
yesterday = (pd.Timestamp.today().normalize() - pd.Timedelta(days=1))

#  Mask rows that have *all three* birth-date components
complete_date_mask = (
    df['birth_day'].notna()  &
    df['birth_month'].notna() &
    df['birth_year'].notna()
)

#  Initialise the column as missing everywhere; rows outside the mask stay NaT
df['birth_date'] = pd.NaT

#  Build datetimes only for the rows that passed the mask
tmp_birth = pd.to_datetime(
    {
        'year':  df.loc[complete_date_mask, 'birth_year'],
        'month': df.loc[complete_date_mask, 'birth_month'],
        'day':   df.loc[complete_date_mask, 'birth_day']
    },
    errors='coerce'           # invalid combos → NaT
)

# Keep a value only if it falls before the reference date, i.e. ignore misfiled future dates.
# NOTE(review): the comparison is strict, so a birth date equal to yesterday's
# midnight is dropped as well — confirm `<` rather than `<=` is intended.
tmp_birth = tmp_birth.where(tmp_birth < yesterday, pd.NaT)

# Write the assembled dates back into the masked rows
df.loc[complete_date_mask, 'birth_date'] = tmp_birth

# Re-coerce the whole column to datetime after the partial assignment
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

# Whole years between birth and nomination: elapsed days floor-divided by
# 365.25, cast to nullable Int64 so rows without a birth date stay missing.
# NOTE(review): this approximation can differ by one from the calendar age
# right around a birthday.
df['age_at_nom'] = (
    (df['receiveddate'] - df['birth_date']).dt.days // 365.25
).astype('Int64')

In [None]:
# Identify the seat level from the nomination description. There are also
# court_type_(1), court_type_(2), ... columns, but for this coarse an analysis
# it is simpler and accurate enough to scan the description text.
# The lookbehind requires whitespace right before "tax", so it only matches
# where a word starts with "tax".
seat_level_pattern = r"(supreme|circuit|district|(?<=\s)tax|international|appeals)"

df["seat_level"] = (
    df["description"]
      .str.lower()
      # expand=False returns a Series for the single capture group; the default
      # (expand=True) returns a one-column DataFrame, which only works as a
      # column assignment through pandas' single-column-frame coercion.
      .str.extract(seat_level_pattern, expand=False)
      .fillna("other")  # descriptions with no recognised keyword
)

### Not-yet-implemented analysis ideas:

In [None]:
# Partisan mismatch: 1 if president_party != party__who_appointed_predecessor
# this would require an additional step of linking predecessor name to nid, and way of looking up when the judge with that nid had been in service, possibly even needing to be a date-and-location-aware analysis

#party_map = {47: "R", 46: "D", 45: "R", 44: "D", 43: "R", 42: "D", 41: "R"}  # extend list
#df["pres_party"] = df["receiveddate"].apply(lambda d: party_map.get(president_number(d), None))
#df["partisan_mismatch"] = (
#    (df["pres_party"].notna()) &
#    (df["pres_party"] != df["party_of_appointing_president"])
#)

In [None]:

# skipping this one because I think we'd get more and/or richer info out of it if we had a more-successful linkage of the unconfirmed nomination rows between congress and fjc's data
# #Count prior failed nominations for this seat_id (if column present)

#if "other_nominations_count" not in df.columns and "seat_id" in df.columns:
#    prior_counts = (
#        df.groupby("seat_id").cumcount()  # number seen so far for that seat
#    )
#    df["num_prior_failed_noms"] = prior_counts
#
#display(df.head())
#feature_engineered_df = df.copy()

## Saving interim dataframes

In [None]:
# Save to interim data
time_aware_merged_df.to_csv(INTERIM_DATA_DIR / "merged_time_aware.csv", index=False)

# The feature-engineered frame is built up in `df` by the feature-engineering
# cells; `feature_engineered_df` was otherwise only assigned in commented-out
# code, so saving it raised NameError. Run this cell while `df` still holds
# that frame (other cells reuse the name `df` as a loop variable).
feature_engineered_df = df.copy()

# Write next to the other interim outputs through INTERIM_DATA_DIR rather than
# a path relative to whatever the current working directory happens to be.
feature_engineered_df.to_csv(INTERIM_DATA_DIR / "feature_engineered.csv", index=False)

In [None]:
# Persist every non-empty table in `dfs` as <name>.csv in the interim directory.
for table_name, table in dfs.items():
    if len(table) == 0:
        continue  # skip empty tables: nothing to write
    output_path = INTERIM_DATA_DIR / f"{table_name}.csv"
    table.to_csv(output_path, index=False)
    print(f"Saved {len(table)} records to {output_path}")