# Notebook 1 – Data Cleaning & Feature Engineering
**Project:** Judicial Vacancy → Nomination/Confirmation Pipeline

*Initial draft generated via ChatGPT model o3 on 2025-07-12T02:40:38.399372Z*

In [None]:

import sys
from pathlib import Path

import pandas as pd
from loguru import logger

# Put the directory above the current working directory at the front of the import
# path so the project's own modules can be imported from this notebook.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Logging: discard loguru's default handler, then register one stderr sink at INFO.
logger.remove()
logger.add(
    sys.stderr,
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level}</level> | "
        "<cyan>{function}</cyan> - "
        "<level>{message}</level>"
    ),
    level="INFO",
)

1

# Load dataframes from Raw data folder

Start with loading simpler, non-JSON-containing CSV files

In [None]:
from nomination_predictor.config import INTERIM_DATA_DIR, RAW_DATA_DIR

# Read every raw FJC CSV export into its own dataframe. The target names on the left
# and the file names on the right are paired by position.
(
    fjc_judges,
    fjc_federal_judicial_service,
    fjc_demographics,
    fjc_education,
    fjc_other_federal_judicial_service,
    fjc_other_nominations_recess,
    fjc_professional_career,
) = (
    pd.read_csv(RAW_DATA_DIR / csv_name)
    for csv_name in (
        "judges.csv",
        "federal_judicial_service.csv",
        "demographics.csv",
        "education.csv",
        "other_federal_judicial_service.csv",
        "other_nominations_recess.csv",
        "professional_career.csv",
    )
)
# The derived seat timeline is currently not loaded:
#seat_timeline = pd.read_csv(RAW_DATA_DIR / "seat_timeline.csv")

[32m2025-07-17 21:07:53.197[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m103[0m - [1mProject root: /home/wsl2ubuntuuser/nomination_predictor[0m
[32m2025-07-17 21:07:53.199[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m127[0m - [1mConfiguration loaded[0m


## Combine all dataframes into a single dictionary for bulk operations

In [None]:
# Registry of every loaded dataframe, keyed by name, so later cells can apply the
# same operation to all of them in one loop.
# Only the FJC frames go in here; the Congress frames are added after they are loaded.
dfs = dict(
    fjc_judges=fjc_judges,
    fjc_federal_judicial_service=fjc_federal_judicial_service,
    fjc_demographics=fjc_demographics,
    fjc_education=fjc_education,
    fjc_other_federal_judicial_service=fjc_other_federal_judicial_service,
    fjc_other_nominations_recess=fjc_other_nominations_recess,
    fjc_professional_career=fjc_professional_career,
    # seat_timeline=seat_timeline,
)

## Load & immediately drop duplicated rows from congress API data

Duplicate rows can appear if we made the same request multiple times and got the same response (e.g. from a software design oversight, or from pausing & resuming downloads, or mashing together downloads made on separate occasions with possible overlap).

It's easier to find this kind of duplicate now vs. after flattening.

In [None]:
from nomination_predictor.features import flatten_json_dataframe

# Read the two raw Congress API exports (their cells still hold unflattened JSON text)
# and register them in the shared dataframe registry.
cong_nominations_raw, cong_nominees_raw = (
    pd.read_csv(RAW_DATA_DIR / csv_name)
    for csv_name in ("nominations.csv", "nominees.csv")
)

dfs.update(cong_nominations=cong_nominations_raw, cong_nominees=cong_nominees_raw)

In [None]:
# ------------------------------------------------------------------
# Preview the nomination rows a de-duplication pass would discard: a row is flagged
# when an earlier row carries the same "nomination" and "request" values.
dupe_mask = dfs["cong_nominations"].duplicated(subset=["nomination", "request"], keep="first")
dupes = dfs["cong_nominations"][dupe_mask].copy()

# ------------------------------------------------------------------
# Report how many rows were flagged and show a sorted sample of them.
print(f"Rows flagged as duplicates: {len(dupes)}")
display(dupes.sort_values(["nomination", "request"]).head(20))  # drop .head(20) to see all

# Optional: count flagged rows per distinct "nomination" payload, ten most frequent.
dup_counts = dupes["nomination"].value_counts().head(10)
print("\nTop duplicate records:")
display(dup_counts)

Rows flagged as duplicates: 189


Unnamed: 0,nomination,request,retrieval_date,is_full_detail
4956,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '100', 'contentType': 'applicatio...",2025-07-12,True
4953,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '100', 'contentType': 'applicatio...",2025-07-12,True
4725,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '101', 'contentType': 'applicatio...",2025-07-12,True
4494,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '102', 'contentType': 'applicatio...",2025-07-12,True
4495,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '102', 'contentType': 'applicatio...",2025-07-12,True
1675,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '113', 'contentType': 'applicatio...",2025-07-12,True
1676,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '113', 'contentType': 'applicatio...",2025-07-12,True
1146,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '115', 'contentType': 'applicatio...",2025-07-12,True
20,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '118', 'contentType': 'applicatio...",2025-07-12,True
140,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '118', 'contentType': 'applicatio...",2025-07-12,True



Top duplicate records:


nomination
{'actions': {'count': 2, 'url': 'https://api.congress.gov/v3/nomination/101/1181-0/actions?format=json'}, 'authorityDate': '1989-01-03', 'citation': 'PN1181-0', 'committees': {'count': 1, 'url': 'https://api.congress.gov/v3/nomination/101/1181-0/committees?format=json'}, 'congress': 101, 'description': 'The following-named persons to be Associate Judges of the Superior Court of the District of Columbia for the term of fifteen years. (New Positions) \tMary Ellen Abrecht, of the District of Columbia. \tKaye K. Christian, of the District of Columbia. \tFrederick D. Dorsey, of the District of Columbia. \tEllen Segal Huvelle, of the District of Columbia. \tJose M. Lopez, of the District of Columbia. \tJoan Z. McAvoy, of the District of Columbia. \tGregory E. Mize, of the District of Columbia. \tPatricia A. Wynn, of the District of Columbia.', 'latestAction': {'actionDate': '1990-08-02', 'text': 'Committee on Governmental Affairs. Ordered to be reported favorably.'}, 'nominationTy

Then we can remove the duplicates, keeping the one with the most recent retrieval_date on the presumption it's the one most likely to have been corrected for accuracy server-side.

In [None]:
# Collapse repeated API responses, keeping the copy with the newest retrieval_date.
#
# - The dictionary keys inside the f-strings use single quotes: reusing the outer
#   double quote inside a replacement field is a SyntaxError before Python 3.12.
# - The sort is stable, so rows that share a retrieval_date keep their original
#   relative order and the row retained by keep="last" is the same on every run.
print(f"shape before checking for & dropping duplicated congressional nomination records is {dfs['cong_nominations'].shape}")

# Ascending sort by retrieval_date puts the most recent copy of each record last,
# which is the copy drop_duplicates(keep="last") retains.
dfs["cong_nominations"] = (
    dfs["cong_nominations"]
    .sort_values("retrieval_date", kind="stable")
    .drop_duplicates(subset=["nomination", "request"], keep="last")
)

print(f"shape after checking for & dropping duplicated congressional nomination records is {dfs['cong_nominations'].shape}")

shape before checking for & dropping duplicated congressional nomination records is (5746, 4)
shape after checking for & dropping duplicated congressional nomination records is (5557, 4)


Same logic for nominee dataframe, target column just has a different name

TODO: if this operation works well, refactor it to features.py taking dataframe and target column names as inputs

In [None]:
# ------------------------------------------------------------------
# Preview the nominee rows a de-duplication pass would discard: a row is flagged
# when an earlier row carries the same "nominee" and "request" values.
dupe_mask = dfs["cong_nominees"].duplicated(subset=["nominee", "request"], keep="first")
dupes = dfs["cong_nominees"][dupe_mask].copy()

# ------------------------------------------------------------------
# Report how many rows were flagged and show a sorted sample of them.
print(f"Rows flagged as duplicates: {len(dupes)}")
display(dupes.sort_values(["nominee", "request"]).head(20))  # drop .head(20) to see all

# Optional: count flagged rows per distinct "nominee" payload, ten most frequent.
dup_counts = dupes["nominee"].value_counts().head(10)
print("\nTop duplicate records:")
display(dup_counts)

Rows flagged as duplicates: 155


Unnamed: 0,nominee,request,retrieval_date
119,"{'nominees': [{'firstName': 'Adam', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
778,"{'nominees': [{'firstName': 'Adam', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
336,"{'nominees': [{'firstName': 'Almo', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
492,"{'nominees': [{'firstName': 'Andrew', 'lastNam...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1562,"{'nominees': [{'firstName': 'Arthur', 'lastNam...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
111,"{'nominees': [{'firstName': 'Benjamin', 'lastN...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1022,"{'nominees': [{'firstName': 'Bradley', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1106,"{'nominees': [{'firstName': 'Bradley', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1108,"{'nominees': [{'firstName': 'Brendan', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
4914,"{'nominees': [{'firstName': 'Bruce', 'lastName...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13



Top duplicate records:


nominee
{'nominees': [{'firstName': 'Mary', 'lastName': 'Abrecht', 'middleName': 'Ellen', 'ordinal': 1, 'state': 'DC'}], 'pagination': {'count': 1}, 'request': {'congress': '101', 'contentType': 'application/json', 'format': 'json', 'number': '1181-0'}}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   7
{'nominees': [{'firstName': 'Robert Samuel', 'lastName': 'Tignor', 'ordinal': 7, 'state': 'DC'}, {'firstName': 'Emmet G.', 'lastName': 'Sullivan', 'ordinal': 6, 'state': 'DC'}, {'firstName': 'Robert Isaac', 'lastName': 'Richter', 'ordinal': 5, 's

In [None]:
# Collapse repeated API responses, keeping the copy with the newest retrieval_date.
#
# - The dictionary keys inside the f-strings use single quotes: reusing the outer
#   double quote inside a replacement field is a SyntaxError before Python 3.12.
# - The sort is stable, so rows that share a retrieval_date keep their original
#   relative order and the row retained by keep="last" is the same on every run.
print(f"shape before checking for & dropping duplicated congressional nominee records is {dfs['cong_nominees'].shape}")

# Ascending sort by retrieval_date puts the most recent copy of each record last,
# which is the copy drop_duplicates(keep="last") retains.
dfs["cong_nominees"] = (
    dfs["cong_nominees"]
    .sort_values("retrieval_date", kind="stable")
    .drop_duplicates(subset=["nominee", "request"], keep="last")
)

print(f"shape after checking for & dropping duplicated congressional nominee records is {dfs['cong_nominees'].shape}")

shape before checking for & dropping duplicated congressional nominee records is (5672, 3)
shape after checking for & dropping duplicated congressional nominee records is (5517, 3)


# Flatten JSON-containing congress DataFrames into tabular form

In [None]:

# Expand the JSON column of each Congress dataframe into flat columns, replacing the
# registry entry with the flattened result. Each entry pairs a registry key with the
# keyword arguments for that frame's flatten_json_dataframe call.
for frame_key, flatten_kwargs in (
    (
        "cong_nominations",
        {
            "json_col": "nomination",  # column containing the JSON data
            "max_list_index": 10,      # maximum number of list items to extract
            "separator": "_",          # separator for nested keys
        },
    ),
    (
        "cong_nominees",
        {
            "json_col": "nominee",
            "max_list_index": 5,
        },
    ),
):
    dfs[frame_key] = flatten_json_dataframe(df=dfs[frame_key], **flatten_kwargs)

[32m2025-07-17 21:07:54.278[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m324[0m - [1mFlattening JSON data from column 'nomination' in 5557 rows[0m
[32m2025-07-17 21:07:58.604[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m350[0m - [1mFlattening complete. Original columns: 4, New columns: 37[0m
[32m2025-07-17 21:07:58.607[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m324[0m - [1mFlattening JSON data from column 'nominee' in 5517 rows[0m
[32m2025-07-17 21:08:01.423[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m350[0m - [1mFlattening complete. Original columns: 3, New columns: 34[0m


In [None]:
# Print summary of available dataframes
# One line per registered dataframe: its name and its row/column counts.
print("Available dataframes:")
for name, df in dfs.items():
    row_count, column_count = df.shape
    print(f"- {name}: {row_count} rows × {column_count} columns")

Available dataframes:
- fjc_judges: 4022 rows × 201 columns
- fjc_federal_judicial_service: 4720 rows × 30 columns
- fjc_demographics: 4022 rows × 18 columns
- fjc_education: 8040 rows × 6 columns
- fjc_other_federal_judicial_service: 611 rows × 31 columns
- fjc_other_nominations_recess: 828 rows × 4 columns
- fjc_professional_career: 19003 rows × 4 columns
- cong_nominations: 5557 rows × 37 columns
- cong_nominees: 5517 rows × 34 columns


## Quick peek at all loaded dataframes

In [None]:
# For each registered dataframe show its shape, its first rows and its column/dtype
# summary.
logger.info("Checking for general shape and first handfuls of rows")
for name, df in dfs.items():
    print(f"{name:<35} → {df.shape}")
    print(df.head())
    # DataFrame.info() writes its report itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line per dataframe.
    df.info()

[32m2025-07-17 21:08:01.459[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking for general shape and first handfuls of rows[0m
fjc_judges                          → (4022, 201)
        nid       jid last_name first_name middle_name suffix  birth_month  \
0  13761857  13761857   Abelson       Adam         Ben    NaN          NaN   
1   1393931      3419    Abrams     Ronnie                             NaN   
2   1376976         1   Abruzzo    Matthew          T.                 4.0   
3  13651551  13651551     Abudu      Nancy       Gbana    NaN          NaN   
4   1376981         2   Acheson     Marcus      Wilson                 6.0   

   birth_day birth_year  birth_city  ... degree_(3)  degree_year_(3)  \
0        NaN       1982   Cleveland  ...        NaN              NaN   
1        NaN       1968    New York  ...        NaN              NaN   
2       30.0       1889    Brooklyn  ...        NaN              NaN   
3        NaN       1974  Ale

In [None]:
# Per-column count of missing values for every registered dataframe.
logger.info("Checking for null values")

for name, df in dfs.items():
    # Label each report; without it the consecutive Series printouts cannot be
    # attributed to a dataframe.
    print(f"--- {name} ---")
    print(df.isnull().sum())

[32m2025-07-17 21:08:01.606[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking for null values[0m


nid                                         0
jid                                         0
last_name                                   0
first_name                                  0
middle_name                                35
                                         ... 
school_(5)                               4017
degree_(5)                               4018
degree_year_(5)                          4017
professional_career                         4
other_nominations/recess_appointments    3307
Length: 201, dtype: int64
nid                                     0
sequence                                0
judge_name                              0
court_type                              0
court_name                              0
appointment_title                       0
appointing_president                    0
party_of_appointing_president          39
reappointing_president               4710
party_of_reappointing_president      4710
aba_rating                           1950
seat_i

# Data cleaning

## Drop duplicated rows (if any) from FJC data

In [None]:
# De-duplicate the FJC tables on the (nid, sequence) pair. The three FJC tables named
# in the guard below, and every non-FJC frame, are left untouched.
for name, df in dfs.items():
    if not name.startswith("fjc_") or name in ("fjc_judges", "fjc_demographics", "fjc_other_nominations_recess"):
        continue
    dfs[name] = df.drop_duplicates(subset=["nid", "sequence"])
    print(f"shape of {name} after checking for & dropping duplicated nid sequenced items is {dfs[name].shape}")

shape of fjc_federal_judicial_service after checking for & dropping duplicated nid sequenced items is (4720, 30)
shape of fjc_education after checking for & dropping duplicated nid sequenced items is (8040, 6)
shape of fjc_other_federal_judicial_service after checking for & dropping duplicated nid sequenced items is (611, 31)
shape of fjc_professional_career after checking for & dropping duplicated nid sequenced items is (19003, 4)


## Drop rows whose congressional citations end in -0
Every such row I've seen does at least one of the following:

- lacks strictly necessary information such as nomination & confirmation dates,
- lacks helpful information such as the person's name, our most straightforward means of linking to FJC data, or
- indicates, through whatever little information it does have, that it's not for a position as a judge (e.g. for secretary of defense, assistant secretary to something-or-other, etc.)

In [None]:
from nomination_predictor.features import filter_dash_zero_citations

# Replace the nominations frame with the result of the "-0" citation filter.
dfs.update(cong_nominations=filter_dash_zero_citations(dfs["cong_nominations"]))

[32m2025-07-17 21:08:01.738[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_dash_zero_citations[0m:[36m227[0m - [1mFound 70 citations ending with '-0'[0m
[32m2025-07-17 21:08:01.743[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_dash_zero_citations[0m:[36m233[0m - [1mRemoved 70/5557 records with '-0' citations[0m


## Drop multi-nominee columns

All of these seem associated with non-judge roles (e.g. representatives to the U.N.) and would make our main working dataframe too sparse

In [None]:
# Remove every column whose name matches "nominees_" followed by a digit 1-9 (i.e.
# anything past the first nominee slot) from both Congress dataframes, then list the
# columns that remain.
for frame_key, remaining_label in (
    ("cong_nominations", "Remaining nomination columns: "),
    ("cong_nominees", "Remaining nominee columns: "),
):
    multi_nominee_cols_to_drop = dfs[frame_key].filter(regex=r"nominees_[1-9]").columns
    dfs[frame_key] = dfs[frame_key].drop(columns=multi_nominee_cols_to_drop)
    print(remaining_label, dfs[frame_key].columns)

Remaining nomination columns:  Index(['request', 'retrieval_date', 'is_full_detail', 'actions_count',
       'actions_url', 'authorityDate', 'citation', 'committees_count',
       'committees_url', 'congress', 'description', 'hearings_count',
       'hearings_url', 'latestAction_actionDate', 'latestAction_text',
       'nominationType_isCivilian', 'nominees_0_nomineeCount',
       'nominees_0_ordinal', 'nominees_0_organization',
       'nominees_0_positionTitle', 'nominees_0_url', 'number', 'partNumber',
       'receivedDate', 'updateDate', 'executiveCalendarNumber', 'isList',
       'nominees_0_division', 'nominees_0_introText', 'isPrivileged'],
      dtype='object')
Remaining nominee columns:  Index(['request', 'retrieval_date', 'nominees_0_firstName',
       'nominees_0_lastName', 'nominees_0_middleName', 'nominees_0_ordinal',
       'nominees_0_state', 'pagination_count', 'request_congress',
       'request_contentType', 'request_format', 'request_number',
       'nominees_0_predec

## Dropping unhelpfully uninformative columns

I define these as columns in which every row holds the same value (including columns that are entirely missing), since they can't help modeling.  A column is still kept if most of its rows are missing but the rows that are populated all share a single value.

In [None]:
from nomination_predictor.features import \
    drop_unhelpfully_uninformative_columns

# Run the uninformative-column filter over every registered dataframe and store the
# filtered frame back under the same key.
for name, df in dfs.items():
    dfs[name] = df = drop_unhelpfully_uninformative_columns(df)

Columns with limited unique values:
  - 2nd_service_as_chief_judge,_begin_(2): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_begin_(3): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_begin_(4): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_begin_(5): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_begin_(6): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_end_(2): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_end_(3): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_end_(4): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_end_(5): 100% missing values - DROPPING
  - 2nd_service_as_chief_judge,_end_(6): 100% missing values - DROPPING
  - aba_rating_(4): 1 unique non-null value 'Well Qualified' (0.0% of rows) - KEEPING
  - aba_rating_(5): 100% missing values - DROPPING
  - aba_rating_(6): 100% missing values - DROPPING
  - appointing_president_(5): 1 unique non-nul

  - commission_date_(5): 1 unique non-null value '1949-02-02' (0.0% of rows) - KEEPING
  - commission_date_(6): 1 unique non-null value '1949-02-02' (0.0% of rows) - KEEPING
  - committee_action_date_(5): 1 unique non-null value '1949-01-27' (0.0% of rows) - KEEPING
  - committee_action_date_(6): 1 unique non-null value '1949-01-27' (0.0% of rows) - KEEPING
  - committee_referral_date_(5): 1 unique non-null value '1949-01-13' (0.0% of rows) - KEEPING
  - committee_referral_date_(6): 1 unique non-null value '1949-01-13' (0.0% of rows) - KEEPING
  - confirmation_date_(5): 1 unique non-null value '1949-01-31' (0.0% of rows) - KEEPING
  - confirmation_date_(6): 1 unique non-null value '1949-01-31' (0.0% of rows) - KEEPING
  - court_name_(5): 1 unique non-null value 'U.S. District Court for the Eastern District of Missouri' (0.0% of rows) - KEEPING
  - court_name_(6): 1 unique non-null value 'U.S. District Court for the Western District of Missouri' (0.0% of rows) - KEEPING
  - court_type_(

## Normalize column names for DataFrames

In [None]:
# Snapshot of every dataframe's column names prior to normalization.
print("=== Column Names Before ===")

for name, df in dfs.items():
    print(name.ljust(35), "→", df.columns.to_list())

=== Column Names Before ===
fjc_judges                          → ['nid', 'jid', 'last_name', 'first_name', 'middle_name', 'suffix', 'birth_month', 'birth_day', 'birth_year', 'birth_city', 'birth_state', 'death_month', 'death_day', 'death_year', 'death_city', 'death_state', 'gender', 'race_or_ethnicity', 'court_type_(1)', 'court_name_(1)', 'appointment_title_(1)', 'appointing_president_(1)', 'party_of_appointing_president_(1)', 'reappointing_president_(1)', 'party_of_reappointing_president_(1)', 'aba_rating_(1)', 'seat_id_(1)', 'statute_authorizing_new_seat_(1)', 'recess_appointment_date_(1)', 'nomination_date_(1)', 'committee_referral_date_(1)', 'hearing_date_(1)', 'judiciary_committee_action_(1)', 'committee_action_date_(1)', 'senate_vote_type_(1)', 'ayes/nays_(1)', 'confirmation_date_(1)', 'commission_date_(1)', 'service_as_chief_judge,_begin_(1)', 'service_as_chief_judge,_end_(1)', '2nd_service_as_chief_judge,_begin_(1)', '2nd_service_as_chief_judge,_end_(1)', 'senior_status_date_(

In [None]:
# call features.py's normalize_dataframe_columns function on all DataFrames in dfs, and strip leading and trailing whitespace in all strings
from nomination_predictor.features import normalize_dataframe_columns

# For every registered dataframe: normalize its column names, then trim surrounding
# whitespace from each string cell (non-string cells pass through unchanged).
for name, df in dfs.items():
    dfs[name] = df = normalize_dataframe_columns(df).map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

In [None]:
# Snapshot of every dataframe's column names following normalization.
print("=== Column Names After ===")

for name, df in dfs.items():
    print(name.ljust(35), "→", df.columns.to_list())

=== Column Names After ===
fjc_judges                          → ['nid', 'jid', 'last_name', 'first_name', 'middle_name', 'suffix', 'birth_month', 'birth_day', 'birth_year', 'birth_city', 'birth_state', 'death_month', 'death_day', 'death_year', 'death_city', 'death_state', 'gender', 'race_or_ethnicity', 'court_type_(1)', 'court_name_(1)', 'appointment_title_(1)', 'appointing_president_(1)', 'party_of_appointing_president_(1)', 'reappointing_president_(1)', 'party_of_reappointing_president_(1)', 'aba_rating_(1)', 'seat_id_(1)', 'statute_authorizing_new_seat_(1)', 'recess_appointment_date_(1)', 'nomination_date_(1)', 'committee_referral_date_(1)', 'hearing_date_(1)', 'judiciary_committee_action_(1)', 'committee_action_date_(1)', 'senate_vote_type_(1)', 'ayes/nays_(1)', 'confirmation_date_(1)', 'commission_date_(1)', 'service_as_chief_judge,_begin_(1)', 'service_as_chief_judge,_end_(1)', '2nd_service_as_chief_judge,_begin_(1)', '2nd_service_as_chief_judge,_end_(1)', 'senior_status_date_(1

## Left-merge nominees table onto nominations table
This will make the cong_noms dataframe we'll use for most of our congress data operations below in this notebook.

In [None]:
from nomination_predictor.features import merge_nominees_onto_nominations

# Build the combined nominations + nominees frame from the already-loaded
# cong_nominations and cong_nominees dataframes.
#
# No placeholder frame is created first: a `try` block does not open a new scope, so
# the assignment inside it is visible to later cells anyway. If the merge fails, the
# error is logged with its traceback and re-raised, so the notebook stops here rather
# than carrying an empty dataframe into later cells that expect merged columns.
try:
    dfs["cong_noms"] = merge_nominees_onto_nominations(dfs["cong_nominations"], dfs["cong_nominees"])
except Exception:
    logger.exception("Error in merge process")
    raise

# Report on the merge results
logger.info(f"Original nominations shape: {dfs['cong_nominations'].shape}")
logger.info(f"Original nominees shape: {dfs['cong_nominees'].shape}")
logger.info(f"Merged dataframe shape: {dfs['cong_noms'].shape}")

# Show sample of the merged dataframe
display(dfs["cong_noms"].head())

[32m2025-07-17 21:08:03.295[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m495[0m - [1mExtracted 5517 URLs from nominees request column (100.0% of rows)[0m
[32m2025-07-17 21:08:03.298[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m519[0m - [1mNominations dataframe has 5487 non-null URLs (100.0% of rows)[0m
[32m2025-07-17 21:08:03.313[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m535[0m - [1mMerged dataframe has 5487 rows[0m
[32m2025-07-17 21:08:03.313[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m536[0m - [1mSuccessfully matched 5487 nominations with nominees (100.0%)[0m
[32m2025-07-17 21:08:03.315[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mOriginal nominations shape: (5487, 24)[0m


Unnamed: 0,request,actions_count,actions_url,authoritydate,citation,committees_count,committees_url,congress,description,hearings_count,...,nominees_0_firstname,nominees_0_lastname,nominees_0_middlename,nominees_0_ordinal,nominees_0_state,pagination_count,request_congress,request_number,nominees_0_predecessorname,nominees_0_suffix
0,"{'congress': '118', 'contentType': 'applicatio...",6,https://api.congress.gov/v3/nomination/118/201...,2025-05-12,PN2013,1.0,https://api.congress.gov/v3/nomination/118/201...,118,"Nicholas George Miranda, of the District of Co...",1.0,...,Nicholas,Miranda,George,1,DC,1,118,2013,,
1,"{'congress': '104', 'contentType': 'applicatio...",2,https://api.congress.gov/v3/nomination/104/556...,,PN556,1.0,https://api.congress.gov/v3/nomination/104/556...,104,"Bruce W. Greer, of Florida, to be United State...",,...,Bruce,Greer,W.,1,FL,1,104,556,James W. Kehoe,
2,"{'congress': '104', 'contentType': 'applicatio...",2,https://api.congress.gov/v3/nomination/104/577...,,PN577,1.0,https://api.congress.gov/v3/nomination/104/577...,104,"Gerald M. Shea, of the District of Columbia, t...",,...,Gerald,Shea,M.,1,DC,1,104,577,,
3,"{'congress': '104', 'contentType': 'applicatio...",5,https://api.congress.gov/v3/nomination/104/581...,,PN581,1.0,https://api.congress.gov/v3/nomination/104/581...,104,"Joseph Francis Baca, of New Mexico, to be a Me...",,...,Joseph,Baca,Francis,1,NM,1,104,581,,
4,"{'congress': '104', 'contentType': 'applicatio...",5,https://api.congress.gov/v3/nomination/104/582...,,PN582,1.0,https://api.congress.gov/v3/nomination/104/582...,104,"Bruce D. Black, of New Mexico, to be United St...",,...,Bruce,Black,D.,1,NM,1,104,582,Juan Guerrero Burciaga,


## Drop non-judge nominations based on organization or position title

I was curious whether the `nominees_0_organization` field expressed where the nominee was coming from, vs. where they were being nominated to.

Because if it was where they're being nominated from, then deleting rows with that outside the judiciary, tax courts, etc. would be erasure of people getting their first judgeship nomination.

Whereas if it was where they're being nominated to, then deleting rows with that outside the judiciary, tax courts, etc. would be a quick, simple way to clear out non-judge roles.

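To check this, look at the rows whose description mentions a judge but whose organization mentions neither the judiciary nor a court: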

In [None]:
# Select rows whose "description" mentions "judge" (case-insensitive) while
# "nominees_0_organization" mentions neither "Judiciary" nor "Court"; missing values
# count as "does not contain" in all three tests.
to_display = dfs["cong_noms"].loc[
    lambda noms: (
        noms["description"].str.contains("judge", case=False, na=False)
        & ~noms["nominees_0_organization"].str.contains("Judiciary", case=False, na=False)
        & ~noms["nominees_0_organization"].str.contains("Court", case=False, na=False)
    )
]
to_display.head()

Unnamed: 0,request,actions_count,actions_url,authoritydate,citation,committees_count,committees_url,congress,description,hearings_count,...,nominees_0_firstname,nominees_0_lastname,nominees_0_middlename,nominees_0_ordinal,nominees_0_state,pagination_count,request_congress,request_number,nominees_0_predecessorname,nominees_0_suffix
1127,"{'congress': '106', 'contentType': 'applicatio...",6,https://api.congress.gov/v3/nomination/106/127...,,PN127,1.0,https://api.congress.gov/v3/nomination/106/127...,106,"Robert A. Katzmann, of New York, to be United ...",,...,Robert,Katzmann,A.,1,NY,1,106,127,Jon O. Newman,
2845,"{'congress': '115', 'contentType': 'applicatio...",5,https://api.congress.gov/v3/nomination/115/246...,,PN2464,1.0,https://api.congress.gov/v3/nomination/115/246...,115,"Lisa M. Schenck, of Virginia, to be a Judge of...",,...,Lisa,Schenck,M.,1,VA,1,115,2464,,
2996,"{'congress': '116', 'contentType': 'applicatio...",6,https://api.congress.gov/v3/nomination/116/209...,,PN209,1.0,https://api.congress.gov/v3/nomination/116/209...,116,"Lisa M. Schenck, of Virginia, to be a Judge of...",,...,Lisa,Schenck,M.,1,VA,1,116,209,,
5423,"{'congress': '112', 'contentType': 'applicatio...",5,https://api.congress.gov/v3/nomination/112/111...,,PN1119,1.0,https://api.congress.gov/v3/nomination/112/111...,112,"William B. Pollard, III, of New York, to be a ...",,...,William,Pollard,B.,1,NY,1,112,1119,,III
5424,"{'congress': '112', 'contentType': 'applicatio...",5,https://api.congress.gov/v3/nomination/112/112...,,PN1120,1.0,https://api.congress.gov/v3/nomination/112/112...,112,"Scott L. Silliman, of North Carolina, to be a ...",,...,Scott,Silliman,L.,1,NC,1,112,1120,,


...so that told me the organization field is where they're coming from, not where they're being nominated to.  The results include a few people trying to move from places like the Department of Defense (or whose organization field was simply missing from our data) into a position as a judge.  So the organization field isn't an accurate selector for getting rid of non-judge rows.

Instead, I'll try to use the `nominees_0_positiontitle` field and the description text to filter out non-judicial nominations:

In [None]:
# Filter out non-judicial nominations using the function from features.py
from nomination_predictor.features import filter_non_judicial_nominations

# Replace the merged frame with the result of the non-judicial nomination filter.
dfs.update(cong_noms=filter_non_judicial_nominations(dfs["cong_noms"]))

[32m2025-07-17 21:08:03.477[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_non_judicial_nominations[0m:[36m193[0m - [1mFound 2383 rows with non-judicial titles[0m
[32m2025-07-17 21:08:03.482[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_non_judicial_nominations[0m:[36m199[0m - [1mRemoved 2383/5487 corresponding records[0m


## Populate new, cleaner columns from description or other columns

In [None]:
from nomination_predictor.nomination_description_parser import \
    parse_descriptions_to_columns

# Swap the nominations frame for the parser's return value
dfs['cong_noms'] = parse_descriptions_to_columns(dfs['cong_noms'])

# Raw description next to the columns checked below, to verify extraction by eye
sample_cols = [
    'description',
    'nominee_name',
    'nomination_of_or_from_location',
    'nomination_to_position_title',
    'nomination_to_court_name',
    'nomination_predecessor_name',
    'nomination_vacancy_reason',
    'nomination_parsing_confidence',
]

df_to_inspect = dfs['cong_noms'].loc[:, sample_cols]
display(df_to_inspect.head(20))

Unnamed: 0,description,nominee_name,nomination_of_or_from_location,nomination_to_position_title,nomination_to_court_name,nomination_predecessor_name,nomination_vacancy_reason,nomination_parsing_confidence
0,"Nicholas George Miranda, of the District of Co...",Nicholas George Miranda,District of Columbia,Associate Judge,Superior Court of the District of Columbia,Rupa Ranga Puttagunta,resigned,high
1,"Bruce W. Greer, of Florida, to be United State...",Bruce W. Greer,Florida,United States District Judge,Southern District of Florida,James W. Kehoe,retired,high
4,"Bruce D. Black, of New Mexico, to be United St...",Bruce D. Black,New Mexico,United States District Judge,District of New Mexico,Juan Guerrero Burciaga,retired,high
8,"Susan J. Dlott, of Ohio, to be United States D...",Susan J. Dlott,Ohio,United States District Judge,Southern District of Ohio,S. Arthur Spiegel,retired,high
12,"Hilda G. Tagle, of Texas, to be United States ...",Hilda G. Tagle,Texas,United States District Judge,Southern District of Texas,a new position created by Public Law 101-650,"approved December 1, 1990",high
13,"Kim McLane Wardlaw, of California, to be Unite...",Kim McLane Wardlaw,California,United States District Judge,Central District of California,David V. Kenyon,retired,high
14,"E. Richard Webber, of Missouri, to be United S...",E. Richard Webber,Missouri,United States District Judge,Eastern District of Missouri,Edward L. Filippine,retired,high
15,"Merrick B. Garland, of Maryland, to be United ...",Merrick B. Garland,Maryland,United States Circuit Judge,District of Columbia Circuit,Abner J. Mikva,retired,high
18,"Hugh Lawson, of Georgia, to be United States D...",Hugh Lawson,Georgia,United States District Judge,Middle District of Georgia,Wilbur D. Owens,retired,high
20,"Michael R. Murphy, of Utah, to be United State...",Michael R. Murphy,Utah,United States Circuit Judge,Tenth Circuit,Monroe G. McKay,retired,high


In [None]:
# Report extraction statistics with confidence breakdown.
# "Filled" means the parsed column holds a non-null value for that row.
total_rows = len(dfs['cong_noms'])
name_filled = dfs['cong_noms']['nominee_name'].notna().sum()
location_filled = dfs['cong_noms']['nomination_of_or_from_location'].notna().sum()
court_filled = dfs['cong_noms']['nomination_to_court_name'].notna().sum()
predecessor_filled = dfs['cong_noms']['nomination_predecessor_name'].notna().sum()
confidence_counts = dfs['cong_noms']['nomination_parsing_confidence'].value_counts()

logger.info("Description parser results:")
for label, filled in (
    ("Names", name_filled),
    ("Locations", location_filled),
    ("Courts", court_filled),
    ("Predecessors", predecessor_filled),
):
    logger.info(f"  {label} extracted: {filled}/{total_rows} ({filled/total_rows:.1%})")

# .to_dict() yields plain Python ints, so the log reads {'high': 3103, ...}
# rather than {'high': np.int64(3103), ...}
logger.info(f"  Confidence breakdown: {confidence_counts.to_dict()}")

[32m2025-07-17 21:08:04.063[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mDescription parser results:[0m
[32m2025-07-17 21:08:04.064[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1m  Names extracted: 3103/3104 (100.0%)[0m
[32m2025-07-17 21:08:04.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1m  Locations extracted: 3103/3104 (100.0%)[0m
[32m2025-07-17 21:08:04.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1m  Courts extracted: 3101/3104 (99.9%)[0m
[32m2025-07-17 21:08:04.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1m  Predecessors extracted: 3051/3104 (98.3%)[0m
[32m2025-07-17 21:08:04.066[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1m  Confidence breakdown: {'high': np.int64(3103), 'medium': np.int64(1)}[0m


In [None]:
# Boolean flag per row: True where the parser reported 'high' confidence
high_confidence = dfs['cong_noms']['nomination_parsing_confidence'].eq('high')
logger.info(
    f"  High confidence parses: {high_confidence.sum()}/{total_rows} ({high_confidence.mean():.1%})"
)

[32m2025-07-17 21:08:04.090[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1m  High confidence parses: 3103/3104 (100.0%)[0m


In [None]:
# Immediately drop the parsing-confidence column. It was only needed for the reporting
# in this section, and retaining it would be confusing clutter.
# errors="ignore" + reassignment keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-removed column.
dfs["cong_noms"] = dfs["cong_noms"].drop(
    columns=["nomination_parsing_confidence"], errors="ignore"
)

### FJC Biography links

In [None]:
# Add an "fjc_biography_url" column to dfs["fjc_demographics"]: the string
# "http://www.fjc.gov/node/" followed by that row's integer "nid".
# Rows whose nid is missing, non-numeric, or not a whole number are left as NaN
# (the assignment aligns on the index, so rows filtered out below receive no value).
# This won't have any utility for model training and evaluation, but our streamlit app may be able to make use of it.

svc = dfs["fjc_demographics"]  # NOTE: despite its name, this is the demographics frame

nid_numeric = pd.to_numeric(svc["nid"], errors="coerce")        # non-numeric -> NaN
# Only whole numbers qualify; this avoids silently truncating e.g. 12.5 to 12
has_whole_nid = nid_numeric.notna() & (nid_numeric % 1 == 0)

svc["fjc_biography_url"] = (
    "http://www.fjc.gov/node/"
    + nid_numeric[has_whole_nid]
        .astype(int)                                            # cast to int for clean string
        .astype(str)
)
dfs["fjc_demographics"] = svc

## Convert date strings and floats to ints or datetime objects

In [None]:
# Convert datetime-related columns to appropriate types.
#
# Column names are matched by substring, in this priority order:
#   1. "date"                    -> datetime64 (only if the column currently holds strings)
#   2. "year" / "month" / "_day" -> pandas nullable Int64 (only if strings or floats)
# Unparseable values become NaT / <NA> (errors="coerce") rather than raising.

# substring searched for in the lower-cased column name -> word used in the log message
INTEGER_DATE_PARTS = {"year": "year", "month": "month", "_day": "day"}

for name, df in dfs.items():
    for col in df.columns:
        col_lower = col.lower()
        dtype = df[col].dtype

        # A "date" column never falls through to the integer branch, even if its
        # name also contains "year"/"month"/"_day"
        if "date" in col_lower:
            if dtype == "object":  # String columns
                logger.info(f"Converting date column {col} from string to datetime for {name}")
                df[col] = pd.to_datetime(df[col], errors="coerce")
            continue

        # First matching part wins, mirroring the year -> month -> day precedence
        part = next(
            (label for key, label in INTEGER_DATE_PARTS.items() if key in col_lower),
            None,
        )
        if part is not None and (dtype == "object" or "float" in str(dtype)):
            logger.info(f"Converting {part} column {col} to integer for {name}")
            # to_numeric handles string representations of numbers;
            # Int64 is the nullable integer dtype, so missing values survive as <NA>
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

[32m2025-07-17 21:08:04.257[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConverting month column birth_month to integer for fjc_judges[0m
[32m2025-07-17 21:08:04.262[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mConverting day column birth_day to integer for fjc_judges[0m
[32m2025-07-17 21:08:04.265[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mConverting year column birth_year to integer for fjc_judges[0m


[32m2025-07-17 21:08:04.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConverting month column death_month to integer for fjc_judges[0m
[32m2025-07-17 21:08:04.282[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mConverting day column death_day to integer for fjc_judges[0m
[32m2025-07-17 21:08:04.285[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mConverting year column death_year to integer for fjc_judges[0m
[32m2025-07-17 21:08:04.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mConverting date column recess_appointment_date_(1) from string to datetime for fjc_judges[0m
[32m2025-07-17 21:08:04.294[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mConverting date column nomination_date_(1) from string to datetime for fjc_judges[0m
[32m2025-07-17 21:08:04.299[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:

## Normalize several columns' string values to make matching them later easier

In [None]:
from nomination_predictor.name_matching import normalize_text

# A column is normalized when its name contains any of these substrings (case-insensitive)
keywords_which_denote_string_columns_to_normalize = (
    "court",
    "circuit",
    "district",
    "description",
    "name",
    'degree',
    "school",
)

for name, df in dfs.items():
    # Pick the object-dtype columns whose name hits a keyword, then rewrite each in place
    columns_to_normalize = [
        col
        for col in df.columns
        if any(
            keyword in col.casefold()
            for keyword in keywords_which_denote_string_columns_to_normalize
        )
        and df[col].dtype == object
    ]
    for col in columns_to_normalize:
        logger.info(f"Normalizing all values within column named {col} in {name}")
        df[col] = df[col].apply(normalize_text)

[32m2025-07-17 21:08:04.820[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named last_name in fjc_judges[0m
[32m2025-07-17 21:08:04.833[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named first_name in fjc_judges[0m
[32m2025-07-17 21:08:04.853[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named middle_name in fjc_judges[0m
[32m2025-07-17 21:08:04.868[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named court_type_(1) in fjc_judges[0m
[32m2025-07-17 21:08:04.880[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named court_name_(1) in fjc_judges[0m
[32m2025-07-17 21:08:04.890[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8

## Count and display columns that contain no non-null values:
It looks like our left-merge of the two congress dataframes leaves some columns entirely blank, and we still have some that are populated but unhelpful.

In [None]:
# List every column that has zero distinct non-null values, frame by frame
for name, df in dfs.items():
    columns_without_values = [col for col in sorted(df.columns) if df[col].nunique() < 1]
    for col in columns_without_values:
        print(f"{name} - {col}: {df[col].nunique()} unique values")

cong_noms - islist: 0 unique values
cong_noms - nominees_0_introtext: 0 unique values


...so now's an okay time to delete them

In [None]:
# Store the cleaned version of every frame back under the same key
for name, df in list(dfs.items()):
    dfs[name] = df = drop_unhelpfully_uninformative_columns(df)

Columns with limited unique values:
  - aba_rating_(4): 1 unique non-null value 'Well Qualified' (0.0% of rows) - KEEPING
  - appointing_president_(5): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointing_president_(6): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointment_title_(4): 1 unique non-null value 'Judge' (0.3% of rows) - KEEPING
  - appointment_title_(5): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - appointment_title_(6): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - ayes/nays_(4): 1 unique non-null value '' (0.3% of rows) - KEEPING
  - ayes/nays_(5): 1 unique non-null value '' (0.0% of rows) - KEEPING
  - ayes/nays_(6): 1 unique non-null value '' (0.0% of rows) - KEEPING
  - commission_date_(5): 1 unique non-null value '1949-02-02 00:00:00' (0.0% of rows) - KEEPING
  - commission_date_(6): 1 unique non-null value '1949-02-02 00:00:00' (0.0% of rows) - KEEPING
  - committee_action_date_(5)

# Name-matching FJC judges to Congress.gov nominees

## For confirmed judges

### Performing the matching operations

In [None]:
from nomination_predictor.name_matching import perform_exact_name_matching

# Match Congress nominee names (nominee_name) against FJC judge names (judge_name).
# Later cells use the returned frame's congress_index, nid and ambiguous columns.
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices= perform_exact_name_matching(
    congress_df=dfs["cong_noms"],
    fjc_df=dfs["fjc_federal_judicial_service"],
    congress_name_col="nominee_name",
    fjc_name_col="judge_name"
)

[32m2025-07-17 21:08:06.149[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m123[0m - [1mStarting exact name matching with 3104 Congress records and 4720 FJC records[0m
[32m2025-07-17 21:08:06.805[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m142[0m - [1mPerforming first-pass join on last and first name[0m
[32m2025-07-17 21:08:06.835[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m149[0m - [1mFound 2365 total records with last+first name matches[0m
[32m2025-07-17 21:08:06.849[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m177[0m - [1mFound 52 ambiguous matches, attempting middle initial disambiguation[0m
[32m2025-07-17 21:08:06.849[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_n

In [None]:
# Show the first few matches to eyeball the paired names and the match_type / ambiguous flags
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices.head()

Unnamed: 0,congress_index,congress_name,fjc_name,nid,match_type,ambiguous
0,4,bruce d. black,"black, bruce d.",1377866.0,first_and_last_name,False
1,8,susan j. dlott,"dlott, susan j.",1380076.0,first_and_last_name,False
2,12,hilda g. tagle,"tagle, hilda g.",1390611.0,first_and_last_name,False
3,13,kim mclane wardlaw,"wardlaw, kim mclane",1389371.0,first_and_last_name,False
4,14,e. richard webber,"webber, e. richard",1389491.0,first_and_last_name,False


In [None]:
# Keep only *unambiguous* pairs and turn them into a lookup Series:
# index = congress_index (row label in the Congress frame), value = FJC nid.
nid_map = (
    results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices
      .loc[lambda matches: ~matches["ambiguous"]]        # drop rows still ambiguous
      .set_index("congress_index")["nid"]
)

# Series.map() needs a uniquely-indexed lookup; fail here with a readable message
# rather than later with an opaque reindexing error.
assert nid_map.index.is_unique, (
    "nid_map has duplicate congress_index values: "
    f"{nid_map.index[nid_map.index.duplicated()].unique().tolist()[:10]}"
)

In [None]:
# at long last, we have a way to bridge the gap between the congress.gov data and the fjc data

# we can now use the nid_map to add the nid column to the congress.gov data.
# nid_map is keyed by congress_index, so each row is looked up by its index label;
# rows with no entry in nid_map end up with a missing nid.
dfs["cong_noms"]["nid"] = dfs["cong_noms"].index.to_series().map(nid_map)
# alias to the same frame object held in dfs (not a copy)
cong_noms = dfs["cong_noms"]

## For unconfirmed judges

In practice, given the dataframes as I've got them as of typing this, this section doesn't find any remaining unconfirmed judges to match.

What this section _did_ accomplish was showing me that the presence of diacritical marks such as "ñ" or "é" in names was misleading the matching process.

Discovering and addressing that in much-earlier data-normalizing cells led to getting more matches in our confirmed-judges-matching notebook section.

### Supplementing with additional columns to aid matching

In [None]:
from nomination_predictor.name_matching import prep_fjc_other

# Replace the "other nominations / recess" frame with the version returned by prep_fjc_other.
# NOTE(review): presumably this adds name-matching helper columns, per the section heading -- confirm in name_matching.py
dfs["fjc_other_nominations_recess"] = prep_fjc_other(fjc_other_df=dfs["fjc_other_nominations_recess"])

In [None]:
# Alias for the prepared frame (same object as the dfs entry, not a copy)
fjc_other_supplemented =dfs["fjc_other_nominations_recess"]

### Inspecting for the unconfirmed nominee matching possibilities

In [None]:
from nomination_predictor.features import link_unconfirmed_nominations

# Replace cong_noms with the frame returned by link_unconfirmed_nominations, which receives
# the Congress nominations and the prepared FJC "other nominations / recess" frame.
dfs["cong_noms"] = link_unconfirmed_nominations(dfs["cong_noms"], dfs["fjc_other_nominations_recess"])

[32m2025-07-17 21:08:07.353[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m123[0m - [1mStarting exact name matching with 457 Congress records and 828 FJC records[0m
[32m2025-07-17 21:08:07.483[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m142[0m - [1mPerforming first-pass join on last and first name[0m
[32m2025-07-17 21:08:07.504[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m149[0m - [1mFound 0 total records with last+first name matches[0m
[32m2025-07-17 21:08:07.505[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m153[0m - [1mNO last+first name matches found. Checking last-name-only matches for diagnosis...[0m
[32m2025-07-17 21:08:07.515[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperfor

### Deciding not to merge the "Other Nominations" dataframe (at least not yet)
In theory this could get us the fjc's perspective on more nominees who didn't get confirmed.  

In practice my runs of name-matching didn't find any unambiguous matches from this dataframe to the congress one.  

So I put the idea of merging it, too, on hold.  Can try another day, maybe after seeing whether the additional data would help, or if ever discovering something major could be fixed/improved about the name matcher.

# Combining the rest of the FJC data now that our congress dataframe has been enriched with FJC nid

## Demographics, Education, and Professional Career history

Before we combine FJC data, we have to consider whether/how to handle judges' education, job history, age, ABA rating, etc.  Most/all of the data in the "demographics" dataframe is unchanging over time, but that's very much _not_ true of the other dataframes.

The simplest way to handle it would be to left-merge on "nid" and only take the most recently-dated row, or row with the highest sequence number.  In most cases this would likely land on keeping the most prestigious degree or job.

However, it is entirely likely a judge's education or job history has changed substantially since their first nomination, and affected their qualifications for each later nomination.

All of these indicate to me that it's worth merging onto each row that judge's position, education, etc., not as of the most recent records available, but instead _as of when they were nominated._

That means we can't do a too-simple left-join of all of our FJC data.  Instead, now that we've done the step of matching NIDs to congress' data on nominations, we can use the "received date" for each congress citation as a cutoff date when we look up education and job records by "nid" -- so we can avoid mistakenly linking to a citation any education & job records dated after that cutoff date.

Thankfully we do have the school, degree, and degree_year in the education record, for both their bachelors and their masters and their associate degree(s) and LLB and J.D. etc., so we can look that up.  The education dataframe even comes with a "sequence" number for each education record, which is another indicator of chronological order in addition to degree_year for any given "nid" lookup for a judge.

Job history is more challenging to deal with because literally every row entry in that dataframe lists it uniquely, but we do have the data available.  My earliest attempts to feature-engineer with it include looking for keywords in it, then creating boolean features for whether they did/didn't have experience in common-phrase-identifiable positions such as "Private practice" or "Attorney general" or "Navy" or "Army" etc. Theoretically a parser can look for the year spreads listed there as a rough indicator of amounts of experience gleaned from each professional role & when, but that may be too complicated for me to accomplish by the time I'm first presenting this work.

In [None]:

import pandas as pd

# Left-join FJC demographics onto the nominations by nid; on a column-name clash the
# demographics copy is the one that gets the "_fjcdemographics" suffix
dfs["cong_noms"] = dfs["cong_noms"].merge(
    dfs["fjc_demographics"],
    how="left",
    on="nid",
    suffixes=("", "_fjcdemographics"),
)

In [None]:
from nomination_predictor.time_aware_analysis import (
    merge_latest_career, merge_latest_education, merge_nearest_fed_service)

# Attach education and professional-career information to each nomination row.
# NOTE(review): per this notebook's narrative these helpers are meant to use only records
# dated on or before the nomination's received date -- confirm in time_aware_analysis.py.
dfs["cong_noms"] = merge_latest_education(dfs["cong_noms"], dfs["fjc_education"])
dfs["cong_noms"] = merge_latest_career(dfs["cong_noms"], dfs["fjc_professional_career"])

In [None]:
# ――― Map the raw `degree` strings to an ordinal `highest_degree_level` ―――
import re

import numpy as np

# normalised pattern → numeric level
_DEGREE_MAP = {
    r"(s\.?j\.?d\.?|j\.?s\.?d\.?|ph\.?d\.?)":           6,  # research doctorates
    r"(ll\.?m\.?)":                                      5,  # Master of Laws
    r"(j\.?d\.?|doctor of jurisprudence|juris doctor)":   5,
    r"(ll\.?b\.?)":                                      5,  # historical bachelor of laws
    r"(m\.?d\.?)":                                       5,  # medical doctor (rare)
    r"(m\.?(a|s|b|p)\.?)":                               3,  # generic master’s
    r"(b\.?(a|s)\.?)":                                   2,  # bachelor’s
    r"(a\.?(a|s)\.?)":                                   1,  # associate degree
}

# compile one big regex for speed
_COMPILED = [(re.compile(p, re.I), lvl) for p, lvl in _DEGREE_MAP.items()]

def _degree_to_level(text: str | float) -> float:
    """Return numeric level or np.nan if missing/unrecognised."""
    if pd.isna(text):
        return np.nan
    for pat, lvl in _COMPILED:
        if pat.search(str(text)):
            return lvl
    return np.nan

# Ordinal level for the `degree` value on each nomination row. Nullable Int64 keeps
# missing/unrecognised degrees as <NA> while the recognised levels stay integers.
dfs["cong_noms"]["highest_degree_level"] = dfs["cong_noms"]["degree"].apply(_degree_to_level).astype("Int64")

# quick sanity check (fixed seed so the displayed sample is identical on every run)
dfs["cong_noms"][["degree", "highest_degree_level"]].sample(10, random_state=0)

Unnamed: 0,degree,highest_degree_level
1018,j.d.,5.0
214,ll.m.,5.0
728,ll.b.,5.0
2158,j.d.,5.0
1494,,
956,ll.b.,5.0
1659,,
1391,j.d.,5.0
757,j.d.,5.0
3008,j.d.,5.0


In [None]:
# Attach FJC federal judicial service data to each nomination row.
# NOTE(review): row selection happens inside merge_nearest_fed_service (presumably
# nearest-in-time, going by its name) -- confirm in time_aware_analysis.py.
dfs["cong_noms"] = merge_nearest_fed_service(dfs["cong_noms"], dfs["fjc_federal_judicial_service"])

# Feature engineering 

Above I created some columns, but in general it leaned away from generating new data, or interpolations, and more about cleaning or splitting apart existing data.

From here on in the notebook we'll treat `df` as our working copy instead of `dfs["cong_noms"]`.

In [None]:
from datetime import date

from nomination_predictor.time_aware_analysis import (
    congress_number, congress_session, days_into_current_term,
    days_until_next_midterm_election, days_until_next_presidential_election,
    fill_missing_appointing_presidents,
    fill_missing_party_of_appointing_presidents, normalize_party_codes,
    presidential_term_index)

# Work on an independent copy so the feature engineering below does not modify dfs["cong_noms"]
df = dfs["cong_noms"].copy()
df["receiveddate"] = pd.to_datetime(df["receiveddate"])   # ensure datetime

## Target variables for downstream notebook to train a model to predict:

### Days from nomination to latest action

number of days between `receiveddate` and `latestaction_actiondate`

This includes both confirmed nominations and returned ones.

We don't have as much biographical / education / ABA qualification rating / job history / etc. for returned nominations.

And for reasons described in a section below, their FJC `seat_id` values aren't representative of the same concept as for confirmed nominations; treating them the same for both would make for an inaccurate model.

So at this time I'm not recommending this as our primary target variable. I'm mostly collecting it as a prerequisite calculation for a cell below, and we can retain it for curiosity's sake through downstream Exploratory Data Analysis.

In [None]:
print("Calculating days from nomination to latest action...")

# Rows where both dates are present (the only rows that can get a duration)
valid_dates_mask = df['receiveddate'].notna() & df['latestaction_actiondate'].notna()

# Whole days between the two dates. Subtracting datetime columns gives NaT wherever
# either side is missing; casting to nullable Int64 stores those rows as <NA> while
# keeping a real integer dtype (instead of an object column seeded with pd.NA).
df['days_nom_to_latest_action'] = (
    (df['latestaction_actiondate'] - df['receiveddate']).dt.days.astype("Int64")
)

# Flag rows whose latest action predates the received date. <NA> comparisons are
# treated as "not negative" so the mask is a plain True/False selector.
negative_days_mask = (df['days_nom_to_latest_action'] < 0).fillna(False)
if negative_days_mask.any():
    logger.warning(f"{negative_days_mask.sum()} rows have negative duration (latest action before received date)")
    logger.warning("Sample of problematic rows:")
    display(df.loc[negative_days_mask, ['receiveddate', 'latestaction_actiondate', 'days_nom_to_latest_action']].head(3))
    # Negative durations are deliberately kept; to discard them instead, set
    # df.loc[negative_days_mask, 'days_nom_to_latest_action'] to pd.NA here.

# Print summary statistics
days_count = df['days_nom_to_latest_action'].notna().sum()
logger.info(f"Successfully calculated duration for {days_count} nominees ({days_count/len(df):.1%} of dataset)")

if days_count > 0:
    logger.info("Duration statistics (days):")
    logger.info(f"- Min: {df['days_nom_to_latest_action'].min()} days")
    logger.info(f"- Max: {df['days_nom_to_latest_action'].max()} days")
    logger.info(f"- Mean: {df['days_nom_to_latest_action'].mean():.1f} days")
    logger.info(f"- Median: {df['days_nom_to_latest_action'].median():.1f} days")

Calculating days from nomination to latest action...
[32m2025-07-17 21:08:20.134[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mSuccessfully calculated duration for 3104 nominees (100.0% of dataset)[0m
[32m2025-07-17 21:08:20.136[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mDuration statistics (days):[0m
[32m2025-07-17 21:08:20.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1m- Min: 1 days[0m
[32m2025-07-17 21:08:20.138[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1m- Max: 727 days[0m
[32m2025-07-17 21:08:20.139[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m- Mean: 154.9 days[0m
[32m2025-07-17 21:08:20.141[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1m- Median: 122.0 days[0m


### Days from nomination to confirmation

This would allow training a model based only on confirmed cases.

In [None]:
print("Creating days_nom_to_conf column for confirmed nominations only...")

# A nomination counts as confirmed when its latest-action text contains "confirmed"
# (case-insensitive); rows with missing text count as not confirmed.
confirmed_mask = df['latestaction_text'].str.contains('confirmed', case=False, na=False)

# Same duration as days_nom_to_latest_action, but <NA> for every non-confirmed row.
# The cast to nullable Int64 keeps the column numeric rather than object dtype.
df['days_nom_to_conf'] = (
    df['days_nom_to_latest_action'].astype("Int64").where(confirmed_mask)
)

if confirmed_mask.any():
    # Print summary statistics
    confirmed_count = confirmed_mask.sum()
    logger.info(f"Found {confirmed_count} confirmed nominations ({confirmed_count/len(df):.1%} of dataset)")

    days_conf_count = df['days_nom_to_conf'].notna().sum()
    logger.info(f"Successfully calculated confirmation duration for {days_conf_count} nominees")

    if days_conf_count > 0:
        logger.info("Confirmation duration statistics (days):")
        logger.info(f"- Min: {df['days_nom_to_conf'].min()} days")
        logger.info(f"- Max: {df['days_nom_to_conf'].max()} days")
        logger.info(f"- Mean: {df['days_nom_to_conf'].mean():.1f} days")
        logger.info(f"- Median: {df['days_nom_to_conf'].median():.1f} days")
else:
    logger.warning("No confirmed nominations found in the dataset")

Creating days_nom_to_conf column for confirmed nominations only...
[32m2025-07-17 21:08:20.165[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mFound 2126 confirmed nominations (68.5% of dataset)[0m
[32m2025-07-17 21:08:20.168[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mSuccessfully calculated confirmation duration for 2126 nominees[0m
[32m2025-07-17 21:08:20.168[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mConfirmation duration statistics (days):[0m
[32m2025-07-17 21:08:20.170[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1m- Min: 3 days[0m
[32m2025-07-17 21:08:20.171[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m- Max: 684 days[0m
[32m2025-07-17 21:08:20.172[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1m- Mean: 130.4 days[0m
[32m2025-07-17 21:08:20.176[0m | [1mINFO    [0m | 

## Removing expedited nominations

`isPrivileged` Flag indicates whether the nomination is privileged and entitled to expedited procedures.

Before we removed non-judge roles in earlier notebook cells, these were only about 68 out of ~2000 civilian nominations we had the option of looking at.  And now that we're only looking for judges it's even fewer.

Majority of our entries for this field are missing, and the handful that aren't are all provided to us as True, so it feels safe to assume the rest can be presumed as False.

There's few-enough of them, and by definition it causes different-enough political procedures, that removing them is a good idea to focus our model training & predictions on normal nominations.

In [None]:
# Treat every value other than True (including missing) as "not privileged".
# Comparing with .eq(True) produces a clean boolean column directly and avoids the
# pandas FutureWarning raised by calling fillna(False) on an object-dtype column.
df['isprivileged'] = df['isprivileged'].eq(True).fillna(False).astype(bool)

privileged_count = int(df['isprivileged'].sum())
print(f"This many rows listed the 'isprivileged' column as True: {privileged_count}")
print(f"This many rows did not list the 'isprivileged' column as True: {len(df) - privileged_count}")
logger.info(f"Removing the {privileged_count} rows where 'isprivileged' is True...")

# Keep only the non-privileged rows; .copy() detaches the result from the old frame
df = df[~df['isprivileged']].copy()

This many rows listed the 'isprivileged' column as True: 2
This many rows did not list the 'isprivileged' column as True: 3102
[32m2025-07-17 21:08:20.194[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mRemoving the 2 rows where 'isprivileged' is True...[0m


  df['isprivileged'] = df['isprivileged'].fillna(False)


## Pull yea/nay votes and confirmed/returned from description into their own columns

In [None]:
from nomination_predictor.latestaction_parser import enrich_latest_action

# Project helper that returns the enriched frame; later cells read the
# "latest_action_taken" column (e.g. == "confirmed") and drop an "ayes/nays" column,
# presumably both produced here — TODO confirm against enrich_latest_action.
df = enrich_latest_action(df)

## Categorizing seats: the difference between FJC `seat_ID` vs. parsing Congress API's `description`

At some point we have to categorize where judges held seats & where they've been nominated to.

One of the convenient things about FJC's `seat_id` is that the numeric portion can be lopped off to make an easy categorical indicator of the circuit/district the judge worked in.

In [None]:
# For rows whose latest action is "confirmed", copy only the alphabetic portion of
# df["seat_id"] into df["seat_id_letters_only"]; every other row is left as NaN.
# expand=False makes str.extract return a Series, so the assignment does not depend on
# pandas coercing a one-column DataFrame into a single column.
confirmed_mask = df["latest_action_taken"] == "confirmed"
df["seat_id_letters_only"] = (
    df.loc[confirmed_mask, "seat_id"].str.extract(r"([A-Za-z]+)", expand=False)
)

But importantly, given the way the notebook sections above merge data, FJC's `seat_id` isn't necessarily an indicator of where the judge is being nominated _to_; it's an indicator of where the judge _worked_, according to the FJC data row that the cells above left-merged.

That's the same thing in the case of a successful confirmation, but is _not_ true for nominations which got returned.

If we really want the position they got nominated for, we can get that more confidently -- for both confirmed and unconfirmed nominees -- from the column we created earlier based on `description`: `nomination_to_position_title`

In [None]:
# Identify the seat level from the nominated position title.
# - The title is lower-cased first, so matching is case-insensitive.
# - str.extract returns the LEFTMOST keyword found, so a title containing several
#   keywords is labelled by whichever appears first in the text.
# - "tax" must be preceded by whitespace, so it only matches at the start of a word
#   ("tax", "taxation", ...) and never inside a longer word.
# - Titles with no keyword (or missing titles) fall back to "other".
# expand=False yields a Series rather than a one-column DataFrame.
df["seat_level"] = (
    df["nomination_to_position_title"]
      .str.lower()
      .str.extract(r"(supreme|circuit|district|(?<=\s)tax|international|appeals)", expand=False)
      .fillna("other")
)

In [None]:
# Distribution of seat levels, as absolute counts and as a share of all rows.
seat_counts = df["seat_level"].value_counts()
total_rows = len(df)
print(f"\nDistribution of seat levels (total {total_rows} records):")
for seat, count in seat_counts.items():
    print(f"- {seat}: {count} ({count/total_rows:.1%})")

# Show up to two example descriptions per seat level for manual validation.
# A fixed random_state keeps the printed examples stable across Restart & Run All.
print("\nExample descriptions for each seat level:")
for seat_type in df["seat_level"].unique():
    seat_descriptions = df.loc[df["seat_level"] == seat_type, "description"]
    examples = seat_descriptions.sample(min(2, len(seat_descriptions)), random_state=42)
    print(f"\n{seat_type.upper()} examples:")
    for ex in examples:
        print(f"  • {ex}")


Distribution of seat levels (total 3102 records):
- district: 2011 (64.8%)
- circuit: 588 (19.0%)
- other: 470 (15.2%)
- appeals: 31 (1.0%)
- tax: 2 (0.1%)

Example descriptions for each seat level:

OTHER examples:
  • mary ellen coster williams, of maryland, to be a judge of the united states court of federal claims for a term of fifteen years, vice sarah l. wilson.
  • tovah r. calderon, of the district of columbia, to be an associate judge of the district of columbia court of appeals for the term of fifteen years, vice kathryn a. oberly, retired.

DISTRICT examples:
  • christy criswell wiegand, of pennsylvania, to be united states district judge for the western district of pennsylvania, vice peter j. phipps, elevated.
  • james david cain, jr., of louisiana, to be united states district judge for the western district of louisiana, vice patricia head minaldi, retired.

CIRCUIT examples:
  • stephanos bibas, of pennsylvania, to be united states circuit judge for the third circuit, 

#### Why drop supreme courts & merge some others?
Supreme Court confirmations follow a distinct political logic (public hearings, cloture dynamics, national spotlight) and appear only a handful of times in our data.

Mathematically, keeping them risks the model treating them as noise & under-performing on everything else.

Tax, International Trade, and specialist appeals courts each comprise so few rows that merging them prevents the model from over‑fitting noise, while still retaining their information under a generic “other” flag.

Building the primary model on the district and circuit categories (vast majority of the data) plus an “other” bucket yields a cleaner, more reliable signal.

In [None]:
# Collapse the sparsely-populated seat levels into a single "other" bucket
sparse_levels = ["tax", "appeals", "international", "other"]
df["seat_level_fjc_recategorized"] = df["seat_level"].replace(
    dict.fromkeys(sparse_levels, "other")
)

# Exclude Supreme Court nominations from the working frame
is_not_supreme = df["seat_level_fjc_recategorized"].ne("supreme")
df = df.loc[is_not_supreme].copy()

# Sanity check: share of rows in each remaining category
category_shares = df["seat_level_fjc_recategorized"].value_counts(normalize=True)
print(category_shares.round(3))

seat_level_fjc_recategorized
district    0.648
circuit     0.190
other       0.162
Name: proportion, dtype: float64


## Senate political eras

EDA shows our data isn't stationary -- since 1980 there's been a trend for confirmations to take longer over time (with some indications it may be speeding up again, but not enough time has passed to show that with the same certainty).

Rather than stratifying our downstream train-test splits by arbitrary years, it'd be more accurate to stratify by when the senate's rules and procedures experienced historically noteworthy changes.

We'll add a column to denote that.  As of typing I'm undecided whether it'd be more conceptually accurate to pin it on the date the nomination was received vs. the date the nomination was confirmed, so I'll just pick the date the nomination was received because it's simple to apply to unconfirmed nominations as well, and would mean the same thing in either type of row.

In [None]:
# Boundary dates between Senate procedural eras. Bins are closed on the left
# (right=False), so a nomination received exactly on a boundary date belongs to the era
# that begins on that date.
era_boundaries = pd.to_datetime([
    "1900-01-01",       # Start bin well below lower bound of data
    "2013-11-21",       # Nuclear option I; https://en.wikipedia.org/wiki/Nuclear_option
    "2017-04-06",       # Nuclear option II; https://www.politico.com/story/2017/04/senate-neil-gorsuch-nuclear-option-236937
    "2019-04-03",       # Debate‑time cut; https://www.congress.gov/crs-product/RL31980
    "2030-01-01"        # semi-arbitrary future end date for last bin; https://www.grassley.senate.gov/news/news-releases/grassley-we-are-upholding-blue-slip-courtesy-vast-majority-judiciary-chairman
])
# One label per interval between consecutive boundaries
era_labels = ["pre‑2013", "2013‑2017", "2017‑2019", "post‑2019"]

df["received_in_senate_political_era"] = pd.cut(
    df["receiveddate"], bins=era_boundaries, labels=era_labels, right=False
)

## Presidency-related

In [None]:

# presidency- and elections-timeline-related features, each computed per row from the
# date the Senate received the nomination
df["pres_term_idx"] = df["receiveddate"].apply(presidential_term_index)
df["days_into_pres_term"] = df["receiveddate"].apply(days_into_current_term)
df["days_to_next_pres_election"] = df["receiveddate"].apply(days_until_next_presidential_election)
df["days_to_next_midterm_election"] = df["receiveddate"].apply(days_until_next_midterm_election)
df["congress_num"] = df["receiveddate"].apply(congress_number)
df["congress_session"] = df["receiveddate"].apply(congress_session)

# Remove the letters "PN" prefixing every citation and convert the remaining digits to
# integer, to make it more obviously a numeric variable.
# Casting to str first keeps this cell re-runnable: once the column is already integer,
# the .str accessor would otherwise raise AttributeError. regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df["citation"] = df["citation"].astype(str).str.replace("PN", "", regex=False).astype(int)

In [None]:
# Subtract 1 from each non-missing value in "pres_term_idx" and rename the column to
# "pres_term_is_latter_term" (presumably 0 = first term, positive = later term, assuming
# presidential_term_index is 1-based — TODO confirm).
# NOTE(review): the values are NOT cast to a boolean dtype here; the column keeps the
# shifted numeric index and missing values stay missing.
# The guard makes the cell safe to re-run after the rename has already happened, and
# rename() is reassigned rather than applied in place.
if "pres_term_idx" in df.columns:
    df["pres_term_idx"] = df["pres_term_idx"].apply(lambda x: x - 1 if pd.notna(x) else x)
    df = df.rename(columns={"pres_term_idx": "pres_term_is_latter_term"})

In [None]:
# Preview the Congress nominations table; a bare last expression uses the notebook's
# rich DataFrame rendering instead of the wrapped plain-text output of print().
dfs["cong_noms"].head()

                                             request  actions_count  \
0  {'congress': '118', 'contentType': 'applicatio...              6   
1  {'congress': '104', 'contentType': 'applicatio...              2   
2  {'congress': '104', 'contentType': 'applicatio...              5   
3  {'congress': '104', 'contentType': 'applicatio...              6   
4  {'congress': '104', 'contentType': 'applicatio...              2   

                                         actions_url authoritydate citation  \
0  https://api.congress.gov/v3/nomination/118/201...    2025-05-12   PN2013   
1  https://api.congress.gov/v3/nomination/104/556...           NaT    PN556   
2  https://api.congress.gov/v3/nomination/104/582...           NaT    PN582   
3  https://api.congress.gov/v3/nomination/104/587...           NaT    PN587   
4  https://api.congress.gov/v3/nomination/104/596...           NaT    PN596   

   committees_count                                     committees_url  \
0               1.0  htt

In [None]:
# Fill in appointing president and party where missing, then normalize party codes.
# (Project helpers; per their log output they fill from the nomination dates.)
df = fill_missing_appointing_presidents(df)

df = fill_missing_party_of_appointing_presidents(df)

df = normalize_party_codes(df, party_columns=['party_of_appointing_president', 'senate_party', 'house_party'])

# Show up to two example nominations per appointing president for manual validation.
print("\nExample nominations from each appointing president:")
# dropna(): any president still missing after the fill step would be NaN, which has no
# .upper() and can never equal itself in the filter below.
for prez in df["appointing_president"].dropna().unique():
    prez_rows = df[df["appointing_president"] == prez]
    # Fixed random_state keeps the printed examples stable across re-runs.
    sample_rows = prez_rows.sample(min(2, len(prez_rows)), random_state=42)

    print(f"\n{prez.upper()} examples:")
    for _, row in sample_rows.iterrows():
        # Format the date nicely
        date_str = row["receiveddate"].strftime("%B %d, %Y") if pd.notna(row["receiveddate"]) else "No date"

        # Print description with date
        print(f"  • [{date_str}] {row['description']}")

[32m2025-07-17 21:08:20.622[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mfill_missing_appointing_presidents[0m:[36m222[0m - [1mFilled 1258 missing appointing president values using nomination dates[0m
[32m2025-07-17 21:08:20.633[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mfill_missing_party_of_appointing_presidents[0m:[36m261[0m - [1mFilled 1258 missing party of appointing president values using nomination dates[0m
[32m2025-07-17 21:08:20.652[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mnormalize_party_codes[0m:[36m389[0m - [1mNormalized 3102 party codes in column 'party_of_appointing_president'[0m

Example nominations from each appointing president:

JOSEPH ROBINETTE BIDEN examples:
  • [April 30, 2024] danna r. jackson, of the district of columbia, to be united states district judge for the district of montana, vice dana l. christensen, retiring.
  • [March 21, 

## Judges' ages

This'll be used if ever asking things like "How much does judge's age affect approval?"

Among the hypotheses is that older judges tend to get approved faster because there's not as much concern they'll live long enough to have as much of a total impact over their time in office.

In [None]:
# Coverage check: what percentage of rows carries each birth-date component? This tells
# us whether day/month precision is available often enough to be worth using.
for birth_part in ("day", "month", "year"):
    pct_present = df[f"birth_{birth_part}"].notna().mean() * 100
    print(f"Merged dataframe has {pct_present}% of rows with birth {birth_part}")

Merged dataframe has 10.9284332688588% of rows with birth day
Merged dataframe has 10.9284332688588% of rows with birth month
Merged dataframe has 74.27466150870407% of rows with birth year


That tells me the FJC didn't offer us enough data yet to get granular down to the day.  At best, for the majority of judges, we can approximate by year.  Let's pretend for simplicity that everyone whose month or day is absent was born exactly in the middle of their birth year.

In [None]:
def _safe_timestamp(year, month=7, day=1):
    """Build a Timestamp from date parts, or return NaT if they do not form a valid date.

    month and day default to July 1st, the mid-year stand-in used when only the year is
    known. Defined in this cell so the cell stays self-contained.
    """
    try:
        return pd.Timestamp(year=int(year), month=int(month), day=int(day))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric parts, impossible dates (e.g. Feb 30) and years outside the
        # Timestamp range (OutOfBoundsDatetime is a ValueError) all become NaT.
        return pd.NaT


# Reference date for "future" check
yesterday = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)

# Stage 1: Create approximate birth date using all available components
df['birth_date_approx_dt'] = pd.NaT

# Case 1: Complete birth date (year, month, day all available)
complete_date_mask = df['birth_year'].notna() & df['birth_month'].notna() & df['birth_day'].notna()
if complete_date_mask.any():
    complete_parts = df.loc[complete_date_mask, ['birth_year', 'birth_month', 'birth_day']]
    # Assign positionally through the boolean mask (not label-by-label), so the result
    # does not depend on the index labels being unique.
    df.loc[complete_date_mask, 'birth_date_approx_dt'] = pd.to_datetime([
        _safe_timestamp(year, month, day)
        for year, month, day in complete_parts.itertuples(index=False, name=None)
    ])

    success_count = df.loc[complete_date_mask, 'birth_date_approx_dt'].notna().sum()
    print(f"Used complete birth dates for {success_count} nominees")

# Case 2: Only birth year usable (use mid-year July 1st). This also picks up rows whose
# complete date above turned out to be invalid.
year_only_mask = df['birth_year'].notna() & df['birth_date_approx_dt'].isna()
if year_only_mask.any():
    df.loc[year_only_mask, 'birth_date_approx_dt'] = pd.to_datetime(
        [_safe_timestamp(year) for year in df.loc[year_only_mask, 'birth_year']]
    )
    print(f"Used mid-year approximation for {year_only_mask.sum()} nominees with only birth year")

# Filter out any future birth dates
future_mask = df['birth_date_approx_dt'] > yesterday
if future_mask.any():
    print(f"Warning: {future_mask.sum()} birth dates were in the future and set to NaT")
    df.loc[future_mask, 'birth_date_approx_dt'] = pd.NaT

Used complete birth dates for 339 nominees
Used mid-year approximation for 1965 nominees with only birth year


In [None]:

# Stage 2: Calculate age at nomination in whole days.
# Subtracting the two datetime columns directly leaves a missing result wherever either
# date is missing; casting the day counts to 'Int64' turns those into <NA> and gives the
# column a true nullable-integer dtype. (Seeding the column with pd.NA and filling it
# through .loc left it as an object column.)
valid_mask = df['birth_date_approx_dt'].notna() & df['receiveddate'].notna()
df['age_at_nom_days'] = (df['receiveddate'] - df['birth_date_approx_dt']).dt.days.astype('Int64')

# Print summary of age calculation
age_count = df['age_at_nom_days'].notna().sum()
print(f"Successfully calculated age for {age_count} nominees ({age_count/len(df):.1%} of dataset)")
if age_count > 0:
    # Also provide age in years for the summary statistics for readability
    # NOTE(review): a recorded run of this cell reported a maximum of ~234 years, which
    # is implausible and suggests some rows carry the wrong birth year — investigate.
    years_stats = df['age_at_nom_days'].dropna() / 365.25
    print(f"Age statistics: min={years_stats.min():.1f} years ({df['age_at_nom_days'].min()} days), "
          f"max={years_stats.max():.1f} years ({df['age_at_nom_days'].max()} days), "
          f"avg={years_stats.mean():.1f} years ({df['age_at_nom_days'].mean():.1f} days)")

Successfully calculated age for 2304 nominees (74.3% of dataset)
Age statistics: min=30.8 years (11255 days), max=234.0 years (85482 days), avg=50.6 years (18498.9 days)


Alternate perspective: The hypothesis of "less scrutiny for nominees who won't be around much longer" gets even more interesting if instead of just considering how many years they've lived through, we had a way of considering nominees who were in remarkably, visibly and/or audibly and/or rumored good or poor health, enough for the Senators voting on them to be able to notice & anticipate their death as more imminent or less imminent.

We don't have health records or cause of death for each judge & nominee, but the next-best thing we can calculate is how soon after getting nominated they were recorded as having died:

In [None]:
def _safe_timestamp(year, month=7, day=1):
    """Build a Timestamp from date parts, or return NaT if they do not form a valid date.

    month and day default to July 1st, the mid-year stand-in used when only the year is
    known. Defined in this cell so the cell stays self-contained.
    """
    try:
        return pd.Timestamp(year=int(year), month=int(month), day=int(day))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric parts, impossible dates (e.g. Feb 30) and years outside the
        # Timestamp range (OutOfBoundsDatetime is a ValueError) all become NaT.
        return pd.NaT


# Build the best-available death date for each nominee.
# NOTE: despite the column name, 'death_date_exact' holds a July 1st approximation for
# rows where only the death year is usable (Case 2 below).
df['death_date_exact'] = pd.NaT

# Case 1: Complete death date (year, month, day all available)
complete_date_mask = df['death_year'].notna() & df['death_month'].notna() & df['death_day'].notna()
if complete_date_mask.any():
    complete_parts = df.loc[complete_date_mask, ['death_year', 'death_month', 'death_day']]
    # Assign positionally through the boolean mask (not label-by-label), so the result
    # does not depend on the index labels being unique.
    df.loc[complete_date_mask, 'death_date_exact'] = pd.to_datetime([
        _safe_timestamp(year, month, day)
        for year, month, day in complete_parts.itertuples(index=False, name=None)
    ])

    success_count = df.loc[complete_date_mask, 'death_date_exact'].notna().sum()
    print(f"Used complete death dates for {success_count} nominees")

# Case 2: Only death year usable (use mid-year July 1st). This also picks up rows whose
# complete date above turned out to be invalid.
year_only_mask = df['death_year'].notna() & df['death_date_exact'].isna()
if year_only_mask.any():
    df.loc[year_only_mask, 'death_date_exact'] = pd.to_datetime(
        [_safe_timestamp(year) for year in df.loc[year_only_mask, 'death_year']]
    )
    print(f"Used mid-year approximation for {year_only_mask.sum()} nominees with only death year")

# Days between nomination and death. The subtraction is missing wherever either date is
# missing; casting to 'Int64' turns those into <NA> and gives the column a true
# nullable-integer dtype instead of object.
valid_mask = df['death_date_exact'].notna() & df['receiveddate'].notna()
df['days_nom_to_deceased'] = (df['death_date_exact'] - df['receiveddate']).dt.days.astype('Int64')

# Print summary of calculation
days_count = df['days_nom_to_deceased'].notna().sum()
print(f"Successfully calculated days from nomination to death for {days_count} nominees ({days_count/len(df):.1%} of dataset)")
if days_count > 0:
    # NOTE(review): a recorded run reported a large negative minimum (death long before
    # nomination), which suggests some rows carry the wrong death date — investigate.
    print(f"Days statistics: min={df['days_nom_to_deceased'].min()}, "
          f"max={df['days_nom_to_deceased'].max()}, "
          f"avg={df['days_nom_to_deceased'].mean():.1f}")

Used complete death dates for 339 nominees
Successfully calculated days from nomination to death for 339 nominees (10.9% of dataset)
Days statistics: min=-66520, max=15897, avg=7335.9


## Replacing judges vs. filling newly-opened seats

In [None]:
# Few enough rows are new seats, and the statutes authorizing them are varied enough,
# that a boolean is more useful than the raw string: True if the string is neither
# missing nor empty, False if it is missing or an empty string.
# bool(x) cannot be used for this: bool(float("nan")) is True, so every missing value
# would be flagged as a new seat. Test for missing and empty explicitly instead.
statute_text = df["statute_authorizing_new_seat"]
df["statute_authorized_new_seat_bool"] = statute_text.notna() & statute_text.astype(str).ne("")

## Seat level

## Unified vs. divided government

Ideally I'd want to use the database behind voteview to get data on how conservative/liberal/etc. the House and Senate were at time of nomination and confirmation.

Voteview has shared an R package from https://github.com/voteview/Rvoteview which looks pretty promising as a way of getting more granular data than just which party controls each half of the legislature.

But their website has been down every time I've checked in the past couple of weeks, so I gave up on that idea so far.

Lacking that, the next best idea I could think of to replace it with would be to populate columns for House & Senate party composition, at least enough to populate booleans in columns such as:

| Field Name | Description | Source |
|------------|-------------|--------|
| `nom_is_unified` | At time of nomination, President's party holds a majority in both the House and the Senate. | Calculated from receiveddate column |
| `nom_is_div_opp_house` | At time of nomination, President's party holds a majority in the Senate, but the opposition controls the House. | Calculated from receiveddate column |
| `nom_is_div_opp_senate` | At time of nomination, President's party holds a majority in the House, but the opposition controls the Senate. | Calculated from receiveddate column |
| `nom_is_fully_div` | At time of nomination, opposition party controls both the House and the Senate. | Calculated from receiveddate column |
| `latestaction_is_unified` | At time of latest action, President's party holds a majority in both the House and the Senate. | Calculated from latestaction_actiondate column |
| `latestaction_is_div_opp_house` | At time of latest action, President's party holds a majority in the Senate, but the opposition controls the House. | Calculated from latestaction_actiondate column |
| `latestaction_is_div_opp_senate` | At time of latest action, President's party holds a majority in the House, but the opposition controls the Senate. | Calculated from latestaction_actiondate column |
| `latestaction_is_fully_div` | At time of latest action, opposition party controls both the House and the Senate. | Calculated from latestaction_actiondate column |

In [None]:
from nomination_predictor.congress_party_utils import add_alignment_flags

# Project helper, called with the president's-party column and the two date columns
# (nomination received, latest action). Presumably adds the nom_is_* and
# latestaction_is_* boolean columns listed in the table above — TODO confirm against
# add_alignment_flags.
df = add_alignment_flags(df, "party_of_appointing_president", "receiveddate", "latestaction_actiondate")

# Re-check for unhelpfully uninformative columns to delete

In [None]:
# Project helper defined outside this section; it prints a report of columns with
# limited unique values and returns the frame with the ones it deems uninformative
# removed (in the recorded run it dropped only 'isprivileged').
df = drop_unhelpfully_uninformative_columns(df)

Columns with limited unique values:
Error analyzing column 'fed_service_sequence': can only convert an array of size 1 to a Python scalar
Error analyzing column 'fed_service_sequence': can only convert an array of size 1 to a Python scalar
  - isprivileged: 1 unique value, 100% populated with 'False' - DROPPING
  - party_of_reappointing_president: 1 unique non-null value 'Republican' (0.0% of rows) - KEEPING
  - reappointing_president: 1 unique non-null value 'George W. Bush' (0.0% of rows) - KEEPING
Dropped column: isprivileged

Dropped 1 columns that were uninformative


# Yet more column drops

By this far downstream the pipeline of notebooks we're close enough to outputting reports or training that there isn't much need to keep certain columns I'd been holding onto for development/diagnostic/traceability purposes.

Let's streamline to ones more relevant to visualizations or model training.

In [64]:
# Development/diagnostic/traceability columns that downstream visualization and model
# training do not need. Each entry carries the reason it is being dropped.
columns_to_drop = [
    "request", # how we got info by API
    "actions_url", # how we could get yet more info by API
    "authoritydate", # The date when the Senate granted authority to the Secretary of the Senate to receive nominations during periods of recess or adjournment.  Useful for a future revision if ever wanting to train/predict based on # of days Senate was in session rather than calendar days, but distracting until then.
    "committees_url", # how we could get yet more info by API
    "description", # if we haven't already processed it enough, now's rather late; almost every value's unique & downstream notebooks would rather have its useful portions broken down into separate columns
    "hearings_count", # too many values missing to use confidently; I think we'd get better hearings data if a future version made use of hearings_url via API
    "hearings_url", # see hearings_count
    "latestaction_text", # for similar reasons to dropping the description text; by now the useful bits of this ought to be broken apart to other columns
    "nominees_0_positiontitle", # already available broken apart into court_type and nomination_to_position_title.  Note seat_id_letters_only is NOT necessarily an equivalent-concept match for this for unconfirmed nominees, for reasons described in earlier sections.
    "nominees_0_url", # how we already got nominee info by API
    "number" , # by now, exact same thing as what we removed the PN prefix to turn "citation" column into, just with a less-intuitive name
    "request_number", # see number
    "congress", # already have congress_num
    "request_congress", # already have congress_num
    "updatedate", # The date of update in Congress.gov. Useful for confirming recency of record-keeping, useless for informing about what happened that is worth having recorded records about
    "executivecalendarnumber", # updated each day the Senate is in session. Like authorityDate, could be useful to a future update/expansion to this project.  But for now too many values missing from data, and even in the future may be better filled in via lookup table retrieved from elsewhere.
    "nominees_0_predecessorname", # likely useful for future versions tracing breadcrumbs back through seat history, but I don't see how I'll utilize it with the visuals or training I'm building now
    "jid", # only useful for fjc's legacy work before they introduced nid
    "birth_month", # too many missing to make use of, especially for more-recent data; am guessing the reason it tends to be filled for older records and absent from newer ones is because judges' privacy started becoming a concern
    "birth_day", # see birth_month
    "death_month", # largely missing; could easily be either for same above-suspected privacy reasons as birth_month, or because records are recent enough many just haven't died yet
    "death_day", # see death_month
    "reappointing_president", # nowhere near enough of these to be useful to model training
    "party_of_reappointing_president", # see reappointing_president
    "recess_appointment_date", # too many missing. Not enough instances to seem useful.
    "ayes/nays", # by now already broken out into separate columns
]

# Drop only the columns that are still present, so the cell can be re-run (or survive an
# upstream cell having already removed a column) without a KeyError. Anything that was
# expected but absent is logged rather than silently ignored, so typos stay visible.
missing_columns = [col for col in columns_to_drop if col not in df.columns]
if missing_columns:
    logger.warning(f"Columns already absent, skipping drop: {missing_columns}")
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Not-yet-implemented analysis ideas:

In [None]:
# Partisan mismatch: 1 if president_party != party__who_appointed_predecessor
# this would require an additional step of linking predecessor name to nid, and way of looking up when the judge with that nid had been in service, possibly even needing to be a date-and-location-aware analysis

#party_map = {47: "R", 46: "D", 45: "R", 44: "D", 43: "R", 42: "D", 41: "R"}  # extend list
#df["pres_party"] = df["receiveddate"].apply(lambda d: party_map.get(president_number(d), None))
#df["partisan_mismatch"] = (
#    (df["pres_party"].notna()) &
#    (df["pres_party"] != df["party_of_appointing_president"])
#)

In [None]:

# skipping this one because I think we'd get more and/or richer info out of it if we had a more-successful linkage of the unconfirmed nomination rows between congress and fjc's data
# #Count prior failed nominations for this seat_id (if column present)

#if "other_nominations_count" not in df.columns and "seat_id" in df.columns:
#    prior_counts = (
#        df.groupby("seat_id").cumcount()  # number seen so far for that seat
#    )
#    df["num_prior_failed_noms"] = prior_counts
#
#display(df.head())
#feature_engineered_df = df.copy()

# Saving interim dataframes

In [None]:
# Write the feature-engineered frame to the interim data directory (index omitted)
feature_engineered_path = INTERIM_DATA_DIR / "feature_engineered.csv"
df.to_csv(feature_engineered_path, index=False)

In [None]:
# Save extracted tables to interim directory.
# The loop variable is deliberately NOT named `df`: that name holds the feature-engineered
# frame, and rebinding it here would leave `df` pointing at the last table in `dfs`, so
# re-running the save cell for feature_engineered.csv would write the wrong table.
for name, table in dfs.items():
    if len(table) > 0:  # Only save non-empty DataFrames
        output_path = INTERIM_DATA_DIR / f"{name}.csv"
        table.to_csv(output_path, index=False)
        print(f"Saved {len(table)} records to {output_path}")

Saved 4022 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_judges.csv
Saved 4720 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_federal_judicial_service.csv
Saved 4022 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_demographics.csv
Saved 8040 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_education.csv
Saved 611 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_other_federal_judicial_service.csv
Saved 828 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_other_nominations_recess.csv
Saved 19003 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_professional_career.csv
Saved 5487 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_nominations.csv
Saved 5517 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_nominees.csv
Saved 3104 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_