# Notebook 1 – Data Cleaning, Feature Engineering, & Entity Resolution
**Project:** Judicial Vacancy → Nomination/Confirmation Pipeline

*Initial draft generated via ChatGPT model o3 on 2025-07-12T02:40:38.399372Z*

In [None]:

import sys
from pathlib import Path

import pandas as pd
from loguru import logger

# Make the directory above the notebook's working directory importable so that
# the `nomination_predictor` package can be imported by the cells below.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))


# Logging: replace loguru's default handler with a single stderr sink
logger.remove()  # drop the default handler before adding our own
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level}</level> | "
        "<cyan>{function}</cyan> - "
        "<level>{message}</level>"
    ),
)

1

## Load dataframes from Raw data folder

Start with loading simpler, non-JSON-containing CSV files

In [None]:
from nomination_predictor.config import INTERIM_DATA_DIR, RAW_DATA_DIR

# Read every FJC export found in the raw-data folder into its own dataframe
def _read_raw_csv(filename):
    """Load one CSV file located directly inside RAW_DATA_DIR."""
    return pd.read_csv(RAW_DATA_DIR / filename)


fjc_judges = _read_raw_csv("judges.csv")
fjc_federal_judicial_service = _read_raw_csv("federal_judicial_service.csv")
fjc_demographics = _read_raw_csv("demographics.csv")
fjc_education = _read_raw_csv("education.csv")
fjc_other_federal_judicial_service = _read_raw_csv("other_federal_judicial_service.csv")
fjc_other_nominations_recess = _read_raw_csv("other_nominations_recess.csv")
fjc_professional_career = _read_raw_csv("professional_career.csv")
# The derived seat timeline is currently not loaded:
#seat_timeline = pd.read_csv(RAW_DATA_DIR / "seat_timeline.csv")

[32m2025-07-15 12:34:01.404[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m103[0m - [1mProject root: /home/wsl2ubuntuuser/nomination_predictor[0m
[32m2025-07-15 12:34:01.406[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m127[0m - [1mConfiguration loaded[0m


### Combine all dataframes into a single dictionary for bulk operations

In [None]:
# Registry of every dataframe in the notebook, keyed by a short descriptive name,
# so that later cells can apply the same operation to all of them in one loop.
# Only the FJC frames are registered here; the Congress frames are added once loaded.
dfs = dict(
    # FJC dataframes
    fjc_judges=fjc_judges,
    fjc_federal_judicial_service=fjc_federal_judicial_service,
    fjc_demographics=fjc_demographics,
    fjc_education=fjc_education,
    fjc_other_federal_judicial_service=fjc_other_federal_judicial_service,
    fjc_other_nominations_recess=fjc_other_nominations_recess,
    fjc_professional_career=fjc_professional_career,
    # seat_timeline=seat_timeline,

    # Congress dataframes (registered by a later cell)
)

### Load & immediately drop duplicated rows from congress API data

If we made the same request multiple times and got the same response (e.g. from a software design oversight, or from pausing & resuming/mashing together downloads made on separate occasions with possible overlap), the raw data will contain duplicated rows.

It's easier to find this kind of duplicate now vs. after flattening.

In [None]:
from nomination_predictor.features import flatten_json_dataframe

# Read the two raw Congress API exports from the raw-data folder
cong_nominations_raw = pd.read_csv(RAW_DATA_DIR / "nominations.csv")
cong_nominees_raw = pd.read_csv(RAW_DATA_DIR / "nominees.csv")

# Register both in the shared dictionary next to the FJC frames
dfs.update(
    cong_nominations=cong_nominations_raw,
    cong_nominees=cong_nominees_raw,
)

In [None]:
# Preview only: nothing is removed from dfs in this cell.
# A row counts as a duplicate when both its "nomination" and "request" values
# repeat those of an earlier row.
dupe_key = ["nomination", "request"]

# 1. Flag every repeat after the first occurrence and pull those rows aside
dupe_mask = dfs["cong_nominations"].duplicated(subset=dupe_key, keep="first")
dupes = dfs["cong_nominations"][dupe_mask].copy()

# 2. Compact summary: the count, then the first 20 flagged rows
#    (drop .head(20) to see all of them)
print(f"Rows flagged as duplicates: {dupes.shape[0]}")
display(dupes.sort_values(dupe_key).head(20))

# Optional: how often each distinct "nomination" value shows up among the flagged rows
dup_counts = dupes["nomination"].value_counts().head(10)
print("\nTop duplicate records:")
display(dup_counts)

Rows flagged as duplicates: 232


Unnamed: 0,nomination,request,retrieval_date,is_full_detail
5011,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '100', 'contentType': 'applicatio...",2025-07-12,True
5008,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '100', 'contentType': 'applicatio...",2025-07-12,True
4780,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '101', 'contentType': 'applicatio...",2025-07-12,True
4549,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '102', 'contentType': 'applicatio...",2025-07-12,True
4550,"{'actions': {'count': 1, 'url': 'https://api.c...","{'congress': '102', 'contentType': 'applicatio...",2025-07-12,True
1730,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '113', 'contentType': 'applicatio...",2025-07-12,True
1731,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '113', 'contentType': 'applicatio...",2025-07-12,True
1201,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '115', 'contentType': 'applicatio...",2025-07-12,True
75,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '118', 'contentType': 'applicatio...",2025-07-12,True
195,"{'actions': {'count': 11, 'url': 'https://api....","{'congress': '118', 'contentType': 'applicatio...",2025-07-12,True



Top duplicate records:


nomination
{'authorityDate': '2025-06-30', 'citation': 'PN379', 'congress': 119, 'description': 'Ademola Adewale-Sadik, of New York, to be United States Director of the African Development Bank for a term of five years, vice Oren E. Whyche-Shaw.', 'nominationType': {'isCivilian': True}, 'number': 379, 'partNumber': '00', 'receivedDate': '2025-06-30', 'updateDate': '2025-07-03T13:07:51Z'}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Then we can remove the duplicates, keeping the one with the most recent retrieval_date on the presumption it's the one most likely to have been corrected for accuracy server-side.

In [None]:
shape_before = dfs["cong_nominations"].shape
print(f"shape before checking for & dropping duplicated congressional nomination records is {shape_before}")

# Order rows from oldest to newest retrieval, so that keeping the *last* member
# of each duplicate group keeps the most recently retrieved copy.
oldest_first = dfs["cong_nominations"].sort_values("retrieval_date")
dfs["cong_nominations"] = oldest_first.drop_duplicates(
    subset=["nomination", "request"],
    keep="last",
)

shape_after = dfs["cong_nominations"].shape
print(f"shape after checking for & dropping duplicated congressional nomination records is {shape_after}")

shape before checking for & dropping duplicated congressional nomination records is (5801, 4)
shape after checking for & dropping duplicated congressional nomination records is (5569, 4)


Same logic for the nominee dataframe; the target column just has a different name.

TODO: if this operation works well, refactor it to features.py taking dataframe and target column names as inputs

In [None]:
# Preview only: nothing is removed from dfs in this cell.
# A row counts as a duplicate when both its "nominee" and "request" values
# repeat those of an earlier row.
dupe_key = ["nominee", "request"]

# 1. Flag every repeat after the first occurrence and pull those rows aside
dupe_mask = dfs["cong_nominees"].duplicated(subset=dupe_key, keep="first")
dupes = dfs["cong_nominees"][dupe_mask].copy()

# 2. Compact summary: the count, then the first 20 flagged rows
#    (drop .head(20) to see all of them)
print(f"Rows flagged as duplicates: {dupes.shape[0]}")
display(dupes.sort_values(dupe_key).head(20))

# Optional: how often each distinct "nominee" value shows up among the flagged rows
dup_counts = dupes["nominee"].value_counts().head(10)
print("\nTop duplicate records:")
display(dup_counts)

Rows flagged as duplicates: 155


Unnamed: 0,nominee,request,retrieval_date
119,"{'nominees': [{'firstName': 'Adam', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
778,"{'nominees': [{'firstName': 'Adam', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
336,"{'nominees': [{'firstName': 'Almo', 'lastName'...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
492,"{'nominees': [{'firstName': 'Andrew', 'lastNam...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1562,"{'nominees': [{'firstName': 'Arthur', 'lastNam...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
111,"{'nominees': [{'firstName': 'Benjamin', 'lastN...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1022,"{'nominees': [{'firstName': 'Bradley', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1106,"{'nominees': [{'firstName': 'Bradley', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
1108,"{'nominees': [{'firstName': 'Brendan', 'lastNa...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13
4914,"{'nominees': [{'firstName': 'Bruce', 'lastName...",{'url': 'https://api.congress.gov/v3/nominatio...,2025-07-13



Top duplicate records:


nominee
{'nominees': [{'firstName': 'Mary', 'lastName': 'Abrecht', 'middleName': 'Ellen', 'ordinal': 1, 'state': 'DC'}], 'pagination': {'count': 1}, 'request': {'congress': '101', 'contentType': 'application/json', 'format': 'json', 'number': '1181-0'}}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   7
{'nominees': [{'firstName': 'Robert Samuel', 'lastName': 'Tignor', 'ordinal': 7, 'state': 'DC'}, {'firstName': 'Emmet G.', 'lastName': 'Sullivan', 'ordinal': 6, 'state': 'DC'}, {'firstName': 'Robert Isaac', 'lastName': 'Richter', 'ordinal': 5, 's

In [None]:
shape_before = dfs["cong_nominees"].shape
print(f"shape before checking for & dropping duplicated congressional nominee records is {shape_before}")

# Order rows from oldest to newest retrieval, so that keeping the *last* member
# of each duplicate group keeps the most recently retrieved copy.
oldest_first = dfs["cong_nominees"].sort_values("retrieval_date")
dfs["cong_nominees"] = oldest_first.drop_duplicates(
    subset=["nominee", "request"],
    keep="last",
)

shape_after = dfs["cong_nominees"].shape
print(f"shape after checking for & dropping duplicated congressional nominee records is {shape_after}")

shape before checking for & dropping duplicated congressional nominee records is (5672, 3)
shape after checking for & dropping duplicated congressional nominee records is (5517, 3)


## Flatten JSON-containing congress DataFrames into tabular form

In [None]:

# Per-table options for flatten_json_dataframe: which column holds the JSON payload
# and how many list items to extract. Only the nominations call sets the
# nested-key separator explicitly; the nominees call relies on the function's default.
nominations_flatten_options = dict(json_col="nomination", max_list_index=10, separator="_")
nominees_flatten_options = dict(json_col="nominee", max_list_index=5)

dfs["cong_nominations"] = flatten_json_dataframe(
    df=dfs["cong_nominations"], **nominations_flatten_options
)
dfs["cong_nominees"] = flatten_json_dataframe(
    df=dfs["cong_nominees"], **nominees_flatten_options
)

[32m2025-07-15 12:34:02.058[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m316[0m - [1mFlattening JSON data from column 'nomination' in 5569 rows[0m
[32m2025-07-15 12:34:05.908[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m342[0m - [1mFlattening complete. Original columns: 4, New columns: 37[0m
[32m2025-07-15 12:34:05.911[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m316[0m - [1mFlattening JSON data from column 'nominee' in 5517 rows[0m
[32m2025-07-15 12:34:07.709[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mflatten_json_dataframe[0m:[36m342[0m - [1mFlattening complete. Original columns: 3, New columns: 34[0m


In [None]:
# One line per registered dataframe: its name followed by its row and column counts
print("Available dataframes:")
for name, df in dfs.items():
    n_rows, n_cols = df.shape
    print(f"- {name}: {n_rows} rows × {n_cols} columns")

Available dataframes:
- fjc_judges: 4022 rows × 201 columns
- fjc_federal_judicial_service: 4720 rows × 30 columns
- fjc_demographics: 4022 rows × 18 columns
- fjc_education: 8040 rows × 6 columns
- fjc_other_federal_judicial_service: 611 rows × 31 columns
- fjc_other_nominations_recess: 828 rows × 4 columns
- fjc_professional_career: 19003 rows × 4 columns
- cong_nominations: 5569 rows × 37 columns
- cong_nominees: 5517 rows × 34 columns


#### Quick peek at all loaded dataframes

In [None]:
# Quick look at each registered dataframe: its shape, then its first five rows.
logger.info("Checking for general shape and first handfuls of rows")
for name, df in dfs.items():
    print(f"{name:<35} → {df.shape}")
    # display() renders the frame as a table; print() emitted wrapped plain text
    # that is hard to read for the wider frames (e.g. fjc_judges has 201 columns).
    display(df.head())

[32m2025-07-15 12:34:07.734[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking for general shape and first handfuls of rows[0m
fjc_judges                          → (4022, 201)
        nid       jid last_name first_name middle_name suffix  birth_month  \
0  13761857  13761857   Abelson       Adam         Ben    NaN          NaN   
1   1393931      3419    Abrams     Ronnie                             NaN   
2   1376976         1   Abruzzo    Matthew          T.                 4.0   
3  13651551  13651551     Abudu      Nancy       Gbana    NaN          NaN   
4   1376981         2   Acheson     Marcus      Wilson                 6.0   

   birth_day birth_year  birth_city  ... degree_(3)  degree_year_(3)  \
0        NaN       1982   Cleveland  ...        NaN              NaN   
1        NaN       1968    New York  ...        NaN              NaN   
2       30.0       1889    Brooklyn  ...        NaN              NaN   
3        NaN       1974  Ale

In [None]:
# Per-column count of missing values for every registered dataframe.
logger.info("Checking for null values")

for name, df in dfs.items():
    # Label each block of counts; without the header the output does not show
    # which dataframe a given run of column names belongs to.
    print(f"\n=== {name} ===")
    print(df.isnull().sum())

[32m2025-07-15 12:34:07.817[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking for null values[0m
nid                                         0
jid                                         0
last_name                                   0
first_name                                  0
middle_name                                35
                                         ... 
school_(5)                               4017
degree_(5)                               4018
degree_year_(5)                          4017
professional_career                         4
other_nominations/recess_appointments    3307
Length: 201, dtype: int64
nid                                     0
sequence                                0
judge_name                              0
court_type                              0
court_name                              0
appointment_title                       0
appointing_president                    0
party_of_appointing_president         

## Data cleaning

### Drop duplicated rows (if any) from FJC data

In [None]:
# FJC tables left out of the (nid, sequence) de-duplication.
# NOTE(review): presumably these tables have no "sequence" column — confirm.
fjc_tables_skipped = ("fjc_judges", "fjc_demographics", "fjc_other_nominations_recess")

for name in list(dfs):
    if not name.startswith("fjc_") or name in fjc_tables_skipped:
        continue
    # Keep the first row seen for each (nid, sequence) pair
    dfs[name] = dfs[name].drop_duplicates(subset=["nid", "sequence"])
    print(f"shape of {name} after checking for & dropping duplicated nid sequenced items is {dfs[name].shape}")

shape of fjc_federal_judicial_service after checking for & dropping duplicated nid sequenced items is (4720, 30)
shape of fjc_education after checking for & dropping duplicated nid sequenced items is (8040, 6)
shape of fjc_other_federal_judicial_service after checking for & dropping duplicated nid sequenced items is (611, 31)
shape of fjc_professional_career after checking for & dropping duplicated nid sequenced items is (19003, 4)


### Dropping completely-empty columns

In [None]:
#for name, df in dfs.items():
#    empty_cols = df.columns[df.isna().all()]
#    print(f"About to drop {len(empty_cols)} completely-empty columns from {name}:")
#    for c in empty_cols:
#        print("  •", c)
#    df.dropna(how='all', axis=1, inplace=True)

### Dropping unhelpfully uninformative columns

I define these as columns that are fully populated with one and the same value, and which therefore wouldn't help modeling.  A column is still kept if most of its values are missing, even when the only value present is the same wherever it's present.

In [None]:
from nomination_predictor.features import \
    drop_unhelpfully_uninformative_columns

# Replace every registered dataframe with the version returned by
# drop_unhelpfully_uninformative_columns (imported from features.py).
for name in list(dfs):
    dfs[name] = drop_unhelpfully_uninformative_columns(dfs[name])

Columns with limited unique values:
  - aba_rating_(4): 1 unique non-null value 'Well Qualified' (0.0% of rows) - KEEPING
  - appointing_president_(5): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointing_president_(6): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointment_title_(4): 1 unique non-null value 'Judge' (0.3% of rows) - KEEPING
  - appointment_title_(5): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - appointment_title_(6): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - ayes/nays_(4): 1 unique non-null value '  ' (0.3% of rows) - KEEPING
  - ayes/nays_(5): 1 unique non-null value '  ' (0.0% of rows) - KEEPING
  - ayes/nays_(6): 1 unique non-null value '  ' (0.0% of rows) - KEEPING


  - commission_date_(5): 1 unique non-null value '1949-02-02' (0.0% of rows) - KEEPING
  - commission_date_(6): 1 unique non-null value '1949-02-02' (0.0% of rows) - KEEPING
  - committee_action_date_(5): 1 unique non-null value '1949-01-27' (0.0% of rows) - KEEPING
  - committee_action_date_(6): 1 unique non-null value '1949-01-27' (0.0% of rows) - KEEPING
  - committee_referral_date_(5): 1 unique non-null value '1949-01-13' (0.0% of rows) - KEEPING
  - committee_referral_date_(6): 1 unique non-null value '1949-01-13' (0.0% of rows) - KEEPING
  - confirmation_date_(5): 1 unique non-null value '1949-01-31' (0.0% of rows) - KEEPING
  - confirmation_date_(6): 1 unique non-null value '1949-01-31' (0.0% of rows) - KEEPING
  - court_name_(5): 1 unique non-null value 'U.S. District Court for the Eastern District of Missouri' (0.0% of rows) - KEEPING
  - court_name_(6): 1 unique non-null value 'U.S. District Court for the Western District of Missouri' (0.0% of rows) - KEEPING
  - court_type_(

### Normalize column names for DataFrames

In [None]:
# Snapshot of every dataframe's column labels before normalization
print("=== Column Names Before ===")

for name, df in dfs.items():
    column_labels = df.columns.to_list()
    print(f"{name:<35} → {column_labels}")

=== Column Names Before ===
fjc_judges                          → ['nid', 'jid', 'last_name', 'first_name', 'middle_name', 'suffix', 'birth_month', 'birth_day', 'birth_year', 'birth_city', 'birth_state', 'death_month', 'death_day', 'death_year', 'death_city', 'death_state', 'gender', 'race_or_ethnicity', 'court_type_(1)', 'court_name_(1)', 'appointment_title_(1)', 'appointing_president_(1)', 'party_of_appointing_president_(1)', 'reappointing_president_(1)', 'party_of_reappointing_president_(1)', 'aba_rating_(1)', 'seat_id_(1)', 'statute_authorizing_new_seat_(1)', 'recess_appointment_date_(1)', 'nomination_date_(1)', 'committee_referral_date_(1)', 'hearing_date_(1)', 'judiciary_committee_action_(1)', 'committee_action_date_(1)', 'senate_vote_type_(1)', 'ayes/nays_(1)', 'confirmation_date_(1)', 'commission_date_(1)', 'service_as_chief_judge,_begin_(1)', 'service_as_chief_judge,_end_(1)', '2nd_service_as_chief_judge,_begin_(1)', '2nd_service_as_chief_judge,_end_(1)', 'senior_status_date_(

In [None]:
# call features.py's normalize_columns function on all DataFrames in dfs, and strip leading and trailing whitespace in all strings
from nomination_predictor.features import normalize_dataframe_columns

def _strip_if_str(value):
    """Trim leading/trailing whitespace from strings; pass any other value through unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value


# For every registered dataframe: normalize the column labels first,
# then strip whitespace from each string cell.
for name in list(dfs):
    relabelled = normalize_dataframe_columns(dfs[name])
    dfs[name] = relabelled.map(_strip_if_str)

In [None]:
# Snapshot of every dataframe's column labels after normalization
print("=== Column Names After ===")

for name, df in dfs.items():
    column_labels = df.columns.to_list()
    print(f"{name:<35} → {column_labels}")

=== Column Names After ===
fjc_judges                          → ['nid', 'jid', 'last_name', 'first_name', 'middle_name', 'suffix', 'birth_month', 'birth_day', 'birth_year', 'birth_city', 'birth_state', 'death_month', 'death_day', 'death_year', 'death_city', 'death_state', 'gender', 'race_or_ethnicity', 'court_type_(1)', 'court_name_(1)', 'appointment_title_(1)', 'appointing_president_(1)', 'party_of_appointing_president_(1)', 'reappointing_president_(1)', 'party_of_reappointing_president_(1)', 'aba_rating_(1)', 'seat_id_(1)', 'statute_authorizing_new_seat_(1)', 'recess_appointment_date_(1)', 'nomination_date_(1)', 'committee_referral_date_(1)', 'hearing_date_(1)', 'judiciary_committee_action_(1)', 'committee_action_date_(1)', 'senate_vote_type_(1)', 'ayes/nays_(1)', 'confirmation_date_(1)', 'commission_date_(1)', 'service_as_chief_judge,_begin_(1)', 'service_as_chief_judge,_end_(1)', '2nd_service_as_chief_judge,_begin_(1)', '2nd_service_as_chief_judge,_end_(1)', 'senior_status_date_(1

### Left-merge nominees table onto nominations table
This will make the cong_noms dataframe we'll use for most of our congress data operations below in this notebook.

In [None]:
from nomination_predictor.features import merge_nominees_onto_nominations

# Fallback value: if the merge below fails, later cells still find a (then empty)
# dfs["cong_noms"] entry. A `try` block does not create a new scope in Python,
# so this line exists purely as the failure fallback.
dfs["cong_noms"] = pd.DataFrame()

try:
    # Combine the flattened nominations and nominees tables into one frame
    dfs["cong_noms"] = merge_nominees_onto_nominations(dfs["cong_nominations"], dfs["cong_nominees"])

    # Report on the merge results
    logger.info(f"Original nominations shape: {dfs['cong_nominations'].shape}")
    logger.info(f"Original nominees shape: {dfs['cong_nominees'].shape}")
    logger.info(f"Merged dataframe shape: {dfs['cong_noms'].shape}")

except Exception as e:
    # Best-effort: keep the notebook running, but log at ERROR level *with* the
    # traceback so that the cause of a failed merge can be diagnosed.
    logger.exception(f"Error in merge process: {e}")

# Show sample of the merged dataframe (empty if the merge failed)
display(dfs["cong_noms"].head())

[32m2025-07-15 12:34:09.055[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m531[0m - [1mExtracted 5517 URLs from nominees request column (100.0% of rows)[0m
[32m2025-07-15 12:34:09.056[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m555[0m - [1mNominations dataframe has 5514 non-null URLs (99.0% of rows)[0m
[32m2025-07-15 12:34:09.072[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m571[0m - [1mMerged dataframe has 5569 rows[0m
[32m2025-07-15 12:34:09.072[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mmerge_nominees_onto_nominations[0m:[36m572[0m - [1mSuccessfully matched 5514 nominations with nominees (99.0%)[0m
[32m2025-07-15 12:34:09.073[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mOriginal nominations shape: (5569, 34)[0m
[3

Unnamed: 0,request,retrieval_date,actions_count,actions_url,citation,committees_count,committees_url,congress,description,latestaction_actiondate,...,nominees_3_ordinal,nominees_3_state,nominees_4_firstname,nominees_4_lastname,nominees_4_ordinal,nominees_4_state,nominees_3_suffix,nominees_1_middlename,nominees_3_middlename,nominees_4_middlename
0,"{'congress': '107', 'contentType': 'applicatio...",2025-07-12,5.0,https://api.congress.gov/v3/nomination/107/189...,PN1891,1.0,https://api.congress.gov/v3/nomination/107/189...,107,"Burton Stallwood, of Rhode Island, to be Unite...",2002-08-01,...,,,,,,,,,,
1,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,2.0,https://api.congress.gov/v3/nomination/104/649...,PN649,1.0,https://api.congress.gov/v3/nomination/104/649...,104,"James William Blagg, of Texas, to be United St...",1996-10-04,...,,,,,,,,,,
2,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/652...,PN652,1.0,https://api.congress.gov/v3/nomination/104/652...,104,"Anthony Cecil Eden Quainton, of the District o...",1995-12-27,...,,,,,,,,,,
3,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/658...,PN658,1.0,https://api.congress.gov/v3/nomination/104/658...,104,"Patricia A. Gaughan, of Ohio, to be United Sta...",1995-12-22,...,,,,,,,,,,
4,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/659...,PN659,1.0,https://api.congress.gov/v3/nomination/104/659...,104,"Joan A. Lenard, of Florida, to be United State...",1995-12-22,...,,,,,,,,,,


### Drop duplicated rows from congressional data

Each presidential nomination has one citation.  The first person they nominate would have citation PN1, the thousandth would have PN1000, etc.
This section exists to check whether we now have multiple rows listing the same citation number and the same date on which the nomination was received from the President, and to deal with them if so.

In [None]:
# commenting out because this no longer seems necessary -- in fact it errors out instead -- ever since having thought to add the cell for dupe-clearing before JSON flattening.

## ------------------------------------------------------------------
## 1. Identify the rows that *would* be dropped
#dupe_mask = dfs["cong_noms"].duplicated(subset=["citation", "receiveddate"], keep="first")
#dupes      = dfs["cong_noms"].loc[dupe_mask].copy()
#
## ------------------------------------------------------------------
## 2. Show a compact summary
#print(f"Rows flagged as duplicates: {len(dupes)}")
#display(
#    dupes.sort_values(["citation", "receiveddate"])
#         .head(20)   # show first 20; remove .head() to see all
#)
#
## Optional: see how many duplicates per citation
#dup_counts = (
#    dfs["cong_noms"]
#      .loc[dupe_mask, "citation"]
#      .value_counts()
#      .head(10)
#)
#print("\nTop duplicate citations:")
#display(dup_counts)
#
#print(f"shape before checking for & dropping duplicated citation items is { dfs["cong_noms"].shape}")
#dfs["cong_noms"] = dfs["cong_noms"].drop_duplicates(subset=["citation", "receiveddate"])
#print(f"shape after checking for & dropping duplicated citation items is { dfs["cong_noms"].shape}")

### Drop rows whose congressional citations end in -0
All of these I've seen either:

- lack strictly-necessary information such as nomination & confirmation dates,
- lack helpful information such as the person's name, or 
- whatever little information they do have indicates it's not for a position as a judge (e.g. for secretary of defense, assistant secretary to something-or-other, etc.)

In [None]:
from nomination_predictor.features import filter_dash_zero_citations

# NOTE(review): these two aliases look like leftover debugging handles onto the
# un-merged Congress frames; nothing in this cell reads them — confirm before removing.
what_the_heck, what_the_hell = dfs["cong_nominations"], dfs["cong_nominees"]

# Replace the merged frame with the result of filter_dash_zero_citations
cong_noms_without_dash_zero = filter_dash_zero_citations(dfs["cong_noms"])
dfs["cong_noms"] = cong_noms_without_dash_zero

[32m2025-07-15 12:34:09.126[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_dash_zero_citations[0m:[36m221[0m - [1mFound 70 citations ending with '-0'[0m
[32m2025-07-15 12:34:09.133[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_dash_zero_citations[0m:[36m227[0m - [1mRemoved 70/5569 records with '-0' citations[0m


### Drop non-judge nominations based on position title

In [None]:
# Filter out non-judicial nominations using the function from features.py
from nomination_predictor.features import filter_non_judicial_nominations

# Title fragments that mark a nomination as non-judicial.
# NOTE(review): how they are matched is decided inside filter_non_judicial_nominations — see features.py.
non_judicial_titles = [
    "Attorney",
    "Board",
    "Commission",
    "Director",
    "Marshal",
    "Assistant",
    "Representative",
    "Secretary of",
    "Member of",
]

cong_noms_judicial_only = filter_non_judicial_nominations(
    dfs["cong_noms"],
    non_judicial_titles=non_judicial_titles,
)
dfs["cong_noms"] = cong_noms_judicial_only

[32m2025-07-15 12:34:09.160[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_non_judicial_nominations[0m:[36m187[0m - [1mFound 1320 unique citations with non-judicial titles[0m
[32m2025-07-15 12:34:09.164[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mfilter_non_judicial_nominations[0m:[36m193[0m - [1mRemoved 4079/5499 corresponding records[0m


### Populate a few new, cleaner columns from straightforwardly-parsable data

In [None]:
from nomination_predictor.name_matching import fill_vacancy_reason_column

# Replace the merged Congress frame with the version returned by fill_vacancy_reason_column
cong_noms_with_vacancy_reason = fill_vacancy_reason_column(dfs["cong_noms"])
dfs["cong_noms"] = cong_noms_with_vacancy_reason

[32m2025-07-15 12:34:09.190[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mfill_vacancy_reason_column[0m:[36m404[0m - [1mExtracted 1420 vacancy reasons from descriptions[0m


In [None]:
# fill missing values from predecessor column
from nomination_predictor.name_matching import fill_predecessor_column

# Replace the merged Congress frame with the version returned by fill_predecessor_column
cong_noms_with_predecessor = fill_predecessor_column(dfs["cong_noms"])
dfs["cong_noms"] = cong_noms_with_predecessor

In [None]:
# Create a new column in dfs["fjc_federal_judicial_service"] titled "fjc_biography_url"
# whose contents are the string "http://www.fjc.gov/node/" followed by the integer read
# from that same row's "nid" column, or an empty string if nid is not an integer.

# Work on a copy so that the frame previously stored in dfs is not mutated in place
svc = dfs["fjc_federal_judicial_service"].copy()

# Non-numeric nids become NaN; only whole-number values count as valid ids
nid_numeric = pd.to_numeric(svc["nid"], errors="coerce")
has_integer_nid = nid_numeric.notna() & (nid_numeric % 1 == 0)

# Default every row to "" (the previous dropna()-then-assign approach left NaN in
# rows without a usable nid, contradicting the documented behavior), then fill
# in the URL only where the nid is a genuine integer.
svc["fjc_biography_url"] = ""
svc.loc[has_integer_nid, "fjc_biography_url"] = (
    "http://www.fjc.gov/node/"
    + nid_numeric[has_integer_nid].astype("int64").astype(str)  # int cast avoids a trailing ".0"
)
dfs["fjc_federal_judicial_service"] = svc

### Convert date strings to datetime objects

In [None]:
# Every object-typed column whose label contains one of these substrings is
# re-parsed as datetime.
datetime_related_keywords = ("date", "year", "month")

for name, df in dfs.items():
    for col in df.columns:
        label_mentions_time = any(keyword in col for keyword in datetime_related_keywords)
        if not label_mentions_time or df[col].dtype != "object":
            continue
        logger.info(f"Converting {col} to datetime for {name}")
        # errors="coerce": values that cannot be parsed become NaT instead of raising
        df[col] = pd.to_datetime(df[col], errors="coerce")

[32m2025-07-15 12:34:09.244[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting birth_year to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.249[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting recess_appointment_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.252[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting nomination_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.259[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting committee_referral_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.262[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting hearing_date_(1) to datetime for fjc_judges[0m


[32m2025-07-15 12:34:09.267[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting committee_action_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.270[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting confirmation_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.272[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting commission_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.274[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting senior_status_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.277[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting termination_date_(1) to datetime for fjc_judges[0m
[32m2025-07-15 12:34:09.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mConverting recess_appointment_date_(

### Normalize several columns' string values to make matching them later easier

In [None]:
from nomination_predictor.name_matching import normalize_text

# Column-name fragments that mark free-text columns (court names, descriptions,
# people's names, ...) whose values get normalized so later matching compares like with like.
keywords_which_denote_string_columns_to_normalize = ("court", "circuit", "district", "description", "name")

for name, df in dfs.items():
    for col in df.columns:
        column_name_matches = any(
            keyword in col.casefold()
            for keyword in keywords_which_denote_string_columns_to_normalize
        )
        # Only object-dtype columns are normalized; everything else is left as-is.
        if not (column_name_matches and df[col].dtype == object):
            continue
        logger.info(f"Normalizing all values within column named {col} in {name}")
        # na_action="ignore" keeps missing cells missing instead of handing NaN to
        # normalize_text. NOTE(review): the final column audit in this notebook reports
        # literal 'nan' strings in court_* columns, which presumably came from NaN being
        # stringified here - confirm against normalize_text.
        df[col] = df[col].map(normalize_text, na_action="ignore")

[32m2025-07-15 12:34:09.509[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named last_name in fjc_judges[0m
[32m2025-07-15 12:34:09.518[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named first_name in fjc_judges[0m
[32m2025-07-15 12:34:09.527[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named middle_name in fjc_judges[0m
[32m2025-07-15 12:34:09.534[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named court_type_(1) in fjc_judges[0m
[32m2025-07-15 12:34:09.542[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNormalizing all values within column named court_name_(1) in fjc_judges[0m
[32m2025-07-15 12:34:44.058[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8

### Count and display columns that contain no values at all:
It looks like our left-merge of the two Congress dataframes left some columns entirely blank, and we still have some that are populated but unhelpful.

In [None]:
# Report every column that holds no non-null values at all (nunique() of zero),
# walking each frame's columns in alphabetical order.
for name, df in dfs.items():
    for col in sorted(df.columns):
        n_unique = df[col].nunique()
        if n_unique >= 1:
            continue
        print(f"{name} - {col}: {n_unique} unique values")

fjc_judges - 2nd_service_as_chief_judge,_begin_(2): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_begin_(3): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_begin_(4): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_begin_(5): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_begin_(6): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_end_(2): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_end_(3): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_end_(4): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_end_(5): 0 unique values
fjc_judges - 2nd_service_as_chief_judge,_end_(6): 0 unique values
fjc_judges - aba_rating_(5): 0 unique values
fjc_judges - aba_rating_(6): 0 unique values
fjc_judges - party_of_reappointing_president_(2): 0 unique values
fjc_judges - party_of_reappointing_president_(3): 0 unique values
fjc_judges - party_of_reappointing_president_(4): 0 unique values
fjc_judges - party_of_reappointing_preside

...so now's an okay time to delete them

In [None]:
# Swap each frame in `dfs` for the version returned by
# drop_unhelpfully_uninformative_columns (defined outside this cell).
# Re-assigning an existing key while iterating is safe: the dict's size never changes.
for name in dfs:
    df = dfs[name] = drop_unhelpfully_uninformative_columns(dfs[name])

Columns with limited unique values:
  - aba_rating_(4): 1 unique non-null value 'Well Qualified' (0.0% of rows) - KEEPING
  - appointing_president_(5): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointing_president_(6): 1 unique non-null value 'Harry S Truman' (0.0% of rows) - KEEPING
  - appointment_title_(4): 1 unique non-null value 'Judge' (0.3% of rows) - KEEPING
  - appointment_title_(5): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - appointment_title_(6): 1 unique non-null value 'Judge' (0.0% of rows) - KEEPING
  - ayes/nays_(4): 1 unique non-null value '' (0.3% of rows) - KEEPING
  - ayes/nays_(5): 1 unique non-null value '' (0.0% of rows) - KEEPING
  - ayes/nays_(6): 1 unique non-null value '' (0.0% of rows) - KEEPING
  - commission_date_(5): 1 unique non-null value '1949-02-02 00:00:00' (0.0% of rows) - KEEPING
  - commission_date_(6): 1 unique non-null value '1949-02-02 00:00:00' (0.0% of rows) - KEEPING
  - committee_action_date_(5)

## Name-matching FJC judges to Congress.gov nominees

### For confirmed judges

#### Supplementing with additional columns to aid matching

In [None]:
# Goal: give dfs["cong_noms"] two helper columns parsed out of each row's "description":
#   * full_name_from_description          - the text before the first ", of " / ", of the "
#   * location_of_origin_from_description - the text between that phrase and ", to be "
# Intended results, for example:
#   "melissa damian, of florida, to be ..."                  -> "melissa damian", "florida"
#   "kirk edward sherriff, of california, to be united..."   -> "kirk edward sherriff", "california"
#   "sherri malloy beatty-arthur, of the district of columbia, for..."
#                                                            -> "sherri malloy beatty-arthur", "district of columbia"
# The parsing itself lives in extract_name_and_location_columns; this cell only
# applies it and reports how many rows were filled.
from nomination_predictor.features import extract_name_and_location_columns

if 'cong_noms' not in dfs:
    logger.error("Error: 'cong_noms' dataframe not found in dfs dictionary.")
else:
    dfs['cong_noms'] = extract_name_and_location_columns(dfs['cong_noms'])
    enriched_noms = dfs['cong_noms']

    # Eyeball the first few rows to verify the extraction
    display(
        enriched_noms[
            ['description', 'full_name_from_description', 'location_of_origin_from_description']
        ].head(10)
    )

    # Fill rate of each derived column
    total_rows = len(enriched_noms)
    for label, derived_col in (
        ("names", 'full_name_from_description'),
        ("locations", 'location_of_origin_from_description'),
    ):
        filled = enriched_noms[derived_col].notna().sum()
        logger.info(f"Extracted {label} for {filled}/{total_rows} records ({filled/total_rows:.1%})")

[32m2025-07-15 12:34:10.269[0m | [1mINFO    [0m | [36mnomination_predictor.features[0m:[36mextract_name_and_location_columns[0m:[36m700[0m - [1mExtracted 1420/1420 (100.0%) names and 1420/1420 (100.0%) locations[0m


Unnamed: 0,description,full_name_from_description,location_of_origin_from_description
7,"joseph h. gale, of virginia, to be a judge of ...",joseph h. gale,virginia
11,"nina gershon, of new york, to be united states...",nina gershon,new york
12,"barbara s. jones, of new york, to be united st...",barbara s. jones,new york
14,"nanette k. laughrey, of missouri, to be united...",nanette k. laughrey,missouri
15,"charles r. stack, of florida, to be united sta...",charles r. stack,florida
19,"jed s. rakoff, of new york, to be united state...",jed s. rakoff,new york
24,"michael r. murphy, of utah, to be united state...",michael r. murphy,uta
26,"bruce w. greer, of florida, to be united state...",bruce w. greer,florida
29,"bruce d. black, of new mexico, to be united st...",bruce d. black,new mexico
35,"hugh lawson, of georgia, to be united states d...",hugh lawson,georgia


[32m2025-07-15 12:34:10.283[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mExtracted names for 1420/1420 records (100.0%)[0m
[32m2025-07-15 12:34:10.284[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mExtracted locations for 1420/1420 records (100.0%)[0m


#### Performing the matching operations

In [None]:
from nomination_predictor.name_matching import perform_exact_name_matching

# Match Congress nominee names (parsed from the description) against FJC judge names.
# The returned frame pairs each congress row index with a candidate FJC nid.
name_matching_inputs = {
    "congress_df": dfs["cong_noms"],
    "fjc_df": dfs["fjc_federal_judicial_service"],
    "congress_name_col": "full_name_from_description",
    "fjc_name_col": "judge_name",
}
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices = perform_exact_name_matching(
    **name_matching_inputs
)

[32m2025-07-15 12:34:10.298[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m123[0m - [1mStarting exact name matching with 1420 Congress records and 4720 FJC records[0m
[32m2025-07-15 12:34:10.664[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m142[0m - [1mPerforming first-pass join on last and first name[0m
[32m2025-07-15 12:34:10.680[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m149[0m - [1mFound 954 total records with last+first name matches[0m
[32m2025-07-15 12:34:10.685[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m177[0m - [1mFound 22 ambiguous matches, attempting middle initial disambiguation[0m
[32m2025-07-15 12:34:10.686[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_na

In [None]:
# Preview the first five candidate matches (rendered as the cell's output)
results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices.head(n=5)

Unnamed: 0,congress_index,congress_name,fjc_name,nid,match_type,ambiguous
0,11,nina gershon,"gershon, nina",1381126.0,first_and_last_name,False
1,12,barbara s. jones,"jones, barbara s.",1382891.0,first_and_last_name,False
2,14,nanette k. laughrey,"laughrey, nanette kay",1383666.0,first_and_last_name,False
3,19,jed s. rakoff,"rakoff, jed saul",1386686.0,first_and_last_name,False
4,24,michael r. murphy,"murphy, michael r.",1385541.0,first_and_last_name,False


In [None]:
# Build a congress_index -> nid lookup from the pairs that are NOT flagged ambiguous.
nid_map = (
    results_of_name_matching_to_bridge_nids_to_congress_dataframe_indices
      .loc[lambda matches: ~matches["ambiguous"], ["congress_index", "nid"]]
      .set_index("congress_index")["nid"]
)

In [None]:
# nid_map is the bridge between the congress.gov data and the FJC data:
# look up each congress row's index label in it to fill a new "nid" column.
# Rows without an unambiguous match get NaN.
cong_noms = dfs["cong_noms"]
cong_noms["nid"] = cong_noms.index.to_series().map(nid_map)

### For unconfirmed judges

In practice, with the dataframes as they stand at the time of writing, this section doesn't find any remaining unconfirmed judges to match.

What this section _did_ accomplish was showing me that the presence of diacritical marks such as "ñ" or "é" in names was misleading the matching process.

Discovering and addressing that in much-earlier data-normalizing cells led to getting more matches in our confirmed-judges-matching notebook section.

#### Supplementing with additional columns to aid matching

In [None]:
from nomination_predictor.name_matching import prep_fjc_other

# Replace the "other nominations / recess" frame with the version returned by prep_fjc_other.
other_nominations_key = "fjc_other_nominations_recess"
dfs[other_nominations_key] = prep_fjc_other(fjc_other_df=dfs[other_nominations_key])

In [None]:
# Convenience alias (same object, not a copy) for inspecting the prepared frame
fjc_other_supplemented = dfs["fjc_other_nominations_recess"]

#### Inspecting for the unconfirmed nominee matching possibilities

In [None]:
from nomination_predictor.features import link_unconfirmed_nominations

# Try to link still-unmatched Congress nominations to the FJC "other nominations" records;
# the result replaces dfs["cong_noms"].
dfs["cong_noms"] = link_unconfirmed_nominations(
    dfs["cong_noms"],
    dfs["fjc_other_nominations_recess"],
)

[32m2025-07-15 12:34:11.105[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m123[0m - [1mStarting exact name matching with 252 Congress records and 828 FJC records[0m
[32m2025-07-15 12:34:11.160[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m142[0m - [1mPerforming first-pass join on last and first name[0m
[32m2025-07-15 12:34:11.169[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m149[0m - [1mFound 0 total records with last+first name matches[0m
[32m2025-07-15 12:34:11.169[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperform_exact_name_matching[0m:[36m153[0m - [1mNO last+first name matches found. Checking last-name-only matches for diagnosis...[0m
[32m2025-07-15 12:34:11.174[0m | [1mINFO    [0m | [36mnomination_predictor.name_matching[0m:[36mperfor

#### Deciding not to merge the "Other Nominations" dataframe (at least not yet)
In theory this could get us the fjc's perspective on more nominees who didn't get confirmed.  

In practice my runs of name-matching didn't find any unambiguous matches from this dataframe to the congress one.  

So I put the idea of merging it, too, on hold.  Can try another day, maybe after seeing whether the additional data would help, or if ever discovering something major could be fixed/improved about the name matcher.

## Combining the rest of the FJC data now that our congress dataframe has been enriched with FJC nid

### Handling nominees' education and job history

Before we combine FJC data, we have to consider whether/how to handle judges' education, job history, age, ABA rating, etc.  Most/all of the data in the "demographics" dataframe is unchanging over time, but that's very much _not_ true of the other dataframes.

The simplest way to handle it would be to left-merge on "nid" and only take the most recently-dated row, or row with the highest sequence number.  In most cases this would likely land on keeping the most prestigious degree or job.

However, it is entirely likely a judge's education or job history has changed substantially since their first nomination, and affected their qualifications for each later nomination.

All of these indicate to me that it's worth merging onto each row that judge's position, education, etc., not as of the most recent records available, but instead _as of when they were nominated._

That means we can't do an overly simple left-join of all of our FJC data.  Instead, now that we've matched NIDs to Congress' nomination data, we can use the "received date" of each Congress citation as a cutoff date when we look up education and job records by "nid" -- so we avoid mistakenly linking to a citation any education or job records dated after that cutoff date.

Thankfully we do have the school, degree, and degree_year in the education record, for both their bachelors and their masters and their associate degree(s) and LLB and J.D. etc., so we can look that up.  The education dataframe even comes with a "sequence" number for each education record, which is another indicator of chronological order in addition to degree_year for any given "nid" lookup for a judge.

Job history is more challenging to deal with because literally every row entry in that dataframe lists it uniquely, but we do have the data available.  My earliest attempts to feature-engineer with it include looking for keywords in it, then creating boolean features for whether they did/didn't have experience in common-phrase-identifiable positions such as "Private practice" or "Attorney general" or "Navy" or "Army" etc. Theoretically a parser can look for the year spreads listed there as a rough indicator of amounts of experience gleaned from each professional role & when, but that may be too complicated for me to accomplish by the time I'm first presenting this work.

In [None]:

import pandas as pd

from nomination_predictor.time_aware_analysis import merge_congress_fjc

# Time-aware merge: the Congress nominations frame goes first, followed by the FJC
# frames in the positional order merge_congress_fjc is called with.
fjc_frame_keys = (
    "fjc_judges",
    "fjc_demographics",
    "fjc_education",
    "fjc_professional_career",
    "fjc_federal_judicial_service",
)
time_aware_merged_df = merge_congress_fjc(
    dfs["cong_noms"],
    *(dfs[frame_key] for frame_key in fjc_frame_keys),
)
display(time_aware_merged_df.head(5))
logger.info("Time-aware merge completed.  New dataframe shown above should include both congress' nomination data and FJC's demographics & education & professional & other federal service records (e.g. columns degree_(1), degree_(2), etc.)")

  merged[col] = merged[col].fillna(False)


Unnamed: 0,request,retrieval_date,actions_count,actions_url,citation,committees_count,committees_url,congress,description,latestaction_actiondate,...,degree_(4),degree_year_(4),school_(5),degree_(5),degree_year_(5),professional_career,other_nominations/recess_appointments,jid_fjcdemo,gender_fjcdemo,race_or_ethnicity_fjcdemo
0,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/666...,PN666,1.0,https://api.congress.gov/v3/nomination/104/666...,104,"joseph h. gale, of virginia, to be a judge of ...",1995-12-22,...,,,,,,,,,,
1,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/692...,PN692,1.0,https://api.congress.gov/v3/nomination/104/692...,104,"nina gershon, of new york, to be united states...",1996-07-30,...,,,,,,"Staff attorney, Mental Health Information Serv...",,836.0,Female,White
2,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/693...,PN693,1.0,https://api.congress.gov/v3/nomination/104/693...,104,"barbara s. jones, of new york, to be united st...",1995-12-22,...,,,,,,"Special attorney, Organized Crime and Racketee...",,1192.0,Female,White
3,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,6.0,https://api.congress.gov/v3/nomination/104/696...,PN696,1.0,https://api.congress.gov/v3/nomination/104/696...,104,"nanette k. laughrey, of missouri, to be united...",1996-07-24,...,,,,,,"Assistant attorney general, State of Missouri,...",,1349.0,Female,White
4,"{'congress': '104', 'contentType': 'applicatio...",2025-07-12,3.0,https://api.congress.gov/v3/nomination/104/698...,PN698,1.0,https://api.congress.gov/v3/nomination/104/698...,104,"charles r. stack, of florida, to be united sta...",1996-05-13,...,,,,,,,,,,


[32m2025-07-15 12:34:12.684[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mTime-aware merge completed.  New dataframe shown above should include both congress' nomination data and FJC's demographics & education & professional & other federal service records (e.g. columns degree_(1), degree_(2), etc.)[0m


### Condensing career history columns

I want to get rid of a bunch of columns that are empty for most rows, yet very relevant for the few rows where they're still populated.

Before we get rid of these largely-empty columns, gather the durations of roles in case we want to visualize things like typical role durations for a given seat type, whether roles tend to get longer vs. briefer with age, whether roles last longer in some court types or circuits than others, etc.

For each row, sum the durations between confirmation date and termination date for each federal-service job they've held so far, put that total duration into a new "days_federal_service_experience" column, and put the number of jobs they held into a "prior_fed_roles_count" column.

Then finally delete the no-longer-needed largely-empty columns we condensed.

In [None]:
from nomination_predictor.career_history_calculator import \
    add_federal_service_features

# Condense the wide, mostly-empty federal-service columns into summary features.
# NOTE(review): this cell overwrites its own input, so re-running it without first
# re-running the merge cell feeds it an already-condensed frame - confirm that is safe.
time_aware_merged_df = add_federal_service_features(
    time_aware_merged_df,           # the big frame with both fjc and congress data
    received_col="receiveddate",
    drop_original=True       # drop >95 % empty wide columns
)
# Rich display (as the merge cell does) instead of print(): a printed wide frame
# wraps into hard-to-read text chunks.
display(time_aware_merged_df.head())

# FIXME: confirm whether or not this cell works as intended

                                             request retrieval_date  \
0  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
1  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
2  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
3  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
4  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   

   actions_count                                        actions_url citation  \
0            6.0  https://api.congress.gov/v3/nomination/104/666...    PN666   
1            6.0  https://api.congress.gov/v3/nomination/104/692...    PN692   
2            6.0  https://api.congress.gov/v3/nomination/104/693...    PN693   
3            6.0  https://api.congress.gov/v3/nomination/104/696...    PN696   
4            3.0  https://api.congress.gov/v3/nomination/104/698...    PN698   

   committees_count                                     committees_url  \
0               1.

### Feature engineering 

In [None]:
from datetime import date

from nomination_predictor.time_aware_analysis import (
    congress_number, congress_session, days_into_current_term,
    days_until_next_midterm_election, days_until_next_presidential_election,
    fill_missing_appointing_presidents,
    fill_missing_party_of_appointing_presidents, normalize_party_codes,
    presidential_term_index)

# Work on an independent copy of the merged frame, with receiveddate parsed to datetime.
df = time_aware_merged_df.copy().assign(
    receiveddate=lambda frame: pd.to_datetime(frame["receiveddate"])   # ensure datetime
)

#### Primary target variable to train a model to predict:
number of days between receiveddate and latestaction_actiondate

In [None]:
print("Calculating days from nomination to latest action...")

# Parse both endpoints locally so the subtraction works even if either column is not
# yet datetime; values that cannot be parsed become NaT and yield a missing duration.
received_dates = pd.to_datetime(df['receiveddate'], errors="coerce")
latest_action_dates = pd.to_datetime(df['latestaction_actiondate'], errors="coerce")

# Vectorized difference in whole days. Rows missing either date come out as <NA>.
# Nullable Int64 keeps the target numeric; seeding the column with pd.NA and filling
# it row-by-row would leave it as object dtype.
df['days_nom_to_latest_action'] = (
    (latest_action_dates - received_dates).dt.days.astype("Int64")
)

# Flag negative durations (latest action dated before the received date).
# <NA> comparisons are treated as "not negative".
negative_days_mask = (df['days_nom_to_latest_action'] < 0).fillna(False).astype(bool)
if negative_days_mask.any():
    logger.warning(f"{negative_days_mask.sum()} rows have negative duration (latest action before received date)")
    logger.warning("Sample of problematic rows:")
    display(df.loc[negative_days_mask, ['receiveddate', 'latestaction_actiondate', 'days_nom_to_latest_action']].head(3))

    # You can decide to either keep negative values or set them to NA
    # df.loc[negative_days_mask, 'days_nom_to_latest_action'] = pd.NA  # Uncomment to remove negative values

# Print summary statistics
days_count = int(df['days_nom_to_latest_action'].notna().sum())
total_rows = len(df)
coverage = days_count / total_rows if total_rows else 0.0  # avoid dividing by zero on an empty frame
logger.info(f"Successfully calculated duration for {days_count} nominees ({coverage:.1%} of dataset)")

if days_count > 0:
    logger.info("Duration statistics (days):")
    logger.info(f"- Min: {df['days_nom_to_latest_action'].min()} days")
    logger.info(f"- Max: {df['days_nom_to_latest_action'].max()} days")
    logger.info(f"- Mean: {df['days_nom_to_latest_action'].mean():.1f} days")
    logger.info(f"- Median: {df['days_nom_to_latest_action'].median():.1f} days")

Calculating days from nomination to latest action...
[32m2025-07-15 12:34:13.039[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mSuccessfully calculated duration for 1417 nominees (99.8% of dataset)[0m
[32m2025-07-15 12:34:13.039[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mDuration statistics (days):[0m
[32m2025-07-15 12:34:13.040[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1m- Min: 0 days[0m
[32m2025-07-15 12:34:13.042[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1m- Max: 727 days[0m
[32m2025-07-15 12:34:13.043[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m- Mean: 146.2 days[0m
[32m2025-07-15 12:34:13.044[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1m- Median: 118.0 days[0m


In [None]:
from nomination_predictor.latestaction_parser import enrich_latest_action

# Replace the Congress nominations frame with the version returned by enrich_latest_action.
# NOTE(review): this updates dfs["cong_noms"], not the working frame `df` that was
# derived from it earlier - confirm the enrichment is meant to stay out of `df`.
cong_noms_key = "cong_noms"
dfs[cong_noms_key] = enrich_latest_action(dfs[cong_noms_key])

In [None]:
# presidency- and elections-timeline-related
# Each new column is produced by applying one helper to every row's receiveddate;
# the dict order fixes the order in which the columns are added.
timeline_feature_builders = {
    "pres_term_idx": presidential_term_index,
    "days_into_pres_term": days_into_current_term,
    "days_to_next_pres_election": days_until_next_presidential_election,
    "days_to_next_midterm_election": days_until_next_midterm_election,
    "congress_num": congress_number,
    "congress_session": congress_session,
}
for feature_name, build_feature in timeline_feature_builders.items():
    df[feature_name] = df["receiveddate"].apply(build_feature)

In [None]:
# Rich display instead of print(): a printed wide frame wraps into hard-to-read text
# chunks, and other cells in this notebook already use display() for previews.
display(dfs["cong_noms"].head())

                                              request retrieval_date  \
7   {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
11  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
12  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
14  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   
15  {'congress': '104', 'contentType': 'applicatio...     2025-07-12   

    actions_count                                        actions_url citation  \
7             6.0  https://api.congress.gov/v3/nomination/104/666...    PN666   
11            6.0  https://api.congress.gov/v3/nomination/104/692...    PN692   
12            6.0  https://api.congress.gov/v3/nomination/104/693...    PN693   
14            6.0  https://api.congress.gov/v3/nomination/104/696...    PN696   
15            3.0  https://api.congress.gov/v3/nomination/104/698...    PN698   

    committees_count                                     committees_url  \
7    

In [None]:
# Fill gaps in the appointing-president columns, then standardize the party codes.
df = fill_missing_appointing_presidents(df)

df = fill_missing_party_of_appointing_presidents(df)

df = normalize_party_codes(df, party_columns=['party_of_appointing_president_(1)', 'senate_party', 'house_party'])

# Show a few example nominations for each appointing president for validation.
# NOTE(review): the captured output lists the same president under two spellings
# ("william jefferson clinton" / "william j. clinton") - consider unifying names upstream.
EXAMPLES_PER_PRESIDENT = 2
EXAMPLE_SAMPLE_SEED = 42  # fixed seed so Restart & Run All shows the same examples

print("\nExample nominations from each appointing president:")
# groupby skips rows whose president is still missing (calling .upper() on NaN would
# raise), and sort=False keeps presidents in order of first appearance.
for prez, prez_rows in df.groupby("appointing_president_(1)", sort=False):
    # Sample data - rows carry both description and receiveddate
    sample_rows = prez_rows.sample(
        min(EXAMPLES_PER_PRESIDENT, len(prez_rows)),
        random_state=EXAMPLE_SAMPLE_SEED,
    )

    print(f"\n{str(prez).upper()} examples:")
    for _, row in sample_rows.iterrows():
        # Format the date nicely
        date_str = row["receiveddate"].strftime("%B %d, %Y") if pd.notna(row["receiveddate"]) else "No date"

        # Print description with date
        print(f"  • [{date_str}] {row['description']}")

[32m2025-07-15 12:34:13.232[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mfill_missing_appointing_presidents[0m:[36m392[0m - [1mFilled 493 missing appointing president values using nomination dates[0m


[32m2025-07-15 12:34:13.245[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mfill_missing_party_of_appointing_presidents[0m:[36m426[0m - [1mFilled 493 missing party of appointing president values using nomination dates[0m
[32m2025-07-15 12:34:13.268[0m | [1mINFO    [0m | [36mnomination_predictor.time_aware_analysis[0m:[36mnormalize_party_codes[0m:[36m552[0m - [1mNormalized 1420 party codes in column 'party_of_appointing_president_(1)'[0m

Example nominations from each appointing president:

WILLIAM JEFFERSON CLINTON examples:
  • [January 06, 1999] stephen h. glickman, of the district of columbia, to be an associate judge of the district of columbia court of appeals for the term of fifteen years, vice john maxwell ferren, term expired.
  • [September 24, 1993] cassandra m. pulley, of the district of columbia, to be deputy administrator of the small business administration, vice paul h. cooksey, resigned.

WILLIAM J. CLINTON examples:
  •

### How much does a judge's age affect approval?

One hypothesis is that older judges tend to get approved faster, because there's less concern that they'll serve long enough to have a large cumulative impact during their time in office.

In [None]:
# do we even have enough birthdays to get statistical significance or help model training?
# Print the share of rows where each birth-date component is populated.
for birth_col in ("birth_day", "birth_month", "birth_year"):
    populated_pct = df[birth_col].notna().mean()*100
    print(f"Merged dataframe has {populated_pct}% of rows with {birth_col.replace('_', ' ')}")

Merged dataframe has 8.23943661971831% of rows with birth day
Merged dataframe has 8.23943661971831% of rows with birth month
Merged dataframe has 65.28169014084507% of rows with birth year


That tells me the FJC didn't offer us enough data yet to get granular down to the day.  At best we can approximate by year.  Let's pretend for simplicity that everyone was born exactly in the middle of their birth year.

In [None]:
# Age at nomination - using only birth year

# Plausibility window for a nominee's age in years. Ages outside it (the captured output
# shows a maximum of 230.1) cannot be real and most likely mean the nomination was
# name-matched to the wrong FJC judge, so they are blanked rather than kept as a feature.
MIN_PLAUSIBLE_AGE_AT_NOM = 18
MAX_PLAUSIBLE_AGE_AT_NOM = 100

# Reference date for "future" check
yesterday = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)

# Parse locally so the cell also works if a merge left these columns as object dtype;
# unparseable or missing values become NaT.
birth_year_ts = pd.to_datetime(df['birth_year'], errors="coerce")
received_ts = pd.to_datetime(df['receiveddate'], errors="coerce")

# Only the birth year is widely populated, so pin every birthday to July 1st (mid-year).
# strftime yields NaN for NaT, which parses back to NaT.
birth_date = pd.to_datetime(
    birth_year_ts.dt.strftime("%Y-07-01"), format="%Y-%m-%d", errors="coerce"
)

# Filter out any future dates
future_mask = birth_date > yesterday
if future_mask.any():
    print(f"Warning: {future_mask.sum()} birth dates were in the future and set to NaT")
    birth_date = birth_date.mask(future_mask)
df['birth_date'] = birth_date

# Age in decimal years. Stays float (NaN where either date is missing) instead of an
# object column seeded with pd.NA.
age_at_nom = ((received_ts - birth_date).dt.days / 365.25).round(1)

implausible_mask = age_at_nom.notna() & ~age_at_nom.between(
    MIN_PLAUSIBLE_AGE_AT_NOM, MAX_PLAUSIBLE_AGE_AT_NOM
)
if implausible_mask.any():
    print(
        f"Warning: {implausible_mask.sum()} ages fell outside "
        f"{MIN_PLAUSIBLE_AGE_AT_NOM}-{MAX_PLAUSIBLE_AGE_AT_NOM} years and were set to NaN "
        "(check these rows for a wrong nominee-to-judge match)"
    )
    age_at_nom = age_at_nom.mask(implausible_mask)
df['age_at_nom'] = age_at_nom

# Print summary of age calculation
age_count = int(df['age_at_nom'].notna().sum())
age_coverage = age_count / len(df) if len(df) else 0.0  # avoid dividing by zero on an empty frame
print(f"Successfully calculated age for {age_count} nominees ({age_coverage:.1%} of dataset)")
if age_count > 0:
    print(f"Age statistics: min={df['age_at_nom'].min()}, max={df['age_at_nom'].max()}, avg={df['age_at_nom'].mean():.1f}")

Successfully calculated age for 927 nominees (65.3% of dataset)
Age statistics: min=32.8, max=230.1, avg=50.7


### Creating categorical variable for seat level

In [None]:
# identify seat level
# We do have court_type_(1), court_type_(2), etc. columns, but for this coarse an analysis
# it's simpler and accurate-enough to look through the nomination description.
#
# The keyword search keeps the FIRST match, so the nominee's home location must be kept
# out of the searched text: "..., of the district of columbia, to be united states circuit
# judge ..." would otherwise be labelled "district". Two clean-up steps handle that:
#   1. drop everything up to the first ", to be " (name + home location), when present;
#   2. blank out the place name "district of columbia" wherever it still appears
#      (e.g. "district of columbia court of appeals").
SEAT_LEVEL_PATTERN = r"(supreme|circuit|district|(?<=\s)tax|international|appeals)" # insists on whitespace before "tax" so we can tell "tax" or "taxation" etc. are a standalone word

position_text = (
    df["description"]
      .str.lower()
      .str.replace(r"^.*?,\s*to be\s+", " ", n=1, regex=True)   # leading space keeps (?<=\s) usable
      .str.replace("district of columbia", " ", regex=False)
)

df["seat_level"] = (
    position_text
      .str.extract(SEAT_LEVEL_PATTERN, expand=False)  # expand=False -> a Series, not a 1-column frame
      .fillna("other")
)

In [None]:
# Tally how many nominations fall into each seat level, most common first.
seat_counts = df["seat_level"].value_counts()
total_records = len(df)
print(f"\nDistribution of seat levels (total {total_records} records):")
for seat, count in seat_counts.items():
    print(f"- {seat}: {count} ({count/total_records:.1%})")

# Show some examples for each seat level for validation
print("\nExample descriptions for each seat level:")
for seat_type in df["seat_level"].unique():
    descriptions_for_seat = df.loc[df["seat_level"] == seat_type, "description"]
    # Up to two randomly chosen descriptions per level
    examples = descriptions_for_seat.sample(min(2, len(descriptions_for_seat)))
    print(f"\n{seat_type.upper()} examples:")
    for ex in examples:
        print(f"  • {ex}")


Distribution of seat levels (total 1420 records):
- district: 1083 (76.3%)
- circuit: 231 (16.3%)
- other: 41 (2.9%)
- tax: 32 (2.3%)
- appeals: 18 (1.3%)
- international: 13 (0.9%)
- supreme: 2 (0.1%)

Example descriptions for each seat level:

TAX examples:
  • glen l. bower, of illinois, to be a judge of the united states tax court for a term of fifteen years after he takes office, vice carolyn miller parr, term expired.
  • patrick j. urda, of indiana, to be a judge of the united states tax court for a term of fifteen years, vice diane l. kroupa, retired.

DISTRICT examples:
  • malcolm j. howard, of north carolina, to be united states district judge for the eastern district of north carolina vice a new position created by p.l. 98-353, approved july 10, 1984.
  • timothy b. dyk, of the district of columbia, to be united states circuit judge for the federal circuit, vice glenn l. archer, jr., retired.

CIRCUIT examples:
  • fred i. parker, of vermont, to be united states circuit ju

## Unified vs. divided government

Ideally I'd want to use the database behind voteview to get data on how conservative/liberal/etc. the House and Senate were at time of nomination and confirmation.
They have an R package from https://github.com/voteview/Rvoteview which looks pretty promising as a way of getting more granular data than just which party controls each half of the legislature.
But their website has been down every time I've checked in the past week, so I gave up on that idea so far.

Lacking that, the next-best idea I could think of was to populate columns for Senate & House party composition, at least enough to populate booleans in columns such as:


| Field Name | Data Type | Description | Example | Source |
|------------|-----------|-------------|----------|---------|
| `nom_is_unified` | Boolean | At time of nomination, President's party holds a majority in both the House and the Senate. | false | Calculated from receiveddate column |
| `nom_is_div_opp_house` | Boolean | At time of nomination, President's party holds a majority in the Senate, but the opposition controls the House. | false | Calculated from receiveddate column |
| `nom_is_div_opp_senate` | Boolean | At time of nomination, President's party holds a majority in the House, but the opposition controls the Senate. | true | Calculated from receiveddate column |
| `nom_is_fully_div` | Boolean | At time of nomination, opposition party controls both the House and the Senate. | false | Calculated from receiveddate column |
| `latestaction_is_unified` | Boolean | At time of latest action, President's party holds a majority in both the House and the Senate. | false | Calculated from latestaction_actiondate column |
| `latestaction_is_div_opp_house` | Boolean | At time of latest action, President's party holds a majority in the Senate, but the opposition controls the House. | false | Calculated from latestaction_actiondate column |
| `latestaction_is_div_opp_senate` | Boolean | At time of latest action, President's party holds a majority in the House, but the opposition controls the Senate. | true | Calculated from latestaction_actiondate column |
| `latestaction_is_fully_div` | Boolean | At time of latest action, opposition party controls both the House and the Senate. | false | Calculated from latestaction_actiondate column |

In [None]:
from nomination_predictor.congress_party_utils import add_alignment_flags

# Columns handed to add_alignment_flags: the appointing president's party and the
# two date columns (nomination received, latest action).
# NOTE(review): presumably this adds the nom_is_* / latestaction_is_* booleans
# listed in the table above — confirm against congress_party_utils.
president_party_col = "party_of_appointing_president_(1)"
nomination_date_col = "receiveddate"
latest_action_date_col = "latestaction_actiondate"

df = add_alignment_flags(
    df,
    president_party_col,
    nomination_date_col,
    latest_action_date_col,
)

## Last check for unhelpfully uninformative columns to delete

In [None]:
# Report single-valued columns, then drop the ones that carry no information.
#
# - "Constant" column: every row is populated and all rows share one value.
#   These are dropped.
# - Single value plus missing rows: reported but KEPT, because the
#   present-vs-missing pattern may still be informative.
#
# Each column is classified once and the result is reused for both the report and
# the drop. Requiring exactly one distinct value (rather than "fewer than two")
# means an empty frame is left untouched instead of failing on `.iloc[0]`.
#
# NOTE(review): the saved output lists several court_* columns whose only
# "non-null" value prints as 'nan' — presumably literal "nan" strings created
# upstream; confirm and convert them to real missing values if so.
dropped_cols = []

print("Columns with limited unique values:")
for col in sorted(df.columns):
    series = df[col]
    if series.nunique() != 1:  # nunique() ignores missing values
        continue
    if series.notna().all():
        print(f"  - {col}: 1 unique value(s), 100% populated with '{series.iloc[0]}'")
        dropped_cols.append(col)
    else:
        non_null_pct = series.notna().mean() * 100
        print(f"  - {col}: 1 unique non-null value '{series.dropna().iloc[0]}' ({non_null_pct:.1f}% of rows) - KEEPING")

# Drop only columns that are fully populated with the same value.
# The message no longer interpolates `name`: this cell never defines it, so it was
# a stale leftover from an unrelated loop (or a NameError on a fresh kernel).
for col in dropped_cols:
    print(f"Dropping {col}: has 1 unique value(s) and 0 missing values")

# Single in-place drop keeps the original mutate-in-place semantics for `df`.
df.drop(columns=dropped_cols, inplace=True)

print(f"\nDropped {len(dropped_cols)} columns that were fully populated with the same value")

Columns with limited unique values:
  - aba_rating_(3): 1 unique non-null value 'Well Qualified' (0.3% of rows) - KEEPING
  - court_name_(4): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - court_name_(5): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - court_name_(6): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - court_type_(4): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - court_type_(5): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - court_type_(6): 1 unique non-null value 'nan' (65.3% of rows) - KEEPING
  - degree_5: 1 unique non-null value 'Certificate in Regulatory Economics' (0.1% of rows) - KEEPING
  - degree_year_5: 1 unique non-null value '1985.0' (0.1% of rows) - KEEPING
  - isprivileged: 1 unique non-null value 'True' (0.4% of rows) - KEEPING
  - judiciary_committee_action_(2): 1 unique non-null value 'Reported (favorably)' (6.3% of rows) - KEEPING
  - nominees_0_nomineecount: 1 unique non-null value '1.0' (99

## Not-yet-implemented analysis ideas

In [None]:
# Partisan mismatch: 1 if president_party != party__who_appointed_predecessor
# This would require an additional step of linking the predecessor's name to an nid, plus a way of looking up when the judge with that nid was in service; it may even need to be a date-and-location-aware analysis.

#party_map = {47: "R", 46: "D", 45: "R", 44: "D", 43: "R", 42: "D", 41: "R"}  # extend list
#df["pres_party"] = df["receiveddate"].apply(lambda d: party_map.get(president_number(d), None))
#df["partisan_mismatch"] = (
#    (df["pres_party"].notna()) &
#    (df["pres_party"] != df["party_of_appointing_president"])
#)

In [None]:

# skipping this one because I think we'd get more and/or richer info out of it if we had a more-successful linkage of the unconfirmed nomination rows between congress and fjc's data
# #Count prior failed nominations for this seat_id (if column present)

#if "other_nominations_count" not in df.columns and "seat_id" in df.columns:
#    prior_counts = (
#        df.groupby("seat_id").cumcount()  # number seen so far for that seat
#    )
#    df["num_prior_failed_noms"] = prior_counts
#
#display(df.head())
#feature_engineered_df = df.copy()

## Saving interim dataframes

In [None]:
# Write the feature-engineered frame to the interim data directory as CSV,
# leaving out the row index.
feature_engineered_path = INTERIM_DATA_DIR / "feature_engineered.csv"
df.to_csv(feature_engineered_path, index=False)

In [None]:
# Save extracted tables to the interim directory, one CSV per entry in `dfs`.
# The loop variable is deliberately not named `df`: reusing that name would rebind
# the feature-engineered frame to the last table in `dfs`, so re-running the
# feature_engineered.csv save cell afterwards would write the wrong data.
for name, table_df in dfs.items():
    if len(table_df) > 0:  # Only save non-empty DataFrames
        output_path = INTERIM_DATA_DIR / f"{name}.csv"
        table_df.to_csv(output_path, index=False)
        print(f"Saved {len(table_df)} records to {output_path}")

Saved 4022 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_judges.csv
Saved 4720 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_federal_judicial_service.csv
Saved 4022 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_demographics.csv
Saved 8040 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_education.csv
Saved 611 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_other_federal_judicial_service.csv
Saved 828 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_other_nominations_recess.csv
Saved 19003 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/fjc_professional_career.csv
Saved 5569 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_nominations.csv
Saved 5517 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_nominees.csv
Saved 1420 records to /home/wsl2ubuntuuser/nomination_predictor/data/interim/cong_