# Judicial Vacancies Data Source Exploration

This notebook demonstrates how to use the `dataset` module to fetch and process judicial vacancy data.

## Overview

We'll:
1. Fetch and process judicial vacancy data using the dataset module
2. Save the processed data for further analysis
3. Load the data and perform exploratory data analysis (e.g. visualizations)

## Setup

In [None]:
import sys
!{sys.executable} -m pip list | grep nomination_predictor

In [None]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
from typing import Optional, Tuple, Dict, Any

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Import our data processing module
from nomination_predictor import dataset
from nomination_predictor.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Set up visualization style
# Use matplotlib's built-in "default" style sheet and seaborn's "husl" colour palette for every plot below.
plt.style.use('default')
sns.set_palette("husl")
# Show all columns and allow wide rows when DataFrames are rendered in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [None]:
# Report whether the raw-data directory is present and writable by this process
dir_exists = RAW_DATA_DIR.exists()
if dir_exists and os.access(RAW_DATA_DIR, os.W_OK):
    print(f"Data directory is ready: {RAW_DATA_DIR}")
elif dir_exists:
    print(f"Error: No write permission for directory: {RAW_DATA_DIR}")
else:
    print(f"Error: Data directory does not exist: {RAW_DATA_DIR}")

## 1. Fetch and Process Data

Let's fetch the data for the range of available years and process it.

In [None]:
# Number of years of history requested from the dataset module (passed as `years_back`).
YEARS_BACK = 16
# Locations of the raw CSV files inside RAW_DATA_DIR: the combined table plus one file per report type.
raw_combined_csv_path = RAW_DATA_DIR / "judicial_data.csv"
raw_vacancies_csv_path = RAW_DATA_DIR / "judicial_vacancies.csv"
raw_confirmations_csv_path = RAW_DATA_DIR / "judicial_confirmations.csv"
raw_emergencies_csv_path = RAW_DATA_DIR / "judicial_emergencies.csv"

In [None]:
def load_judicial_data(
    years_back: int = YEARS_BACK,
    force_refresh: bool = False,
    output_dir: Path = RAW_DATA_DIR,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load the judicial CSV files from ``output_dir``, running the data pipeline first when needed.

    The pipeline (``dataset.main``) is run when ``force_refresh`` is True or when the
    combined CSV is missing from ``output_dir``. Afterwards (or immediately, when the
    cached file exists) the four pipe-delimited CSV files are read from ``output_dir``;
    any file that does not exist is returned as an empty DataFrame.

    Args:
        years_back: Number of years of historical data to fetch
        force_refresh: If True, run the pipeline even if the combined CSV exists
        output_dir: Directory the pipeline writes to and the CSV files are read from

    Returns:
        Tuple of (vacancies_df, confirmations_df, emergencies_df, combined_df).
        If anything raises, the error is printed and four empty DataFrames are returned.
    """

    def read_or_empty(csv_path: Path) -> pd.DataFrame:
        """Read a pipe-delimited CSV, or return an empty frame when the file is absent."""
        return pd.read_csv(csv_path, sep='|') if csv_path.exists() else pd.DataFrame()

    # Resolve every file against output_dir so that reads and the pipeline's writes
    # always agree, even when a non-default directory is passed in.
    output_dir = Path(output_dir)
    combined_path = output_dir / raw_combined_csv_path.name
    vacancies_path = output_dir / raw_vacancies_csv_path.name
    confirmations_path = output_dir / raw_confirmations_csv_path.name
    emergencies_path = output_dir / raw_emergencies_csv_path.name

    try:
        # Only run the pipeline if output doesn't exist or force_refresh is True
        if force_refresh or not combined_path.exists():
            print("Running data pipeline...")
            with tqdm(total=4, desc="Processing data") as progress_bar:
                # The return value is not used; the data is re-read from disk below
                dataset.main(
                    output_dir=output_dir,
                    output_filename=combined_path.name,
                    years_back=years_back,
                )
                progress_bar.update(1)

                progress_bar.set_description("Loading individual datasets")
                combined_df = read_or_empty(combined_path)
                progress_bar.update(1)

                vacancies_df = read_or_empty(vacancies_path)
                progress_bar.update(1)

                confirmations_df = read_or_empty(confirmations_path)
                emergencies_df = read_or_empty(emergencies_path)
                progress_bar.update(1)
        else:
            print("Loading cached data...")
            combined_df = read_or_empty(combined_path)
            vacancies_df = read_or_empty(vacancies_path)
            confirmations_df = read_or_empty(confirmations_path)
            emergencies_df = read_or_empty(emergencies_path)

        return vacancies_df, confirmations_df, emergencies_df, combined_df

    except Exception as e:
        # Deliberate best-effort for notebook use: report and hand back empty frames
        print(f"Error loading data: {e}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [None]:
import os
from pathlib import Path

import pandas as pd

# Choose between the cached CSV and a fresh fetch: a missing or zero-byte file counts as "no cache"
data_file = Path(raw_combined_csv_path)
data_refreshed: bool = False

has_cached_data = data_file.exists() and os.path.getsize(data_file) > 0
if has_cached_data:
    print("Loading existing judicial data...")
else:
    print("No existing data found or file is empty. Fetching fresh data...")

vacancies_df, confirmations_df, emergencies_df, combined_df = load_judicial_data(
    years_back=YEARS_BACK,
    force_refresh=not has_cached_data,  # refetch only when nothing usable is cached
)
# Record whether this run fetched new data (set only after the load call returned)
data_refreshed = not has_cached_data

## 2. Save Raw Data

Save the raw data to a CSV file in the `data/raw` directory.

In [None]:
# Persist the dataframes as CSV, but only when this run actually fetched new data
from nomination_predictor.dataset import save_dataframe_to_csv

if data_refreshed and (combined_df is not None) and not combined_df.empty:
    save_dataframe_to_csv(combined_df, "judicial_data", RAW_DATA_DIR)

    # Each per-report dataset is written only if it holds at least one row
    per_report_frames = (
        (vacancies_df, "judicial_vacancies"),
        (confirmations_df, "judicial_confirmations"),
        (emergencies_df, "judicial_emergencies"),
    )
    for frame, file_stem in per_report_frames:
        if not frame.empty:
            save_dataframe_to_csv(frame, file_stem, RAW_DATA_DIR)

## 3. Load either Saved or Internet-Retrieved Data

Let's verify that we can load the saved data.

In [None]:
# Load the data again through the same loader, to verify the saved files can be read back
print("Loading judicial data...")
vacancies_df, confirmations_df, emergencies_df, combined_df = load_judicial_data(
    years_back=YEARS_BACK,
    force_refresh=False  # Set to True to refetch data
)

In [None]:
# Display data summary
def display_data_summary():
    """Print record counts for the four loaded dataframes and the span of each key date column.

    Reads the notebook-level ``vacancies_df``, ``confirmations_df``, ``emergencies_df``
    and ``combined_df``; date spans are taken from ``combined_df`` only.
    """
    print("\nData Summary:")
    print(f"Vacancies: {len(vacancies_df)} records")
    print(f"Confirmations: {len(confirmations_df)} records")
    print(f"Emergencies: {len(emergencies_df)} records")
    print(f"Combined: {len(combined_df)} total records")

    if combined_df.empty:
        return

    print("\nDate ranges:")
    key_date_columns = ['vacancy_date', 'nomination_date', 'confirmation_date']
    for col in (c for c in key_date_columns if c in combined_df.columns):
        # Unparseable values become NaT and are discarded before taking min/max
        parsed = pd.to_datetime(combined_df[col], errors='coerce').dropna()
        if len(parsed) > 0:
            print(f"{col}: {parsed.min().date()} to {parsed.max().date()}")

display_data_summary()

In [None]:
# trim & normalize string column names (leave upper/lower casing of string values as-is; only modifying column titles)
for df in (vacancies_df, confirmations_df, emergencies_df):
    # The loader returns a column-less DataFrame when a file is missing or loading failed;
    # the .str accessor can raise on such a non-string column index, so skip those frames.
    if df is None or df.columns.empty:
        continue
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

### Converting dates to datetime format

Data analysis will be easier once the dates are converted from strings to datetime format. For that to work, we first need to clean up one clearly incorrect item in the source data: the Vacancy Confirmations report from January 2015 shows Claudia Ann Wilken assuming senior status on January 1st of the year 3000, but according to https://www.fjc.gov/node/1389756 it really happened on December 17th, 2014. We'll correct that so pd.to_datetime() doesn't choke on a far-future year...

In [None]:
# Overwrite the bad source value for this incumbent with the hand-verified date
wilken_rows = confirmations_df['incumbent'] == 'Wilken,Claudia'
confirmations_df.loc[wilken_rows, 'vacancy_date'] = '12/17/2014'

Parse dates to convert any yyyy/mm/dd, mm/dd/yyyy, or empty string to datetime64[ns]:

In [None]:
# Every frame has a vacancy_date; after all three are converted, vacancies additionally get
# nomination_date parsed and confirmations get confirmation_date parsed.
# Values that cannot be parsed become NaT (errors='coerce').
for df in (vacancies_df, confirmations_df, emergencies_df):
    if df is None or df.empty:
        continue
    df['vacancy_date'] = pd.to_datetime(df['vacancy_date'], errors='coerce')

extra_date_column_by_frame = (
    (vacancies_df, 'nomination_date'),
    (confirmations_df, 'confirmation_date'),
)
for frame, extra_col in extra_date_column_by_frame:
    if frame is None or frame.empty:
        continue
    frame[extra_col] = pd.to_datetime(frame[extra_col], errors='coerce')

Split the “circuit-court” code into two columns (`circuit`, `court_code`) so “02 – NYS” and “02 – NYW” group sensibly:

In [None]:
from nomination_predictor.dataset import parse_circuit_court


# Function to safely parse circuit and court
def add_circuit_court_columns(df: pd.DataFrame, source_col: str = 'circuit_district') -> pd.DataFrame:
    """
    Add 'circuit' and 'court_code' columns to the dataframe by parsing the source column.

    Each value of ``source_col`` is converted to ``str`` and passed to
    ``parse_circuit_court``. When that raises ``ValueError`` a warning is printed,
    'circuit' is left as None and 'court_code' keeps the original value as a fallback.

    Args:
        df: Input DataFrame (modified in place and also returned)
        source_col: Name of the column containing the court string to parse

    Returns:
        DataFrame with added 'circuit' and 'court_code' columns
    """
    circuits = []
    court_codes = []

    # Parse each court string; results are collected positionally so rows that share
    # an index label cannot overwrite one another
    for court_str in df[source_col]:
        try:
            circuit, court_code = parse_circuit_court(str(court_str))
        except ValueError as e:
            print(f"Warning: Could not parse court string '{court_str}': {e}")
            circuit, court_code = None, court_str  # Keep original as fallback
        circuits.append(circuit)
        court_codes.append(court_code)

    # dtype=object keeps the parsed values exactly as returned (no int -> float coercion
    # when some rows are None)
    df['circuit'] = pd.Series(circuits, index=df.index, dtype=object)
    df['court_code'] = pd.Series(court_codes, index=df.index, dtype=object)

    return df

In [None]:
# rename the differently-named, same-purpose column from vacancies tables to make looping easier:
if not vacancies_df.empty and 'court' in vacancies_df.columns:
    vacancies_df = vacancies_df.rename(columns={'court': 'circuit_district'})

# Add the parsed columns to every non-empty frame and preview the result.
# add_circuit_court_columns modifies the frame it is given, so the notebook-level
# dataframes receive the new columns even though only the loop variable is rebound.
for df_name, df in [('vacancies_df', vacancies_df), ('confirmations_df', confirmations_df), ('emergencies_df', emergencies_df)]:
    if not df.empty:
        print(f"\nAdding circuit/court columns to {df_name}")
        df = add_circuit_court_columns(df)
        display(df[['circuit_district', 'circuit', 'court_code']].head())

## Data cleaning

Let's start with any hand-checked corrections to incorrectly-written reports.  (Similar to what had to be done to the record about the year 3000 above, which was a prerequisite to us cleaning up datetime formats.)

In [None]:
# Remove a record whose details were corrected by the following month's report. The bad record is
# easy to spot because its name field carries an unexpected parenthetical number.
# The corrected details were checked against https://www.ca2.uscourts.gov/judges/bios/gel.html and https://www.senate.gov/legislative/LIS/roll_call_votes/vote1111/vote_111_1_00288.htm
is_superseded_record = confirmations_df['nominee'] == "Lynch,Gerard E. (103419)"
superseded_labels = confirmations_df.loc[is_superseded_record].index
confirmations_df = confirmations_df.drop(index=superseded_labels)

### Creating a *seat-vacancy key* for ourselves

A vacancy can be uniquely defined by a consistent combination of circuit, court district, incumbent (or in the case of newly-opened roles, the lack of an incumbent), and vacancy date.  Over time it typically gets a nominee, then gets a confirmation. But those are a matter of its status or progress over time; the vacancy ID we'd give it would stay the same across months to indicate it's the same vacancy.

The seat_id can end up looking something like:
`seat_id:str = (circuit|court_code|incumbent|vacancy_date).casefold()`

In [None]:
# The emergencies report calls this column 'vacancy_created_by'; rename it to 'incumbent'
# so all three dataframes use the same name for that information
needs_incumbent_rename = 'vacancy_created_by' in emergencies_df.columns and not emergencies_df.empty
if needs_incumbent_rename:
    emergencies_df = emergencies_df.rename(columns={'vacancy_created_by': 'incumbent'})

# Because every dataframe exposes the same column names (circuit, court_code, incumbent,
# vacancy_date), a single key builder can serve all three.
def make_key(df: pd.DataFrame) -> pd.Series:
    """Build the seat-vacancy key for every row of *df*.

    The key is ``"<circuit>-<court_code>|<incumbent>|<YYYY-MM-DD>"``, casefolded.
    A missing court_code contributes an empty string and a missing incumbent
    contributes "POSITION OPEN".

    Args:
        df: DataFrame with 'circuit', 'court_code', 'incumbent' and a
            datetime-typed 'vacancy_date' column

    Returns:
        Series of key strings aligned with df's index
    """
    # Use the new circuit and court_code columns instead of circuit_district because we just engineered the former pair to be more consistent across dataframes
    court = df['circuit'].astype(str) + '-' + df['court_code'].fillna('')
    incumbent = df['incumbent'].fillna("POSITION OPEN")
    # NOTE(review): a missing vacancy_date appears to make the whole key missing for that row — confirm that is intended
    vacancy_day = df['vacancy_date'].dt.strftime("%Y-%m-%d")
    return (court + "|" + incumbent + "|" + vacancy_day).str.casefold()

# Attach the seat-vacancy key to each dataframe (one assignment per frame)
vacancies_df['seat_id'] = make_key(vacancies_df)
emergencies_df['seat_id'] = make_key(emergencies_df)
confirmations_df['seat_id'] = make_key(confirmations_df)

This lets us de-duplicate monthly re-listings, keeping the most recent one because it is more likely to name a nominee rather than leave that field blank (to be determined):

In [None]:
def keep_latest_with_index(df, name="dataset"):
    """Reduce *df* to one row per seat_id, keeping the row from the newest report.

    Args:
        df: Input DataFrame with source_year, source_month, and seat_id columns
        name: Label used in the printed progress messages

    Returns:
        A new DataFrame indexed by seat_id (seat_id is also kept as a column) with an
        added 'last_report_date' column, or None when *df* is None or empty.
        The input frame is not modified.
    """
    if df is None or df.empty:
        print(f"Skipping {name} - not found or empty")
        return None

    rows_before = len(df)

    # Reports arrive at most once per type per month, so pinning the day to the 1st is
    # precise enough for ordering even though the real reports are end-of-month
    report_month = pd.to_datetime({
        'year': df['source_year'],
        'month': df['source_month'],
        'day': 1,
    })
    dated = df.assign(last_report_date=report_month)

    # Newest report first within each seat, then keep that first row
    newest_first = dated.sort_values(['seat_id', 'last_report_date'], ascending=[True, False])
    latest = newest_first.drop_duplicates('seat_id', keep='first')
    latest = latest.set_index('seat_id', drop=False)  # seat_id stays available as a column too

    rows_dropped = rows_before - len(latest)
    print(f"{name}: {len(latest)} unique seats (removed {rows_dropped} duplicates)")
    return latest

# De-duplicate each report type (same order as before: vacancies, confirmations, emergencies)
dfs = {
    key: keep_latest_with_index(frame, label)
    for key, frame, label in (
        ('vacancies', vacancies_df, "Vacancies"),
        ('confirmations', confirmations_df, "Confirmations"),
        ('emergencies', emergencies_df, "Emergencies"),
    )
}

## Seat-Id-aware combining into merged dataset

In [None]:
# Show each dataframe's column names so the frames can be compared by eye before merging
for df in [vacancies_df, confirmations_df, emergencies_df]:
    print(df.columns) 

In [None]:
def _overlay_non_null(combined, overlay):
    """Overlay the rows of *overlay* onto *combined*, matching on seat_id.

    For a seat_id already present in *combined*, each non-null value of the overlay row
    replaces the existing value in columns *combined* already has. Rows whose seat_id is
    not present are appended, keeping their seat_id. Returns the resulting frame.
    """
    if overlay is None or overlay.empty:
        return combined

    known_ids = set(combined['seat_id']) if 'seat_id' in combined.columns else set()
    pending = {}  # seat_id -> row dict, for seats that are not in `combined` yet

    # seat_id may be both the index and a column; drop the index and rely on the column
    # so that every row dict still carries its seat_id
    for row in overlay.reset_index(drop=True).to_dict('records'):
        seat_id = row['seat_id']
        non_null = {col: value for col, value in row.items() if not pd.isna(value)}
        if seat_id in known_ids:
            matches = combined['seat_id'] == seat_id
            for col, value in non_null.items():
                if col != 'seat_id' and col in combined.columns:
                    combined.loc[matches, col] = value
        elif seat_id in pending:
            # A repeated new seat within the same overlay: later non-null values win
            pending[seat_id].update(non_null)
        else:
            pending[seat_id] = row

    if pending:
        # Append all new seats in one concat instead of one concat per row
        combined = pd.concat([combined, pd.DataFrame(list(pending.values()))], ignore_index=True)
    return combined


def combine_dataframes(confirmations_df, emergencies_df, vacancies_df):
    """Combine dataframes with confirmations having highest priority, then emergencies, then vacancies.
    Only non-null values will overwrite existing values.

    Args:
        confirmations_df: De-duplicated confirmations (highest priority); None or empty is skipped
        emergencies_df: De-duplicated emergencies (medium priority); None or empty is skipped
        vacancies_df: De-duplicated vacancies (lowest priority, used as the base); None is treated as empty

    Returns:
        A new DataFrame; the inputs are not modified.
    """
    # Start with the lowest priority (vacancies)
    combined = vacancies_df.copy() if vacancies_df is not None else pd.DataFrame()

    # Apply in ascending priority so that later overlays win
    combined = _overlay_non_null(combined, emergencies_df)
    combined = _overlay_non_null(combined, confirmations_df)

    return combined

# Merge the de-duplicated frames (argument order: confirmations, emergencies, vacancies)
combined_df = combine_dataframes(dfs['confirmations'], dfs['emergencies'], dfs['vacancies'])

# Remove bookkeeping columns; any that are absent are simply skipped
columns_to_drop = ['circuit_district', 'source_year', 'source_month', 'source_page_type']
combined_df = combined_df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Preview the merged dataset and print a few headline figures.
if combined_df is not None:
    print("\nSample data from combined dataset:")
    display(combined_df.head())
    
    # Basic statistics
    print("\nBasic Statistics:")
    
    if 'vacancy_date' in combined_df.columns:
        print("\nDate Range:")
        # Store a datetime version of vacancy_date in a new 'vacancy_date_dt' column so min/max compare dates
        # NOTE(review): vacancy_date was already parsed with pd.to_datetime in an earlier cell, so this may be redundant — confirm
        combined_df['vacancy_date_dt'] = pd.to_datetime(combined_df['vacancy_date'], format='%m/%d/%Y')
        print(f"Earliest vacancy: {combined_df['vacancy_date_dt'].min().strftime('%m/%d/%Y')}")
        print(f"Latest vacancy: {combined_df['vacancy_date_dt'].max().strftime('%m/%d/%Y')}")
    
    # NOTE(review): 'circuit_district' is one of the columns dropped from combined_df in the
    # preceding cell, so this branch presumably never runs — confirm which column should be counted
    if 'circuit_district' in combined_df.columns:
        print("\nRecords by Circuit/District:")
        print(combined_df['circuit_district'].value_counts().head(10))
        

# Features Engineering

Add fields which will become our regression targets.

TODO: supplement these with additional "elapsing only while the legislative branch was in session" durations.

In [None]:
# List the columns available on the combined dataset before deriving new features
print(combined_df.columns)

In [None]:
import numpy as np


def safe_date_diff(end_date, start_date):
    """Return the whole days from start_date to end_date, or NaN when that cannot be computed.

    NaN is returned when either date is missing, or when subtracting the two values
    raises TypeError or ValueError.
    """
    both_present = pd.notna(end_date) and pd.notna(start_date)
    if not both_present:
        return np.nan
    try:
        elapsed = end_date - start_date
        return elapsed.days
    except (TypeError, ValueError):
        return np.nan

# Derive the three duration columns row by row; each is (end date - start date) in days
duration_specs = (
    ('days_vac_to_nom', 'nomination_date', 'vacancy_date'),
    ('days_nom_to_conf', 'confirmation_date', 'nomination_date'),
    ('days_vac_to_conf', 'confirmation_date', 'vacancy_date'),
)
for target_col, end_col, start_col in duration_specs:
    combined_df[target_col] = combined_df.apply(
        # Bind the column names as defaults so each lambda uses this iteration's pair
        lambda x, end_col=end_col, start_col=start_col: safe_date_diff(x[end_col], x[start_col]),
        axis=1
    )

Add fields to indicate who was the sitting president on key dates, which presidential term and # of days into that presidential term.

In [None]:
# TODO: implement test cases and function in features.py such that it can take a datetime-type input, and returns an int indicating which president was sitting in office at the time
# TODO: call that function from here to add fields for each row indicating who was president at times of vacancy, nomination, and confirmation

Add field to indicate which legislative session key dates occurred in

In [None]:
# TODO: implement test cases and function in features.py such that it can take a datetime-type input, and returns an int to indicate which legislative session it falls under.
# TODO: decide how to handle in-between-sessions dates
# TODO: call that function from here to add fields

Add fields for how much government is/isn't unified

Add field to group by U.S. states because many have multiple regional districts (e.g. CA-N, CA-S)

# Exploratory Data Analysis

In [None]:
def explore_data(df: pd.DataFrame, dataset_name: str) -> None:
    """
    Generate exploratory visualizations for a dataset.

    Shows the first rows, then one line chart per date column (count of records per
    date) and one bar chart per low-cardinality text column (fewer than 20 distinct
    values). Each chart is drawn exactly once.

    Note: columns whose name contains "date" are converted to datetime in place on
    the given DataFrame (unparseable values become NaT).

    Args:
        df: DataFrame to explore
        dataset_name: Name of the dataset for titles
    """
    if df.empty:
        print(f"No data available for {dataset_name}")
        return

    print(f"\nExploring {dataset_name} data:")

    # Display basic info
    print("\nFirst few records:")
    display(df.head())

    # Plot time series if date columns exist
    date_columns = [col for col in df.columns if 'date' in col.lower()]
    for date_col in date_columns:
        if df[date_col].isna().all():
            continue
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        time_series = df[date_col].dropna().value_counts().sort_index()
        if time_series.empty:
            # Nothing parseable in this column, so there is nothing to draw
            continue
        plt.figure(figsize=(12, 6))
        time_series.plot(kind='line', marker='o')
        plt.title(f"{dataset_name} by {date_col}")
        plt.xlabel(date_col)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Plot categorical data
    categorical_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].nunique() < 20]
    for col in categorical_cols:
        plt.figure(figsize=(10, 6))
        df[col].value_counts().sort_values(ascending=False).plot(kind='bar')
        plt.title(f"{col} Distribution")
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Run the exploration for every report type that actually has rows
for frame, label in (
    (vacancies_df, "Vacancies"),
    (confirmations_df, "Confirmations"),
    (emergencies_df, "Emergencies"),
):
    if not frame.empty:
        explore_data(frame, label)

In [None]:
# Charts for the merged dataset: vacancy reasons (pie) and vacancies per month (line).
if not combined_df.empty:
    print("\nCombined Data Analysis:")
    
    # Convert date columns
    # Every column whose name contains "date" is parsed; unparseable values become NaT.
    date_columns = [col for col in combined_df.columns if 'date' in col.lower()]
    for col in date_columns:
        combined_df[col] = pd.to_datetime(combined_df[col], errors='coerce')
    
    # Plot vacancy reasons
    if 'vacancy_reason' in combined_df.columns:
        plt.figure(figsize=(12, 6))
        combined_df['vacancy_reason'].value_counts().plot(kind='pie', autopct='%1.1f%%')
        plt.title("Vacancy Reasons")
        plt.ylabel("")
        plt.show()
    
    # Plot vacancies over time
    if 'vacancy_date' in combined_df.columns:
        plt.figure(figsize=(14, 6))
        # Count rows per calendar month of vacancy_date.
        # NOTE(review): newer pandas releases deprecate the 'M' alias in favour of 'ME' — confirm against the pinned pandas version
        combined_df.set_index('vacancy_date').resample('M').size().plot()
        plt.title("Monthly Vacancies Over Time")
        plt.xlabel("Date")
        plt.ylabel("Number of Vacancies")
        plt.tight_layout()
        plt.show()

## Next Steps

1. **Data Cleaning**: In the next notebook, we'll clean and preprocess this data.
2. **Exploratory Analysis**: We'll explore the data to understand its structure and quality.
3. **Feature Engineering**: We'll create additional features that might be useful for analysis.
4. **Visualization**: We'll create visualizations to understand trends and patterns.

In [None]:
# Cell 1: Setup and Imports
# Set up logging
import logging
from pathlib import Path
from typing import Any, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display

from nomination_predictor.congress_api import CongressAPIClient
# Import the function from your module
from nomination_predictor.dataset import compare_and_validate_api_data

# Emit INFO-level (and higher) log records and create a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set display options
# Show all columns, truncate cell text at 50 characters, and use seaborn's "whitegrid" theme.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)
sns.set_theme(style="whitegrid")

In [None]:
# Cell 2: Fetch Data from Congress.gov API
def fetch_api_data(congress: int = 118) -> pd.DataFrame:
    """Fetch judicial nominations for one congress from the Congress.gov API.

    Args:
        congress: Number of the congress to query (default: 118)

    Returns:
        pd.DataFrame: One row per nomination record returned by the client, or an
        empty DataFrame when nothing was returned or the request raised.
    """
    try:
        client = CongressAPIClient()
        # TODO: expand to look through prior congresses after proof of concept with just one
        logger.info("Fetching data for congress %s...", congress)

        # Get list of nomination records
        nominations = client.get_judicial_nominations(congress)

        # Convert list of dicts to DataFrame
        if nominations:  # Only create DataFrame if we have data
            return pd.DataFrame(nominations)

        logger.error("No data retrieved from API")
        return pd.DataFrame()  # Return empty DataFrame if no data

    except Exception as e:
        # Best-effort for notebook use: log the failure and fall back to an empty frame
        logger.error("Error fetching API data: %s", e)
        return pd.DataFrame()  # Return empty DataFrame on error

# Fetch the data
api_df = fetch_api_data()
print(api_df.head())

In [None]:

# Cell 3: Run the Comparison
def run_comparison_analysis(api_df: pd.DataFrame) -> Dict[str, Any]:
    """Compare the API data against the legacy schemas and render the findings.

    Returns the results dict from ``compare_and_validate_api_data``, or
    ``{"error": ...}`` without displaying anything when *api_df* is empty.
    """
    if api_df.empty:
        return {"error": "No API data available for comparison"}

    results = compare_and_validate_api_data(api_df)

    # --- Summary table: one (metric, value) pair per row ---
    display(Markdown("## 📊 Schema Comparison Summary"))
    summary_rows = [
        ("API Records", results["api_record_count"]),
        ("Legacy Vacancy Records", results.get("vacancy_record_count", "N/A")),
        ("Legacy Confirmation Records", results.get("confirmation_record_count", "N/A")),
        ("Vacancy Schema Coverage",
         f"{results['schema_comparison']['vacancy'].get('coverage_percentage', 0):.1f}%"),
        ("Confirmation Schema Coverage",
         f"{results['schema_comparison']['confirmation'].get('coverage_percentage', 0):.1f}%"),
        ("Overall Compatibility Score", f"{results.get('compatibility_score', 0):.1f}%"),
    ]
    metrics, values = zip(*summary_rows)
    display(pd.DataFrame({"Metric": list(metrics), "Value": list(values)}))

    # --- Critical fields the API lacks, per legacy source ---
    display(Markdown("## 🔍 Critical Fields Analysis"))
    critical_data = [
        {
            "Source": source.capitalize(),
            "Critical Field": field,
            "Status": "❌ Missing"
        }
        for source in ("vacancy", "confirmation")
        for field in (results.get(f"missing_critical_{source}_fields") or [])
    ]
    if not critical_data:
        display(Markdown("✅ All critical fields are present in the API data"))
    else:
        display(pd.DataFrame(critical_data))

    # --- Null-percentage chart and warning ---
    display(Markdown("## 📈 Data Quality Analysis"))
    quality = results.get("data_quality")
    if quality:
        null_df = pd.DataFrame(
            quality["null_percentage"].items(),
            columns=["Field", "Null Percentage"]
        ).sort_values("Null Percentage", ascending=False)

        plt.figure(figsize=(12, 6))
        sns.barplot(data=null_df.head(10), x="Null Percentage", y="Field")
        plt.title("Top 10 Fields by Null Percentage")
        plt.tight_layout()
        plt.show()

        high_null_fields = quality["high_null_fields"]
        if high_null_fields:
            display(Markdown(f"⚠️ High null percentage (>50%) in fields: {', '.join(high_null_fields)}"))

    return results

# Run the analysis
# `results` holds the comparison dict, or {"error": ...} when api_df is empty.
results = run_comparison_analysis(api_df)

In [None]:

# Cell 4: Detailed Field Comparison (Optional)
def display_field_comparison(results: Dict[str, Any]) -> None:
    """Display detailed field comparison between schemas.

    Args:
        results: Dict returned by run_comparison_analysis. When it has no
            "schema_comparison" entry (for example the {"error": ...} dict returned
            for empty API data) a short message is printed and nothing else is shown.
    """
    schema_comparison = results.get("schema_comparison")
    if not schema_comparison:
        print("No schema comparison available to display")
        return

    display(Markdown("## 🔄 Detailed Field Comparison"))

    for source in ("vacancy", "confirmation"):
        if source not in schema_comparison:
            continue
        comp = schema_comparison[source]
        display(Markdown(f"### {source.capitalize()} Schema"))

        # Common fields
        display(Markdown("#### ✅ Common Fields"))
        display(pd.DataFrame(comp.get("common_fields", []), columns=["Field"]))

        # Missing fields
        if comp.get("missing_from_api"):
            display(Markdown("#### ❌ Fields Missing from API"))
            display(pd.DataFrame(comp["missing_from_api"], columns=["Field"]))

        # Extra fields
        if comp.get("extra_in_api"):
            display(Markdown("#### ➕ Extra Fields in API"))
            display(pd.DataFrame(comp["extra_in_api"], columns=["Field"]))

# Show the detailed field comparison (comment this call out to skip it)
display_field_comparison(results)