# Setup and Configuration

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime, timedelta

# Set display options: show every column when printing a DataFrame and render
# floats with three decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Environment checks: report which interpreter is running, the working
# directory, and the contents of the relative 'data' directory that the
# loading cell reads from. os.listdir raises if 'data' does not exist.
print(f"Python executable: {sys.executable}")  
print("Current working directory:", os.getcwd())
print("\nFiles in data directory:", os.listdir('data'))

# Helper Functions and Classes

In [None]:
# Helper Functions for Data Loading and Cleaning
# update: now takes varied date ranges into account in loading
def load_and_clean_shifts(df):
    """
    Return a copy of the shifts data with its timestamp columns parsed.

    Parameters:
        df (pd.DataFrame): Raw shifts dataframe

    Returns:
        pd.DataFrame: Copy of ``df`` in which 'Start', 'End' and 'Created At'
        (whichever of them exist) have been converted to datetimes

    Notes:
        - The input frame is never modified; all work happens on a copy
        - Columns that are absent are skipped silently
        - A column that fails to parse is left unchanged; the error and a
          sample of the values that could not be parsed are printed instead
    """
    cleaned = df.copy()

    for column in ('Start', 'End', 'Created At'):
        if column not in cleaned.columns:
            continue

        try:
            cleaned[column] = pd.to_datetime(cleaned[column], format='mixed')
        except Exception as e:
            print(f"Error converting {column} to datetime: {str(e)}")
            # Re-parse leniently so the rows that cannot be parsed can be shown
            coerced = pd.to_datetime(cleaned[column], format='mixed', errors='coerce')
            problematic_rows = cleaned[coerced.isna()]
            if problematic_rows.empty:
                continue
            print(f"Problematic rows in {column}:")
            print(problematic_rows[column].head())

    return cleaned

def load_and_clean_bookings(df):
    """
    Load and clean booking logs dataset

    Parameters:
        df (pd.DataFrame): Raw bookings dataframe

    Returns:
        pd.DataFrame: Copy of the bookings dataframe with 'Created At'
        converted to datetime

    Notes:
        - Makes a copy to avoid modifying original data
        - Uses format='mixed' (each value parsed individually), matching the
          shifts and cancellations loaders, so that timestamps written in
          more than one format do not make the whole conversion fail
        - If conversion still fails (or the column is missing) the error is
          printed and the column is returned unchanged
    """
    df = df.copy()
    # Convert datetime columns
    try:
        df['Created At'] = pd.to_datetime(df['Created At'], format='mixed')
    except Exception as e:
        print(f"Error converting Created At: {str(e)}")
    return df

def load_and_clean_cancellations(df):
    """
    Load and clean cancellation logs dataset

    Parameters:
        df (pd.DataFrame): Raw cancellations dataframe

    Returns:
        pd.DataFrame: Copy of the cancellations dataframe with 'Created At'
        and 'Shift Start Logs' converted to datetime where possible

    Notes:
        - Makes a copy to avoid modifying original data
        - Each column is converted independently, so a failure in one does
          not prevent the other from being parsed
        - A missing column is reported and skipped instead of raising from
          inside the error handler
        - When a column fails to parse it is left unchanged and a sample of
          the non-null values that could not be parsed is printed
    """
    df = df.copy()

    # Convert datetime columns with flexible parsing
    for col in ('Created At', 'Shift Start Logs'):
        if col not in df.columns:
            print(f"Error in datetime conversion: column {col!r} not found")
            continue

        try:
            df[col] = pd.to_datetime(df[col], format='mixed')
        except Exception as e:
            print(f"Error in datetime conversion: {str(e)}")
            # Try to identify problematic rows: values that were present in
            # the raw data but cannot be parsed (genuinely missing values are
            # not "problematic formats", so they are excluded).
            coerced = pd.to_datetime(df[col], format='mixed', errors='coerce')
            prob_rows = df[coerced.isna() & df[col].notna()]
            if not prob_rows.empty:
                print("\nSample of problematic date formats:")
                print(prob_rows[col].head())

    return df

def categorize_lead_time(hours):
    """
    Map a cancellation lead time onto its business category.

    Parameters:
        hours (float): Lead time in hours (negative means the cancellation
            happened after the shift start)

    Returns:
        str: The label of the first band whose exclusive upper bound exceeds
        ``hours``; 'Early (3+ days)' when no band matches
    """
    # (exclusive upper bound in hours, label), checked in ascending order
    bands = (
        (0, 'No-Show'),  # Cancelled after shift start
        (4, 'Late (<4hrs)'),
        (24, 'Same Day'),
        (72, 'Advance (<3 days)'),
    )
    for upper_bound, label in bands:
        if hours < upper_bound:
            return label
    return 'Early (3+ days)'

def clean_lead_times(cancellations_df):
    """
    Clean and categorize lead times in cancellation data

    Parameters:
        cancellations_df (pd.DataFrame): Raw cancellations dataframe with a
            numeric 'Lead Time' column (hours)

    Returns:
        pd.DataFrame: Copy of the cancellations data restricted to rows with
            a finite, non-null lead time, with the added columns
            'clean_lead_time', 'cancellation_category',
            'is_extreme_negative' and 'is_extreme_positive'
        pd.Series: Statistics about removed records for quality control
            ('original_rows', 'null_lead_times', 'infinite_values',
            'final_rows', 'removed_rows')
    """
    df = cancellations_df.copy()

    lead_time = df['Lead Time']
    is_null = lead_time.isnull()
    # np.isinf is False for NaN, so nulls are not counted a second time here
    # (the complement of np.isfinite would include them).
    is_infinite = np.isinf(lead_time)

    # Track data quality issues
    quality_stats = {
        'original_rows': len(df),
        'null_lead_times': is_null.sum(),
        'infinite_values': is_infinite.sum()
    }

    # Only remove truly invalid data. Copy the filtered frame so that the
    # column assignments below write to an independent object rather than to
    # a slice of the unfiltered frame.
    df = df[~is_null & ~is_infinite].copy()

    # Add cleaned lead time without filtering extremes
    df['clean_lead_time'] = df['Lead Time']

    # Categorize all lead times
    df['cancellation_category'] = df['clean_lead_time'].apply(categorize_lead_time)

    # Add flags for extreme values for analysis
    df['is_extreme_negative'] = df['Lead Time'] < -72  # Flag cancellations >3 days after
    df['is_extreme_positive'] = df['Lead Time'] > 1000 # Flag cancellations >41 days before

    quality_stats['final_rows'] = len(df)
    quality_stats['removed_rows'] = quality_stats['original_rows'] - quality_stats['final_rows']

    return df, pd.Series(quality_stats)

# Data Summary Storage Class
class DataSummary:
    """Two-level store for analysis results: dataset name -> summary type -> data."""

    def __init__(self):
        # {dataset_name: {summary_type: data}}
        self.summaries = {}

    def add_summary(self, dataset_name, summary_type, data):
        """Store ``data`` under (dataset_name, summary_type), replacing any previous value."""
        self.summaries.setdefault(dataset_name, {})[summary_type] = data

    def get_summary(self, dataset_name, summary_type=None):
        """
        Look up stored results.

        Returns the whole dict for ``dataset_name`` when ``summary_type`` is
        falsy, otherwise the single entry; None when nothing is stored.
        """
        if not summary_type:
            return self.summaries.get(dataset_name)
        return self.summaries.get(dataset_name, {}).get(summary_type)

    def print_summary(self, dataset_name):
        """Print every stored summary for a dataset; prints nothing if the dataset is unknown."""
        if dataset_name not in self.summaries:
            return
        print(f"\nSummary for {dataset_name}:")
        for summary_type, data in self.summaries[dataset_name].items():
            print(f"\n{summary_type}:")
            print(data)

# Initialize summary storage: one module-level DataSummary instance that the
# analysis cells below write their results into via summary.add_summary(...).
summary = DataSummary()

In [None]:
# Summary explorer helper functions 
def explore_summary(summary, indent=0, max_display_length=100):
    """
    Recursively explore and display the contents of the DataSummary object.

    Parameters:
        summary: The DataSummary object or a nested dictionary/value to explore
        indent: Current indentation level (default: 0)
        max_display_length: Maximum length for displayed values (default: 100);
            applied at every nesting level

    Returns:
        None. Everything is written to stdout.

    Notes:
        - Nested dictionaries are printed one indentation level deeper
        - pandas Series/DataFrame values are shown as their first five
          printed lines, followed by '...' when there are more
        - Any other value is converted with str() and truncated to
          max_display_length characters
    """
    def format_value(value):
        """Format a value for display, truncating if too long"""
        str_value = str(value)
        if len(str_value) > max_display_length:
            return str_value[:max_display_length] + '...'
        return str_value

    def print_indented(text, indent):
        """Print text with proper indentation"""
        print('    ' * indent + text)

    if isinstance(summary, dict):
        # Recursively explore dictionary contents
        for key, value in summary.items():
            if isinstance(value, dict):
                print_indented(f"{key}:", indent)
                # Pass the limit down so nested values honour the caller's setting
                explore_summary(value, indent + 1, max_display_length)
            elif isinstance(value, (pd.Series, pd.DataFrame)):
                print_indented(f"{key}: [pandas {type(value).__name__}]", indent)
                repr_lines = str(value).split('\n')
                for line in repr_lines[:5]:  # Show first 5 lines
                    print_indented(line, indent + 1)
                if len(repr_lines) > 5:
                    print_indented('...', indent + 1)
            else:
                print_indented(f"{key}: {format_value(value)}", indent)

    elif isinstance(summary, DataSummary):
        # If we're starting with a DataSummary object, explore its summaries
        print("\n=== Complete Summary Contents ===\n")
        explore_summary(summary.summaries, indent, max_display_length)

    else:
        # Base case: print the value
        print_indented(format_value(summary), indent)

def get_summary_structure(summary):
    """
    Print an outline of the summary: each dataset name followed by the
    summary types stored for it, without any of the stored values.

    Parameters:
        summary: The DataSummary object (only its ``summaries`` mapping is read)
    """
    print("\n=== Summary Structure ===\n")
    for dataset_name, entries in summary.summaries.items():
        print(f"\nDataset: {dataset_name}")
        for summary_type in entries:
            print(f"  └── {summary_type}")

# Example usage: print the dataset/summary-type outline of the module-level
# `summary` object, then print (as plain text, without running them) the
# calls that display stored contents.
print("First, let's see the overall structure of your summary:")
get_summary_structure(summary)

print("\nWould you like to see the complete contents of any specific dataset?")
print("You can view them using:")
print("explore_summary(summary.get_summary('dataset_name'))")

# To view everything:
print("\nOr view all contents with:")
print("explore_summary(summary)")

# Initial Data Loading and Validation

In [None]:
# === Load and Prepare All Datasets ===
print("Loading and preparing all datasets...")

# Load all datasets from the relative 'data' directory. No parse_dates is
# passed, so date columns are not parsed here; parsing is done by the
# load_and_clean_* helpers.
shifts_df = pd.read_csv('data/cleveland_shifts_large.csv')
bookings_df = pd.read_csv('data/booking_logs_large.csv')
cancellations_df = pd.read_csv('data/cancel_logs_large.csv')

def get_overlapping_date_range(shifts_df, bookings_df, cancellations_df):
    """
    Find the window of 'Created At' values that all three datasets share.

    The window runs from the latest of the three per-dataset minimums to the
    earliest of the three per-dataset maximums, so every dataset has records
    on both sides of each boundary.

    Parameters:
        shifts_df, bookings_df, cancellations_df (pd.DataFrame): Frames that
            each have a 'Created At' column

    Returns:
        tuple: (overlap_start, overlap_end). No check is made that
        overlap_start <= overlap_end.
    """
    created_columns = [
        frame['Created At']
        for frame in (shifts_df, bookings_df, cancellations_df)
    ]

    # Latest start and earliest end across the datasets
    overlap_start = max(column.min() for column in created_columns)
    overlap_end = min(column.max() for column in created_columns)

    return overlap_start, overlap_end

def filter_to_overlap_period(df, start_date, end_date):
    """
    Keep only the rows whose 'Created At' lies in [start_date, end_date].

    Both boundaries are inclusive. The input frame is not modified; the
    selected rows are returned.
    """
    in_window = df['Created At'].between(start_date, end_date)
    return df[in_window]

# Restrict all three datasets to the period they have in common.

# Parse the timestamp columns first. The frames come straight from
# pd.read_csv, so 'Created At' holds raw strings; taking min/max and applying
# >= / <= on strings compares them lexicographically, which is only
# chronologically correct when every value uses the same sortable format.
# The cleaning helpers are safe to run again later on already-parsed columns.
shifts_df = load_and_clean_shifts(shifts_df)
bookings_df = load_and_clean_bookings(bookings_df)
cancellations_df = load_and_clean_cancellations(cancellations_df)

# Remember the row counts before anything is filtered out
original_sizes = {
    'shifts': len(shifts_df),
    'bookings': len(bookings_df),
    'cancellations': len(cancellations_df)
}

# Find overlapping period
overlap_start, overlap_end = get_overlapping_date_range(shifts_df, bookings_df, cancellations_df)

# Filter all datasets to overlapping period
shifts_df = filter_to_overlap_period(shifts_df, overlap_start, overlap_end)
bookings_df = filter_to_overlap_period(bookings_df, overlap_start, overlap_end)
cancellations_df = filter_to_overlap_period(cancellations_df, overlap_start, overlap_end)

# Print information about the filtering
print("\n=== Data Filtering Summary ===")
print(f"Analysis Period: {overlap_start} to {overlap_end}")
print("\nDataset sizes after filtering:")
print(f"Shifts: {len(shifts_df):,} records")
print(f"Bookings: {len(bookings_df):,} records")
print(f"Cancellations: {len(cancellations_df):,} records")

# Store filtering info in summary. 'original_sizes' holds the pre-filter row
# counts; 'filtered_sizes' holds what is left inside the overlap window.
summary.add_summary('data_filtering', 'overlap_period', {
    'start': overlap_start,
    'end': overlap_end,
    'original_sizes': original_sizes,
    'filtered_sizes': {
        'shifts': len(shifts_df),
        'bookings': len(bookings_df),
        'cancellations': len(cancellations_df)
    }
})

In [None]:
# Clean and prepare the data: each helper returns a copy of its input with
# the timestamp columns converted to datetimes where conversion succeeds, so
# the module-level names are rebound to the cleaned copies.
shifts_df = load_and_clean_shifts(shifts_df)
bookings_df = load_and_clean_bookings(bookings_df)
cancellations_df = load_and_clean_cancellations(cancellations_df)

# Function to analyze and summarize a dataset
def analyze_dataset(df, dataset_name, summary):
    """
    Print an overview of one dataset and record its key facts in ``summary``.

    Parameters:
        df (pd.DataFrame): Dataset with at least 'Created At' and 'ID' columns
        dataset_name (str): Display name; its lower-cased form is the key
            used in ``summary``
        summary: Object with an ``add_summary(dataset, summary_type, data)``
            method (the DataSummary store)

    Returns:
        tuple: (date_range, unique_ids) where date_range is a dict with
        'start_date', 'end_date' and 'total_days' derived from 'Created At',
        and unique_ids is the number of distinct values in 'ID'
    """
    summary_key = dataset_name.lower()

    print(f"\n=== {dataset_name} Data Overview ===")
    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())
    print("\nData Types:\n", df.dtypes)

    # Missing value analysis
    missing_values = df.isnull().sum()
    print("\nMissing Values:\n", missing_values)

    # Display sample data
    print("\nFirst few rows:")
    print(df.head())

    # Store findings in summary
    for summary_type, data in (
        ('shape', df.shape),
        ('dtypes', df.dtypes),
        ('missing_values', missing_values),
    ):
        summary.add_summary(summary_key, summary_type, data)

    # Additional temporal analysis: span covered by 'Created At'
    created_at = pd.to_datetime(df['Created At'])
    first_created = created_at.min()
    last_created = created_at.max()
    date_range = {
        'start_date': first_created,
        'end_date': last_created,
        'total_days': (last_created - first_created).days
    }
    summary.add_summary(summary_key, 'date_range', date_range)

    # Record unique IDs count
    unique_ids = df['ID'].nunique()
    summary.add_summary(summary_key, 'unique_ids', unique_ids)

    return date_range, unique_ids

# Analyze each dataset (display name -> cleaned frame)
datasets = {
    'Shifts': shifts_df,
    'Bookings': bookings_df,
    'Cancellations': cancellations_df
}

print("\n=== Dataset Analysis ===")
for name, df in datasets.items():
    # analyze_dataset prints the overview and stores the results in `summary`
    date_range, unique_ids = analyze_dataset(df, name, summary)
    print(f"\n{name} Dataset Summary:")
    print(f"Date Range: {date_range['start_date']} to {date_range['end_date']} ({date_range['total_days']} days)")
    print(f"Unique IDs: {unique_ids}")

# Cross-dataset validation: row counts and average rows per day of the
# 'Created At' span.
# NOTE(review): the divisor is the whole-day span, which is 0 when a dataset
# covers less than one day; that would raise ZeroDivisionError.
print("\n=== Cross-Dataset Validation ===")
for name, df in datasets.items():
    print(f"\n{name} Dataset:")
    print(f"Total Records: {len(df)}")
    print(f"Records per Day: {len(df) / (pd.to_datetime(df['Created At']).max() - pd.to_datetime(df['Created At']).min()).days:.2f}")

# Check for overlapping IDs between datasets: how many values of each
# dataset's own 'ID' column also appear in another dataset's 'ID' column.
# NOTE(review): presumably each file's 'ID' is that file's own record id
# (cancellations reference shifts through 'Shift ID' elsewhere) - confirm
# that comparing 'ID' across files is meaningful.
print("\n=== ID Overlap Analysis ===")
shifts_ids = set(shifts_df['ID'])
bookings_ids = set(bookings_df['ID'])
cancellations_ids = set(cancellations_df['ID'])

overlap_analysis = {
    'shifts_bookings': len(shifts_ids.intersection(bookings_ids)),
    'shifts_cancellations': len(shifts_ids.intersection(cancellations_ids)),
    'bookings_cancellations': len(bookings_ids.intersection(cancellations_ids))
}

summary.add_summary('cross_validation', 'id_overlaps', overlap_analysis)

print("ID Overlaps:")
print(f"Shifts-Bookings: {overlap_analysis['shifts_bookings']}")
print(f"Shifts-Cancellations: {overlap_analysis['shifts_cancellations']}")
print(f"Bookings-Cancellations: {overlap_analysis['bookings_cancellations']}")

In [None]:
# NOTE: look here.
# Best and worst facilities and HCPs.
# TODO: check this; the numbers don't seem right.
# Once checked, add the results to the summary.
import pandas as pd
import numpy as np

def analyze_hcp_patterns(shifts_df, bookings_df, cancellations_df):
    """
    Summarise how often each HCP appears in the three datasets.

    The HCP identifier is read from 'Agent ID' in the shifts data and from
    'Worker ID' in the bookings and cancellations data.

    Returns:
        dict with
        - 'hcp_counts': per-dataset value counts of the HCP identifier
        - 'unique_counts': per-dataset number of distinct HCP identifiers
        - 'overlaps': number of identifiers shared by each pair of datasets
          (missing identifiers are ignored)
    """
    id_columns = {
        'shifts': shifts_df['Agent ID'],
        'bookings': bookings_df['Worker ID'],
        'cancellations': cancellations_df['Worker ID'],
    }

    # Appearances per HCP, and number of distinct HCPs, in each dataset
    appearance_counts = {name: ids.value_counts() for name, ids in id_columns.items()}
    distinct_counts = {name: ids.nunique() for name, ids in id_columns.items()}

    # Pairwise overlap of the HCP populations
    hcp_sets = {name: set(ids.dropna()) for name, ids in id_columns.items()}
    pairwise_overlaps = {
        f"{left}_{right}": len(hcp_sets[left] & hcp_sets[right])
        for left, right in (
            ('shifts', 'bookings'),
            ('shifts', 'cancellations'),
            ('bookings', 'cancellations'),
        )
    }

    return {
        'hcp_counts': appearance_counts,
        'unique_counts': distinct_counts,
        'overlaps': pairwise_overlaps
    }

def analyze_facility_patterns(shifts_df, bookings_df, cancellations_df):
    """
    Analyze facility patterns across all datasets

    Parameters:
        shifts_df, bookings_df, cancellations_df (pd.DataFrame): Frames that
            each have a 'Facility ID' column

    Returns:
        dict with
        - 'facility_stats': DataFrame indexed by the facilities that appear
          in the shifts data, with columns 'total_shifts', 'total_bookings',
          'total_cancellations' and 'cancellation_rate'
          (cancellations / bookings)
        - 'unique_counts': number of distinct facilities per dataset

    Notes:
        - Facilities that appear only in bookings or cancellations are not
          part of 'facility_stats'
        - 'cancellation_rate' is 0 for a facility with neither bookings nor
          cancellations, and NaN (undefined) for a facility that has
          cancellations but no bookings, so that describe()/mean() are not
          turned into inf by a division by zero
    """
    # Count shifts per facility
    facility_shift_counts = shifts_df['Facility ID'].value_counts()

    # Count bookings per facility
    facility_booking_counts = bookings_df['Facility ID'].value_counts()

    # Count cancellations per facility
    facility_cancel_counts = cancellations_df['Facility ID'].value_counts()

    # Align bookings/cancellations to the facilities seen in the shifts data
    facility_stats = pd.DataFrame({
        'total_shifts': facility_shift_counts,
        'total_bookings': facility_booking_counts.reindex(facility_shift_counts.index).fillna(0),
        'total_cancellations': facility_cancel_counts.reindex(facility_shift_counts.index).fillna(0)
    })

    # Calculate cancellation rates per facility.
    # 0/0 gives NaN -> reported as 0; n/0 gives inf -> reported as NaN.
    raw_rate = facility_stats['total_cancellations'] / facility_stats['total_bookings']
    facility_stats['cancellation_rate'] = raw_rate.fillna(0).replace([np.inf, -np.inf], np.nan)

    # Get unique facility counts
    unique_facilities = {
        'shifts': shifts_df['Facility ID'].nunique(),
        'bookings': bookings_df['Facility ID'].nunique(),
        'cancellations': cancellations_df['Facility ID'].nunique()
    }

    return {
        'facility_stats': facility_stats,
        'unique_counts': unique_facilities
    }

# Run the analysis on the module-level (filtered and cleaned) frames
hcp_analysis = analyze_hcp_patterns(shifts_df, bookings_df, cancellations_df)
facility_analysis = analyze_facility_patterns(shifts_df, bookings_df, cancellations_df)

# Print summary statistics: distinct HCPs per dataset and pairwise overlaps
print("\n=== HCP Analysis ===")
print(f"Unique HCPs in shifts: {hcp_analysis['unique_counts']['shifts']}")
print(f"Unique HCPs in bookings: {hcp_analysis['unique_counts']['bookings']}")
print(f"Unique HCPs in cancellations: {hcp_analysis['unique_counts']['cancellations']}")
print("\nHCP Overlaps:")
for overlap_type, count in hcp_analysis['overlaps'].items():
    print(f"{overlap_type}: {count}")

# Distinct facilities per dataset
print("\n=== Facility Analysis ===")
print(f"Unique facilities in shifts: {facility_analysis['unique_counts']['shifts']}")
print(f"Unique facilities in bookings: {facility_analysis['unique_counts']['bookings']}")
print(f"Unique facilities in cancellations: {facility_analysis['unique_counts']['cancellations']}")

# Distribution of the per-facility counts and cancellation rate
print("\nFacility Statistics Summary:")
print(facility_analysis['facility_stats'].describe())

In [None]:
# Look here for patterns among the worst and best HCPs and facilities.
# TODO: check this and move it; the numbers don't seem right.
# Once checked, add the results to the summary.
import pandas as pd
import numpy as np
from datetime import datetime

def analyze_hcp_reliability(shifts_df, bookings_df, cancellations_df):
    """
    Analyze HCP reliability patterns, identifying best and worst performers

    Parameters:
        shifts_df (pd.DataFrame): Shifts with 'Agent ID' and 'Verified'
        bookings_df (pd.DataFrame): Accepted for signature compatibility;
            not used by the calculation
        cancellations_df (pd.DataFrame): Cancellations with 'Worker ID',
            'ID', 'Lead Time' and 'Action'

    Returns:
        pd.DataFrame: One row per HCP found in either dataset, with columns
        - total_shifts_worked: number of shifts where 'Verified' is True
        - total_cancellations: number of cancellation records
        - avg_lead_time / median_lead_time: of that HCP's cancellations
          (NaN when the HCP has none)
        - no_show_rate: share of the HCP's cancellations whose 'Action' is
          'NO_CALL_NO_SHOW'
        - cancellation_rate: cancellations / (shifts worked + cancellations)
        - reliability_score: (1 - cancellation_rate) * (1 - no_show_rate)

    Notes:
        Missing counts are filled with 0 *before* the rates are computed.
        Otherwise an HCP with cancellations but no verified shifts would have
        an undefined rate that ends up reported as cancellation_rate 0 and
        reliability_score 1 instead of 1 and 0.
    """
    # Count verified shifts per HCP
    verified_shifts = shifts_df[shifts_df['Verified'] == True].groupby('Agent ID').size()
    hcp_metrics = verified_shifts.to_frame('total_shifts_worked')

    # Calculate cancellation metrics per HCP
    cancellations_by_hcp = cancellations_df.groupby('Worker ID').agg(
        total_cancellations=('ID', 'count'),
        avg_lead_time=('Lead Time', 'mean'),
        median_lead_time=('Lead Time', 'median'),
        no_show_rate=('Action', lambda x: (x == 'NO_CALL_NO_SHOW').mean()),
    )

    # Merge metrics; the outer join keeps HCPs present in only one dataset
    hcp_metrics = hcp_metrics.join(cancellations_by_hcp, how='outer')

    # An HCP absent from one side has a count of zero there
    hcp_metrics = hcp_metrics.fillna({
        'total_shifts_worked': 0,
        'total_cancellations': 0,
        'no_show_rate': 0
    })

    # Calculate reliability score
    total_commitments = hcp_metrics['total_shifts_worked'] + hcp_metrics['total_cancellations']
    # 0/0 (no shifts and no counted cancellations) is reported as rate 0
    hcp_metrics['cancellation_rate'] = (
        hcp_metrics['total_cancellations'] / total_commitments
    ).fillna(0)
    hcp_metrics['reliability_score'] = (
        (1 - hcp_metrics['cancellation_rate']) * (1 - hcp_metrics['no_show_rate'])
    )

    return hcp_metrics

def analyze_facility_reliability(shifts_df, bookings_df, cancellations_df):
    """
    Build per-facility posting, deletion, booking and cancellation metrics.

    ``bookings_df`` is accepted but not used by the calculation.

    Returns DataFrame indexed by the facilities in the shifts data, with:
    - total_shifts_posted: count of shift 'ID' values
    - deleted_shifts: shifts whose 'Deleted' field is not null
    - verified_shifts: sum of 'Verified'
    - avg_pay_rate: mean of 'Charge'
    - deletion_rate: deleted_shifts / total_shifts_posted
    - booking_rate: verified_shifts / (total_shifts_posted - deleted_shifts)
    - total_cancellations, total_no_shows: from the cancellations data
      (0 for facilities without cancellations)
    - stability_score: (1 - deletion_rate) * booking_rate
      * (1 - total_cancellations / total_shifts_posted)
    """
    # Per-facility shift aggregates
    per_facility = shifts_df.groupby('Facility ID').agg({
        'ID': 'count',  # Total shifts posted
        'Deleted': lambda x: x.notna().sum(),  # Deleted shifts
        'Verified': 'sum',  # Worked shifts
        'Charge': 'mean'  # Average pay rate
    })

    facility_metrics = pd.DataFrame({
        'total_shifts_posted': per_facility['ID'],
        'deleted_shifts': per_facility['Deleted'],
        'verified_shifts': per_facility['Verified'],
        'avg_pay_rate': per_facility['Charge'],
    })

    posted = facility_metrics['total_shifts_posted']
    deleted = facility_metrics['deleted_shifts']

    # Share of posted shifts that were deleted
    facility_metrics['deletion_rate'] = deleted / posted

    # Booking success among the shifts that were not deleted
    facility_metrics['booking_rate'] = facility_metrics['verified_shifts'] / (posted - deleted)

    # Cancellation exposure per facility, aligned to the shift facilities
    cancellation_exposure = cancellations_df.groupby('Facility ID').agg({
        'ID': 'count',  # Total cancellations
        'Action': lambda x: (x == 'NO_CALL_NO_SHOW').sum()  # No-shows
    })
    facility_metrics['total_cancellations'] = cancellation_exposure['ID']
    facility_metrics['total_no_shows'] = cancellation_exposure['Action']

    # Facilities with no cancellation records get zero counts
    facility_metrics = facility_metrics.fillna({
        'total_cancellations': 0,
        'total_no_shows': 0
    })

    # Overall stability score: product of the three "good outcome" shares
    kept_share = 1 - facility_metrics['deletion_rate']
    uncancelled_share = 1 - facility_metrics['total_cancellations'] / facility_metrics['total_shifts_posted']
    facility_metrics['stability_score'] = kept_share * facility_metrics['booking_rate'] * uncancelled_share

    return facility_metrics

# Run analyses
hcp_performance = analyze_hcp_reliability(shifts_df, bookings_df, cancellations_df)
facility_performance = analyze_facility_reliability(shifts_df, bookings_df, cancellations_df)

# Print summary of top and bottom performers. Minimum-volume filters keep
# rows with very few shifts from dominating either end of the ranking.
print("\n=== Top Performing HCPs (Minimum 10 shifts) ===")
reliable_hcps = hcp_performance[hcp_performance['total_shifts_worked'] >= 10]
print(reliable_hcps.nlargest(5, 'reliability_score'))

print("\n=== Struggling HCPs (Minimum 10 shifts) ===")
print(reliable_hcps.nsmallest(5, 'reliability_score'))

print("\n=== Most Stable Facilities (Minimum 20 shifts) ===")
active_facilities = facility_performance[facility_performance['total_shifts_posted'] >= 20]
print(active_facilities.nlargest(5, 'stability_score'))

print("\n=== Less Stable Facilities (Minimum 20 shifts) ===")
print(active_facilities.nsmallest(5, 'stability_score'))

# Calculate key statistics for potential interventions
print("\n=== Key Marketplace Statistics ===")
print(f"Overall marketplace cancellation rate: {(cancellations_df['ID'].count() / bookings_df['ID'].count()):.2%}")
print(f"Overall shift deletion rate: {(shifts_df['Deleted'].notna().sum() / len(shifts_df)):.2%}")
print(f"No-show percentage of cancellations: {(cancellations_df['Action'] == 'NO_CALL_NO_SHOW').mean():.2%}")

# Additional pattern analysis: split HCPs into four volume bands.
# pd.qcut raises ValueError ("Bin edges must be unique") when so many HCPs
# share the same shift count (e.g. zero) that two quartile edges coincide.
# In that case fall back to quartiles of the rank, which always has distinct
# values; ties are then split between neighbouring bands in row order.
volume_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
try:
    hcp_performance['shift_volume_category'] = pd.qcut(
        hcp_performance['total_shifts_worked'],
        q=4,
        labels=volume_labels
    )
except ValueError:
    hcp_performance['shift_volume_category'] = pd.qcut(
        hcp_performance['total_shifts_worked'].rank(method='first'),
        q=4,
        labels=volume_labels
    )

# observed=False lists every volume band, including any that ended up empty
volume_reliability = hcp_performance.groupby('shift_volume_category', observed=False)['reliability_score'].mean()
print("\n=== Reliability by HCP Volume ===")
print(volume_reliability)

# Data Dive (formerly Data Quality Analysis) 

## Cancellations and Lead time analysis 

In [None]:
# NOTE: the date ranges of all three datasets are equal now.
#
# 41k unique shifts, 127k unique bookings, 78k unique cancels -> 41k + 78k ~ 120k seems reasonable, but
# does this mean each shift is booked about 3 times and cancelled about twice?

# === Lead Time Analysis ===
print("Analyzing lead times and cancellation patterns...")

# Clean lead times and get quality stats (rows with null or non-finite
# 'Lead Time' are dropped from clean_cancellations)
clean_cancellations, quality_stats = clean_lead_times(cancellations_df)

# Basic cancellation metrics: shifts whose 'ID' appears as a 'Shift ID' in
# the cancellation log
shifts_with_cancellations = len(set(shifts_df['ID']) & set(cancellations_df['Shift ID']))
print(f"Shifts with cancellations: {shifts_with_cancellations}")
# TODO: this may be incorrect due to date overlap
print(f"Percentage of shifts cancelled: {(shifts_with_cancellations/len(shifts_df))*100:.2f}%")

# --- start: same metric restricted to the shifts/cancellations overlap ---
print(f"Overlap only here")
# Find overlapping date range (on 'Created At') of shifts and cancellations
shifts_start = shifts_df['Created At'].min()
shifts_end = shifts_df['Created At'].max()
cancellations_start = cancellations_df['Created At'].min()
cancellations_end = cancellations_df['Created At'].max()

# Get the overlapping range. This rebinds the module-level overlap_start /
# overlap_end that were computed earlier across all three datasets.
overlap_start = max(shifts_start, cancellations_start)
overlap_end = min(shifts_end, cancellations_end)

# Filter both datasets to overlapping period (both bounds inclusive)
shifts_in_range = shifts_df[
    (shifts_df['Created At'] >= overlap_start) & 
    (shifts_df['Created At'] <= overlap_end)
]

cancellations_in_range = cancellations_df[
    (cancellations_df['Created At'] >= overlap_start) & 
    (cancellations_df['Created At'] <= overlap_end)
]

# Calculate metrics using filtered data (overwrites the unfiltered count above)
shifts_with_cancellations = len(set(shifts_in_range['ID']) & set(cancellations_in_range['Shift ID']))
total_shifts_in_range = len(shifts_in_range)

cancellation_percentage = (shifts_with_cancellations / total_shifts_in_range) * 100

# .date() requires the overlap bounds to be datetime-like, i.e. 'Created At'
# must already have been parsed
print(f"Date range analyzed: {overlap_start.date()} to {overlap_end.date()}")
print(f"Shifts with cancellations: {shifts_with_cancellations}")
print(f"Total shifts in range: {total_shifts_in_range}")
print(f"Percentage of shifts cancelled: {cancellation_percentage:.2f}%")
# --- end of overlap-restricted metric ---


print("\n=== Data Quality Statistics ===")
print(quality_stats)

# Statistics over the raw (uncleaned) lead times
print("\n=== Lead Time Distribution ===")
print("Overall Lead Time Statistics:")
print(cancellations_df['Lead Time'].describe().round(2))

# Category counts, sorted by category label
print("\n=== Cancellation Categories ===")
print(clean_cancellations['cancellation_category'].value_counts().sort_index())

print("\n=== Extreme Values Analysis ===")
print(f"Very Late Cancellations (>3 days after): {clean_cancellations['is_extreme_negative'].sum()}")
print(f"Very Early Cancellations (>41 days before): {clean_cancellations['is_extreme_positive'].sum()}")

# Distribution of lead times for extreme cases (printed only when any exist)
if clean_cancellations['is_extreme_negative'].any():
    print("\nVery Late Cancellation Stats:")
    print(clean_cancellations[clean_cancellations['is_extreme_negative']]['Lead Time'].describe())

if clean_cancellations['is_extreme_positive'].any():
    print("\nVery Early Cancellation Stats:")
    print(clean_cancellations[clean_cancellations['is_extreme_positive']]['Lead Time'].describe())

# Store results in summary
summary.add_summary('cancellations', 'quality_stats', quality_stats.to_dict())
summary.add_summary('cancellations', 'extreme_values', {
    'very_late': clean_cancellations['is_extreme_negative'].sum(),
    'very_early': clean_cancellations['is_extreme_positive'].sum()
})

In [None]:
# move this one too
def analyze_temporal_patterns(cancellations_df, shifts_df):
    """
    Summarise when cancellations are created and how much notice they give.

    Note: this function modifies ``cancellations_df`` in place. It converts
    'Created At' to datetime and 'Lead Time' to numeric, and adds the columns
    'hour' and 'day_of_week'. ``shifts_df`` is accepted but not used.

    Returns:
        dict with
        - 'hourly_patterns': percentage of all cancellations created in each
          hour of the day
        - 'daily_patterns': number of cancellations per day name
        - 'lead_time_stats': mean and median lead time, plus the count and
          percentage of cancellations with a lead time below 4
    """
    # Derive the time-based features directly on the caller's frame
    created_at = pd.to_datetime(cancellations_df['Created At'])
    cancellations_df['Created At'] = created_at
    cancellations_df['hour'] = cancellations_df['Created At'].dt.hour
    cancellations_df['day_of_week'] = cancellations_df['Created At'].dt.day_name()
    cancellations_df['Lead Time'] = pd.to_numeric(cancellations_df['Lead Time'])

    total_cancellations = len(cancellations_df)

    # Share of cancellations by hour of day
    per_hour = cancellations_df.groupby('hour').size()
    hourly_share = per_hour / total_cancellations * 100

    # Cancellation counts by day of week
    per_day = cancellations_df.groupby('day_of_week').size()

    # Lead time distribution; "late" means less than 4 hours of lead time
    lead_time = cancellations_df['Lead Time']
    is_late = lead_time < 4
    lead_time_stats = {
        'mean_lead_time': lead_time.mean(),
        'median_lead_time': lead_time.median(),
        'late_cancels': is_late.sum(),
        'late_cancel_rate': is_late.mean() * 100
    }

    return {
        'hourly_patterns': hourly_share,
        'daily_patterns': per_day,
        'lead_time_stats': lead_time_stats
    }

def analyze_worker_patterns(cancellations_df):
    """
    Summarise how cancellations are distributed across workers.

    Returns:
        dict with
        - 'cancellation_frequency': number of workers with exactly one
          cancellation, with more than one, with more than five, and the
          largest cancellation count of any single worker
        - 'ncns_unique_workers': number of distinct workers having at least
          one 'NO_CALL_NO_SHOW' action
    """
    # Number of cancellation records per worker
    per_worker = cancellations_df.groupby('Worker ID').size()

    frequency = {
        'single_cancel': (per_worker == 1).sum(),
        'repeat_cancels': (per_worker > 1).sum(),
        'high_frequency': (per_worker > 5).sum(),
        'max_cancels': per_worker.max()
    }

    # Distinct workers with a no-call-no-show
    is_ncns = cancellations_df['Action'] == 'NO_CALL_NO_SHOW'
    ncns_worker_count = cancellations_df[is_ncns]['Worker ID'].nunique()

    return {
        'cancellation_frequency': frequency,
        'ncns_unique_workers': ncns_worker_count
    }

def analyze_shift_characteristics(shifts_df, cancellations_df):
    """
    Compare shifts that were cancelled at least once with those that were not.

    A shift counts as cancelled when its 'ID' appears in the 'Shift ID'
    column of ``cancellations_df``.

    Returns:
        dict with
        - 'pay_patterns': mean 'Charge' of each group and their difference
          (cancelled minus non-cancelled)
        - 'duration_patterns': mean 'Time' of each group
        - 'shift_type_patterns': DataFrame of the normalised 'Shift Type'
          distribution within each group
    """
    # Split the shifts into the two groups with a single membership test
    was_cancelled = shifts_df['ID'].isin(cancellations_df['Shift ID'].unique())
    cancelled = shifts_df[was_cancelled]
    not_cancelled = shifts_df[~was_cancelled]

    # Pay rates
    cancelled_pay = cancelled['Charge'].mean()
    not_cancelled_pay = not_cancelled['Charge'].mean()
    pay_comparison = {
        'cancelled_avg_pay': cancelled_pay,
        'non_cancelled_avg_pay': not_cancelled_pay,
        'pay_difference': cancelled_pay - not_cancelled_pay
    }

    # Shift duration
    duration_comparison = {
        'cancelled_avg_duration': cancelled['Time'].mean(),
        'non_cancelled_avg_duration': not_cancelled['Time'].mean()
    }

    # Shift type mix within each group
    shift_type_rates = pd.DataFrame({
        'cancelled': cancelled['Shift Type'].value_counts(normalize=True),
        'non_cancelled': not_cancelled['Shift Type'].value_counts(normalize=True)
    })

    return {
        'pay_patterns': pay_comparison,
        'duration_patterns': duration_comparison,
        'shift_type_patterns': shift_type_rates
    }

def analyze_facility_impact(shifts_df, cancellations_df):
    """
    Summarize how cancellation rates are spread across facilities.

    The rate per facility is cancellation records divided by shift records,
    times 100. A facility present in only one of the two frames gets 0.

    Parameters:
        shifts_df (pd.DataFrame): needs 'Facility ID'
        cancellations_df (pd.DataFrame): needs 'Facility ID'

    Returns:
        dict: 'facility_cancel_rates' (describe() of the per-facility rates)
        and 'high_impact_count' (facilities whose rate is above the median)
    """
    cancels_per_facility = cancellations_df.groupby('Facility ID').size()
    shifts_per_facility = shifts_df.groupby('Facility ID').size()

    # Division aligns on facility; facilities missing on either side become
    # NaN and are then reported as 0
    rates = cancels_per_facility.div(shifts_per_facility).mul(100).fillna(0)

    # "High impact" here means strictly above the median rate
    above_median = rates > rates.median()

    return {
        'facility_cancel_rates': rates.describe(),
        'high_impact_count': int(above_median.sum()),
    }

# Run all analyses
# analyze_temporal_patterns and analyze_worker_patterns are defined earlier in the notebook.
# NOTE: `facility_impact` is reassigned by the economic impact cell further down.
temporal_patterns = analyze_temporal_patterns(cancellations_df, shifts_df)
worker_patterns = analyze_worker_patterns(cancellations_df)
shift_patterns = analyze_shift_characteristics(shifts_df, cancellations_df)
facility_impact = analyze_facility_impact(shifts_df, cancellations_df)

# Visualization of key patterns
# One 2x2 figure; each plt.subplot call below selects the panel that the
# following plotting calls draw into, so statement order matters.
plt.figure(figsize=(20, 15))

# Plot 1: Hourly cancellation distribution
# presumably 'hourly_patterns' holds a percentage per hour (see the y label) - confirm in analyze_temporal_patterns
plt.subplot(2, 2, 1)
temporal_patterns['hourly_patterns'].plot(kind='bar')
plt.title('Cancellations by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Percentage of Cancellations')

# Plot 2: Lead time distribution
# NOTE(review): plt.hist is given the raw column; confirm it holds no NaN/inf values
plt.subplot(2, 2, 2)
plt.hist(cancellations_df['Lead Time'], bins=50)
plt.title('Distribution of Cancellation Lead Times')
plt.xlabel('Lead Time (hours)')
plt.ylabel('Count')

# Plot 3: Worker cancellation frequency
# worker_cancel_counts: number of cancellation records per 'Worker ID'
plt.subplot(2, 2, 3)
worker_cancel_counts = cancellations_df.groupby('Worker ID').size()
plt.hist(worker_cancel_counts, bins=30)
plt.title('Distribution of Cancellations per Worker')
plt.xlabel('Number of Cancellations')
plt.ylabel('Number of Workers')

# Plot 4: Pay rate comparison
# Shifts are split by whether their 'ID' appears among the cancellation 'Shift ID' values.
# The second box is every shift without a cancellation record, labelled 'Completed Shifts'.
plt.subplot(2, 2, 4)
cancelled_shifts = shifts_df[shifts_df['ID'].isin(cancellations_df['Shift ID'])]
non_cancelled_shifts = shifts_df[~shifts_df['ID'].isin(cancellations_df['Shift ID'])]
plt.boxplot([cancelled_shifts['Charge'], non_cancelled_shifts['Charge']], 
            labels=['Cancelled Shifts', 'Completed Shifts'])
plt.title('Pay Rate Distribution: Cancelled vs Completed Shifts')
plt.ylabel('Charge Rate')

plt.tight_layout()

In [None]:
# Display the per-worker cancellation counts (indexed by 'Worker ID') built for Plot 3 above
worker_cancel_counts

In [None]:
# move this to later 
def analyze_cancellation_patterns(shifts_df, cancellations_df, bookings_df):
    """
    Summarize cancellation behaviour across shifts, cancellations and bookings.

    Parameters:
        shifts_df (pd.DataFrame): needs 'ID', 'Created At', 'Start', 'End', 'Charge'
        cancellations_df (pd.DataFrame): needs 'Shift ID', 'Worker ID',
            'Created At', 'Lead Time', 'Action'
        bookings_df (pd.DataFrame): needs 'Shift ID', 'Worker ID', 'Created At'

    Returns:
        dict: total_shifts, cancellation_rate, late_cancellation_rate,
        ncns_rate, repeat_canceller_rate (all rates in percent),
        avg_lead_time, median_lead_time, avg_charge_diff and
        avg_booking_to_cancel (hours)

    Notes:
        - The input frames are modified in place: the date columns are
          converted to datetime, 'Lead Time' to numeric, and an 'Hour'
          column is added to cancellations_df. This is kept because the
          plotting cell that follows reads the converted columns.
        - A rate whose denominator is zero is reported as 0 instead of
          raising ZeroDivisionError.
        - 'cancellation_rate' divides cancellation records (not distinct
          shifts) by the number of shifts.
        - A function with the same name but a two-argument signature is
          defined later in the notebook and replaces this one once run.
    """
    def _percent(numerator, denominator):
        # Same zero-denominator convention as the `... if len(x) > 0 else 0`
        # guards used by analyze_missing_data_impact
        return numerator / denominator * 100 if denominator else 0

    # Convert timestamps to datetime
    for df in [shifts_df, cancellations_df, bookings_df]:
        df['Created At'] = pd.to_datetime(df['Created At'])

    shifts_df['Start'] = pd.to_datetime(shifts_df['Start'])
    shifts_df['End'] = pd.to_datetime(shifts_df['End'])

    # Calculate key metrics
    total_shifts = len(shifts_df)
    cancelled_shifts = len(cancellations_df)

    # Analyze cancellation lead times; "late" means a lead time below 4
    # (presumably hours - the plots label this column in hours)
    cancellations_df['Lead Time'] = pd.to_numeric(cancellations_df['Lead Time'])
    late_cancellations = cancellations_df[cancellations_df['Lead Time'] < 4]
    ncns = cancellations_df[cancellations_df['Action'] == 'NO_CALL_NO_SHOW']

    # Workers with more than one cancellation record
    worker_cancellation_counts = cancellations_df.groupby('Worker ID').size()
    repeat_cancellers = (worker_cancellation_counts > 1).sum()

    # Hour of day at which the cancellation was logged (kept as a column)
    cancellations_df['Hour'] = cancellations_df['Created At'].dt.hour

    # Charge of shifts with at least one cancellation versus all shifts
    cancelled_shift_ids = cancellations_df['Shift ID'].unique()
    cancelled_shifts_data = shifts_df[shifts_df['ID'].isin(cancelled_shift_ids)]
    avg_charge_cancelled = cancelled_shifts_data['Charge'].mean()
    avg_charge_all = shifts_df['Charge'].mean()

    # Time between booking and cancellation for matching shift/worker pairs
    merged_data = pd.merge(
        cancellations_df,
        bookings_df,
        on=['Shift ID', 'Worker ID'],
        suffixes=('_cancel', '_book')
    )
    merged_data['booking_to_cancel_hours'] = (
        merged_data['Created At_cancel'] - merged_data['Created At_book']
    ).dt.total_seconds() / 3600  # number of seconds in an hour

    # Generate summary statistics
    summary_stats = {
        'total_shifts': total_shifts,
        'cancellation_rate': _percent(cancelled_shifts, total_shifts),
        'late_cancellation_rate': _percent(len(late_cancellations), cancelled_shifts),
        'ncns_rate': _percent(len(ncns), cancelled_shifts),
        'repeat_canceller_rate': _percent(repeat_cancellers, len(worker_cancellation_counts)),
        'avg_lead_time': cancellations_df['Lead Time'].mean(),
        'median_lead_time': cancellations_df['Lead Time'].median(),
        'avg_charge_diff': avg_charge_cancelled - avg_charge_all,
        'avg_booking_to_cancel': merged_data['booking_to_cancel_hours'].mean()
    }

    return summary_stats

# Function to visualize patterns
def plot_cancellation_patterns(cancellations_df):
    """
    Draw a 2x2 overview of when and how early cancellations happen.

    Panels: cancellations by hour of day, lead time histogram,
    cancellations by day of week, and the cumulative distribution of
    lead times.

    Parameters:
        cancellations_df (pd.DataFrame): needs a datetime 'Created At'
            column and a 'Lead Time' column

    Returns:
        The matplotlib.pyplot module, with the new figure as current figure

    Notes:
        - Adds 'Hour' and 'Day' columns to cancellations_df in place.
    """
    plt.figure(figsize=(15, 10))

    # Plot 1: Cancellations by hour
    plt.subplot(2, 2, 1)
    cancellations_df['Hour'] = cancellations_df['Created At'].dt.hour
    sns.histplot(data=cancellations_df, x='Hour', bins=24)
    plt.title('Cancellations by Hour of Day')

    # Plot 2: Lead time distribution
    plt.subplot(2, 2, 2)
    sns.histplot(data=cancellations_df, x='Lead Time', bins=50)
    plt.title('Distribution of Cancellation Lead Times')

    # Plot 3: Cancellations by day of week
    plt.subplot(2, 2, 3)
    cancellations_df['Day'] = cancellations_df['Created At'].dt.day_name()
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    sns.countplot(data=cancellations_df, x='Day', order=day_order)
    plt.xticks(rotation=45)
    plt.title('Cancellations by Day of Week')

    # Plot 4: Cumulative distribution of lead times
    # Plotting the sorted Series directly would use the original row labels
    # as x positions, which is not a CDF. Instead plot each sorted lead time
    # against the share of cancellations at or below it.
    plt.subplot(2, 2, 4)
    lead_times = pd.to_numeric(cancellations_df['Lead Time']).dropna().sort_values()
    cumulative_share = np.arange(1, len(lead_times) + 1) / len(lead_times)
    plt.plot(lead_times.to_numpy(), cumulative_share)
    plt.xlabel('Lead Time')
    plt.ylabel('Cumulative share of cancellations')
    plt.title('Cumulative Distribution of Lead Times')

    plt.tight_layout()
    return plt

# Run analysis
# Builds the summary dict from all three frames, then draws the 2x2 overview figure.
# NOTE(review): analyze_cancellation_patterns is redefined with a two-argument
# signature in the next cell, so this three-argument call only works before that cell runs.
# `plots` holds the matplotlib.pyplot module returned by plot_cancellation_patterns.
summary_stats = analyze_cancellation_patterns(shifts_df, cancellations_df, bookings_df)
plots = plot_cancellation_patterns(cancellations_df)

In [None]:
# === Comprehensive Cancellation Analysis ===
print("=== Cancellation Type Analysis ===")
# Basic cancellation types
# Count of cancellation records per 'Action' value, taken from the raw cancellations_df.
# `cancellation_types` is also read as a global by analyze_cancellation_patterns below.
cancellation_types = cancellations_df['Action'].value_counts()
print("\nCancellation Action Types:")
print(cancellation_types)

# Detailed Pattern Analysis
def analyze_cancellation_patterns(clean_cancellations, shifts_df):
    """
    Break cancellations down by role and by shift type.

    Each cancellation is left-joined to its shift ('Shift ID' -> 'ID') to
    pick up 'Agent Req', 'Shift Type' and 'Charge'. The function prints:
    - the share of each 'cancellation_category' per role, in percent
    - the same shares per shift type
    - per-role cancellation count, lead time mean/std and NCNS rate

    The tables are also stored under 'cancellations' in the global
    `summary`, together with the global `cancellation_types` counts.

    Returns:
        pd.DataFrame: the cancellations joined with their shift attributes
    """
    shift_attributes = shifts_df[['ID', 'Agent Req', 'Shift Type', 'Charge']]
    cancellations_with_shifts = clean_cancellations.merge(
        shift_attributes,
        how='left',
        left_on='Shift ID',
        right_on='ID'
    )

    def _category_share(dimension):
        # Row-normalized crosstab: each row sums to 100 (percent)
        shares = pd.crosstab(
            cancellations_with_shifts[dimension],
            cancellations_with_shifts['cancellation_category'],
            normalize='index'
        )
        return shares.round(3) * 100

    print("\n=== Cancellations by Role ===")
    role_cancels = _category_share('Agent Req')
    print(role_cancels)

    print("\n=== Cancellations by Shift Type ===")
    shift_cancels = _category_share('Shift Type')
    print(shift_cancels)

    # Role-based impact analysis
    per_role = cancellations_with_shifts.groupby('Agent Req')
    role_impact = per_role.agg({
        'Shift ID': 'count',
        'Lead Time': ['mean', 'std'],
        'Action': lambda x: (x == 'NO_CALL_NO_SHOW').mean() * 100
    }).round(2)
    role_impact.columns = ['Total Cancellations', 'Avg Lead Time', 'Lead Time Std', 'NCNS Rate']
    print("\n=== Role-Based Impact ===")
    print(role_impact)

    # Store results
    stored_results = [
        ('action_types', cancellation_types),
        ('role_patterns', role_cancels),
        ('shift_patterns', shift_cancels),
        ('role_impact', role_impact),
    ]
    for result_key, table in stored_results:
        summary.add_summary('cancellations', result_key, table.to_dict())

    return cancellations_with_shifts

# Run the analysis
# `clean_cancellations` is prepared earlier in the notebook; it must carry a 'cancellation_category' column.
# The joined frame is reused by the economic impact cell.
cancellations_with_shifts = analyze_cancellation_patterns(clean_cancellations, shifts_df)

## Shifts Data Dive 

In [None]:
# === Shift Data Numerical Analysis ===
# This cell prints descriptive statistics for shifts_df and stores them in the
# global `summary`. It also adds 'Hour', 'Day', 'Month' and 'Shift_Length'
# columns to shifts_df in place.
print("\n=== Numerical Analysis: Basic statistics for numerical columns ===")
# Basic statistics for numerical columns
numeric_stats = shifts_df[['Charge', 'Time']].describe()
print("\nNumerical Statistics:")
print(numeric_stats)

# Additional numeric insights
print("\nCharge Rate Analysis:")
print(f"Shifts with zero charge: {(shifts_df['Charge'] == 0).sum()}")
print(f"Average charge by agent type:")
print(shifts_df.groupby('Agent Req')['Charge'].mean().round(2))

# === Categorical Analysis ===
print("\n=== Categorical Analysis ===")
# Shift type distribution
# Missing values are excluded from both distributions (dropna=True)
print("\nShift Type Distribution:")
shift_type_dist = shifts_df['Shift Type'].value_counts(dropna=True)
print(shift_type_dist)

# Agent requirements
print("\nAgent Requirement Distribution:")
agent_req_dist = shifts_df['Agent Req'].value_counts(dropna=True)
print(agent_req_dist)

# Cross-tabulation of shift types and agent requirements
print("\nShift Types by Agent Requirements:")
print(pd.crosstab(shifts_df['Shift Type'], shifts_df['Agent Req']))

# === Data Completeness Analysis ===
# A row is "complete" when no column in it is missing
print("\n=== Data Completeness Analysis ===")
complete_rows = shifts_df.dropna().shape[0]
print(f"Complete rows: {complete_rows} out of {shifts_df.shape[0]}")
print(f"Completion rate: {(complete_rows/shifts_df.shape[0]*100):.2f}%")

# === Time-Based Analysis ===
print("\n=== Time-Based Analysis ===")
# Extract time components
# 'Shift_Length' is End minus Start in hours; it is computed separately from the 'Time' column
shifts_df['Hour'] = shifts_df['Start'].dt.hour
shifts_df['Day'] = shifts_df['Start'].dt.day_name()
shifts_df['Month'] = shifts_df['Start'].dt.month
shifts_df['Shift_Length'] = (shifts_df['End'] - shifts_df['Start']).dt.total_seconds() / 3600
# Time patterns
print("\nShifts by Hour:")
hour_dist = shifts_df['Hour'].value_counts().sort_index()
print(hour_dist)

# Days are listed by descending count, not in calendar order
print("\nShifts by Day of Week:")
day_dist = shifts_df['Day'].value_counts()
print(day_dist)

print("\nShift Length Distribution:")
print(shifts_df['Shift_Length'].describe().round(2))

# === Facility Analysis ===
# Per facility: number of shifts, mean 'Charge' and mean 'Time'
print("\n=== Facility Analysis ===")
facility_stats = shifts_df.groupby('Facility ID').agg({
    'ID': 'count',
    'Charge': 'mean',
    'Time': 'mean'
}).rename(columns={
    'ID': 'Number of Shifts',
    'Charge': 'Average Charge',
    'Time': 'Average Shift Length'
})
print("\nFacility Statistics:")
print(facility_stats.head())
print(f"\nTotal unique facilities: {shifts_df['Facility ID'].nunique()}")

# Store all results
# NOTE(review): numeric_stats is stored as a DataFrame while the others are stored as dicts - confirm this is intended
summary.add_summary('shifts', 'numeric_stats', numeric_stats)
summary.add_summary('shifts', 'shift_types', shift_type_dist.to_dict())
summary.add_summary('shifts', 'agent_types', agent_req_dist.to_dict())
summary.add_summary('shifts', 'hour_distribution', hour_dist.to_dict())
summary.add_summary('shifts', 'day_distribution', day_dist.to_dict())
summary.add_summary('shifts', 'facility_stats', facility_stats.to_dict())

# TODO: add matplotlib/seaborn visualizations of the distributions above

In [None]:
# Relationship analysis
# First, let's see how many shifts had cancellations
# Counts distinct shift IDs present in both frames, so a shift with several
# cancellation records is counted once.
shifts_with_cancellations = len(set(shifts_df['ID']) & set(cancellations_df['Shift ID']))
print(f"Shifts with cancellations: {shifts_with_cancellations}")
print(f"Percentage of shifts cancelled: {(shifts_with_cancellations/len(shifts_df))*100:.2f}%")

# Analyze cancellation lead times
# Last expression of the cell, so the notebook displays the describe() table
cancellations_df['Lead Time'].describe()

## Booking Data Dive 

In [None]:
# === Booking Pattern Analysis ===
def analyze_booking_patterns(bookings_df, shifts_df, clean_cancellations):
    """
    Report how quickly shifts get booked and how often they are cancelled.

    Prints time-to-fill statistics, per-role booking figures, counts of
    repeatedly cancelled shifts and booking counts by hour and weekday.
    The headline figures are stored under 'bookings' in the global `summary`.

    Parameters:
        bookings_df (pd.DataFrame): Booking logs data
        shifts_df (pd.DataFrame): Shifts data
        clean_cancellations (pd.DataFrame): Cleaned cancellations data

    Returns:
        pd.DataFrame: bookings left-joined with their shift attributes, plus
        'time_to_fill' (hours), 'booking_hour' and 'booking_day' columns
    """
    print("=== Booking Success Analysis ===")

    # Attach shift attributes; both frames carry 'Created At', so the
    # merge suffixes them as '_booking' and '_shift'
    shift_attributes = shifts_df[['ID', 'Created At', 'Agent Req', 'Shift Type', 'Charge']]
    bookings_with_shifts = bookings_df.merge(
        shift_attributes,
        how='left',
        left_on='Shift ID',
        right_on='ID',
        suffixes=('_booking', '_shift')
    )

    # Parse the booking timestamp once and reuse it for every derived column
    booked_at = pd.to_datetime(bookings_with_shifts['Created At_booking'])
    posted_at = pd.to_datetime(bookings_with_shifts['Created At_shift'])

    # Hours from shift creation to booking
    bookings_with_shifts['time_to_fill'] = (booked_at - posted_at).dt.total_seconds() / 3600

    print("\nTime to Fill Shift from listing Statistics (hours):")
    print(bookings_with_shifts['time_to_fill'].describe().round(2))

    # Bookings per role with mean time to fill and mean charge
    print("\n=== Bookings by Role ===")
    role_bookings = bookings_with_shifts.groupby('Agent Req').agg({
        'Shift ID': 'count',
        'time_to_fill': 'mean',
        'Charge': 'mean'
    }).round(2)
    role_bookings.columns = ['Number of Bookings', 'Avg Time to Fill', 'Avg Charge']
    print(role_bookings)

    # Cancellation records per shift; more than one means the shift was
    # cancelled again after the first cancellation
    rebooked_cancellations = clean_cancellations['Shift ID'].value_counts()
    multiple_cancellations = (rebooked_cancellations > 1).sum()
    max_cancellations = rebooked_cancellations.max()

    print("\n=== Rebooking Analysis ===")
    print(f"Shifts cancelled multiple times: {multiple_cancellations}")
    print(f"Maximum cancellations for a single shift: {max_cancellations}")

    # When bookings are made: hour of day and weekday name
    print("\n=== Booking Time Patterns ===")
    bookings_with_shifts['booking_hour'] = booked_at.dt.hour
    bookings_with_shifts['booking_day'] = booked_at.dt.day_name()

    print("\nBookings by Hour of Day:")
    print(bookings_with_shifts['booking_hour'].value_counts().sort_index())

    print("\nBookings by Day of Week:")
    print(bookings_with_shifts['booking_day'].value_counts())

    # Store results
    summary.add_summary('bookings', 'time_to_fill',
                        bookings_with_shifts['time_to_fill'].describe().to_dict())
    summary.add_summary('bookings', 'role_patterns', role_bookings.to_dict())
    summary.add_summary('bookings', 'rebooking_stats', {
        'multiple_cancellations': multiple_cancellations,
        'max_cancellations': max_cancellations
    })

    return bookings_with_shifts

# Run the analysis
# Prints the booking report and keeps the joined bookings/shifts frame for later cells
bookings_with_shifts = analyze_booking_patterns(bookings_df, shifts_df, clean_cancellations)    

## Economic Impact Analysis 

In [None]:
# === Economic Impact Analysis ===
def _summarize_group_impact(cancellations, group_col):
    """Return per-group cancellation count, charge, hours and lost revenue."""
    impact = cancellations.groupby(group_col).agg({
        'Shift ID': 'count',
        'Charge': ['mean', 'sum'],
        'Time': 'sum'
    }).round(2)
    impact.columns = ['Cancellations', 'Avg Rate', 'Total Charge', 'Total Hours']

    # Sum Charge x Time per cancellation so the group figures add up to the
    # overall lost revenue. Mean rate x total hours does not, and it also
    # multiplies an already-rounded rate.
    lost_revenue = cancellations.assign(
        _lost_revenue=cancellations['Charge'] * cancellations['Time']
    ).groupby(group_col)['_lost_revenue'].sum()
    impact['Est. Revenue Loss'] = lost_revenue.round(2)
    return impact


def analyze_economic_impact(shifts_df, cancellations_with_shifts):
    """
    Analyze economic impact of cancellations, including:
    - Revenue loss from cancellations
    - Impact by facility and role type
    - Impact by cancellation category

    Revenue is 'Charge' x 'Time'. Every cancellation record contributes its
    shift's revenue, so a shift cancelled more than once is counted more
    than once.

    Parameters:
        shifts_df (pd.DataFrame): needs 'ID', 'Charge', 'Time'
        cancellations_with_shifts (pd.DataFrame): cancellations joined with
            shift attributes; needs 'Shift ID', 'Charge', 'Agent Req',
            'cancellation_category', 'Facility ID'. 'Time' is merged in
            from shifts_df when missing.

    Returns:
        tuple: (role_impact, type_impact, facility_impact) DataFrames with
        columns 'Cancellations', 'Avg Rate', 'Total Charge', 'Total Hours'
        and 'Est. Revenue Loss'

    Notes:
        - Prints the report and stores the overall, role and category
          results under 'economic' in the global `summary`.
    """
    print("=== Economic Impact Analysis ===")

    # First, ensure we have all needed columns by merging with shifts data if needed.
    # Columns already present keep their name; the shift copies get a '_shift' suffix.
    if 'Time' not in cancellations_with_shifts.columns:
        cancellations_with_shifts = pd.merge(
            cancellations_with_shifts,
            shifts_df[['ID', 'Time', 'Charge']],
            left_on='Shift ID',
            right_on='ID',
            how='left',
            suffixes=('', '_shift')
        )

    # Calculate baseline metrics
    total_revenue = (shifts_df['Charge'] * shifts_df['Time']).sum()
    avg_hourly_revenue = shifts_df['Charge'].mean()

    # Analyze cancelled shifts
    cancelled_revenue = (cancellations_with_shifts['Charge'] *
                         cancellations_with_shifts['Time']).sum()

    print("\nBaseline Metrics:")
    print(f"Total Potential Revenue: ${total_revenue:,.2f}")
    print(f"Average Hourly Rate: ${avg_hourly_revenue:.2f}")
    print(f"Lost Revenue from Cancellations: ${cancelled_revenue:,.2f}")
    if total_revenue > 0:  # Avoid division by zero
        print(f"Percentage of Revenue Lost: {(cancelled_revenue/total_revenue)*100:.2f}%")

    # Analysis by role type
    print("\n=== Impact by Role Type ===")
    role_impact = _summarize_group_impact(cancellations_with_shifts, 'Agent Req')
    print(role_impact.sort_values('Est. Revenue Loss', ascending=False))

    # Analysis by cancellation type
    print("\n=== Impact by Cancellation Type ===")
    type_impact = _summarize_group_impact(cancellations_with_shifts, 'cancellation_category')
    print(type_impact.sort_values('Est. Revenue Loss', ascending=False))

    # Calculate impact by facility
    print("\n=== Top 5 Facilities by Revenue Loss ===")
    facility_impact = _summarize_group_impact(cancellations_with_shifts, 'Facility ID')
    print(facility_impact.nlargest(5, 'Est. Revenue Loss'))

    # Store results
    summary.add_summary('economic', 'overall_impact', {
        'total_revenue': total_revenue,
        'cancelled_revenue': cancelled_revenue,
        'avg_hourly_rate': avg_hourly_revenue
    })
    summary.add_summary('economic', 'role_impact', role_impact.to_dict())
    summary.add_summary('economic', 'type_impact', type_impact.to_dict())

    return role_impact, type_impact, facility_impact

# Run the analysis
# NOTE: this rebinds the global `facility_impact`, which an earlier cell set to
# the dict returned by analyze_facility_impact.
role_impact, type_impact, facility_impact = analyze_economic_impact(shifts_df, cancellations_with_shifts)

In [None]:
def audit_data_quality(shifts_df, cancellations_df, bookings_df):
    """
    Count business-critical data quality problems in the three datasets.

    Parameters:
    - shifts_df: DataFrame containing shift data
    - cancellations_df: DataFrame containing cancellation logs
    - bookings_df: DataFrame containing booking logs

    Returns:
    - Dictionary with keys 'shifts', 'cancellations', 'bookings' and
      'cross_validation', each mapping an issue name to a count
    """
    valid_shift_types = ['am', 'pm', 'noc', 'custom']
    valid_cancel_actions = ['WORKER_CANCEL', 'NO_CALL_NO_SHOW']

    def _rows_where(mask):
        # Number of rows flagged by a boolean mask, as a plain int
        return int(mask.sum())

    def _non_finite(values):
        # Missing, NaN or infinite entries
        return (values.isnull() | ~np.isfinite(values)).sum()

    # --- Shifts ---
    charge = shifts_df['Charge']
    hours = shifts_df['Time']
    shift_end = shifts_df['End']
    # NOTE(review): "unverified" is detected as a missing 'Verified' value;
    # confirm that unverified shifts are not stored as False instead.
    ended_unverified = (shift_end < pd.Timestamp.now()) & (shifts_df['Verified'].isnull())

    shifts_issues = {
        # Financial data issues
        'zero_charge': (charge == 0).sum(),
        'negative_charge': (charge < 0).sum(),

        # Time-related issues
        'zero_time': (hours == 0).sum(),
        'negative_time': (hours < 0).sum(),
        'end_before_start': (shift_end < shifts_df['Start']).sum(),

        # Missing data
        'missing_agent': shifts_df['Agent ID'].isnull().sum(),
        'missing_facility': shifts_df['Facility ID'].isnull().sum(),
        'missing_shift_type': shifts_df['Shift Type'].isnull().sum(),

        # Verification issues
        'unverified_completed': ended_unverified.sum(),

        # Shift types outside the accepted list (missing values count too)
        'invalid_shift_types': _rows_where(~shifts_df['Shift Type'].isin(valid_shift_types)),
    }

    # --- Cancellations ---
    cancel_lead = cancellations_df['Lead Time']
    cancels_per_shift = cancellations_df.groupby('Shift ID').size()

    cancel_issues = {
        # Lead time issues
        'invalid_lead_time': _non_finite(cancel_lead),
        'extreme_negative_lead': (cancel_lead < -72).sum(),  # More than 3 days after start
        'extreme_positive_lead': (cancel_lead > 720).sum(),  # More than 30 days before

        # Missing data
        'missing_worker': cancellations_df['Worker ID'].isnull().sum(),
        'missing_facility': cancellations_df['Facility ID'].isnull().sum(),

        # Shifts with more than one cancellation record
        'duplicate_cancels': (cancels_per_shift > 1).sum(),

        # Actions outside the accepted list
        'invalid_actions': _rows_where(~cancellations_df['Action'].isin(valid_cancel_actions)),
    }

    # --- Bookings ---
    bookings_issues = {
        # Missing data
        'missing_worker': bookings_df['Worker ID'].isnull().sum(),
        'missing_facility': bookings_df['Facility ID'].isnull().sum(),

        # Lead time issues (time between booking and shift start)
        'invalid_lead_time': _non_finite(bookings_df['Lead Time']),

        # Anything other than a shift claim
        'invalid_actions': _rows_where(bookings_df['Action'] != 'SHIFT_CLAIM'),
    }

    # --- Cross-dataset validation ---
    known_shift_ids = shifts_df['ID']
    worker_cancelled_ids = set(
        cancellations_df.loc[cancellations_df['Action'] == 'WORKER_CANCEL', 'Shift ID']
    )
    agents_per_shift = shifts_df.groupby('ID')['Agent ID'].nunique()

    cross_validation = {
        # Log rows whose shift is absent from shifts_df
        'orphaned_cancels': _rows_where(~cancellations_df['Shift ID'].isin(known_shift_ids)),
        'orphaned_bookings': _rows_where(~bookings_df['Shift ID'].isin(known_shift_ids)),
        # Shift IDs linked to more than one agent
        'multiple_workers': (agents_per_shift > 1).sum(),
        # Worker-cancelled shifts that have no booking record
        'booking_cancel_mismatch': len(worker_cancelled_ids - set(bookings_df['Shift ID'])),
    }

    return {
        'shifts': shifts_issues,
        'cancellations': cancel_issues,
        'bookings': bookings_issues,
        'cross_validation': cross_validation,
    }

# Function to display audit results in a readable format
def display_audit_results(audit_results):
    """
    Print every audit counter under a header line for its dataset.

    audit_results maps a dataset name to a dict of issue name -> count;
    counts are printed with thousands separators.
    """
    for section_name in audit_results:
        counters = audit_results[section_name]
        print(f"\n=== {section_name.upper()} QUALITY ISSUES ===")
        for label, total in counters.items():
            print(f"{label}: {total:,}")


# === In Initial Data Loading and Validation Section ===
# Runs the audit on the loaded frames, prints every counter, stores the result
# in the global `summary`, then prints only the counters that are non-zero.

print("Performing data quality audit...")
# Run the audit
audit_results = audit_data_quality(shifts_df, cancellations_df, bookings_df)

# Display results
display_audit_results(audit_results)

# Store results in summary
summary.add_summary('data_quality', 'audit_results', audit_results)

# Optional: Display specific issues that need attention
# Same nested shape as audit_results, keeping only issues with a count above zero
significant_issues = {
    dataset: {issue: count for issue, count in issues.items() if count > 0}
    for dataset, issues in audit_results.items()
}

print("\nSignificant issues requiring attention:")
for dataset, issues in significant_issues.items():
    if issues:  # Only show datasets with issues
        print(f"\n{dataset}:")
        for issue, count in issues.items():
            print(f"- {issue}: {count:,}")

## This suggests that:

Many cancellations and bookings don't link to shifts in our dataset
Could be due to date range mismatches or data completeness issues
Critical for understanding true cancellation rates


Worker/Agent Data Gaps

missing_agent: 20,035 (shifts)
missing_worker: 191 (cancellations)
missing_worker: 140 (bookings)
This matches what we saw earlier but gives us a more complete picture. Particularly important because:

About half of shifts are missing agent IDs
Affects our ability to analyze worker patterns
Could impact our ability to track repeat cancellations


Lead Time Issues

extreme_negative_lead: 4,960
extreme_positive_lead: 741
This provides more granular insight than our earlier analysis. Important because:

Shows significant number of very late cancellations (>3 days after start)
Identifies early cancellations that might need different handling
Relevant to the attendance policy analysis


Financial Data Quality

zero_charge: 5,019
zero_time: 79
negative_time: 22
Matches our earlier findings but gives more context about potential revenue impact.

In [None]:
# at this point realized the data doesn't overlap in dates 
# went back to the start to fix that 
def analyze_data_coverage():
    """
    Report the date span and daily volume of each dataset.

    Reads the notebook-level shifts_df (by 'Start'), cancellations_df and
    bookings_df (both by 'Created At').

    Returns:
    - Dictionary keyed by 'shifts', 'cancellations' and 'bookings'; each
      entry holds 'start', 'end', 'total_days' (distinct dates with at least
      one record) and the mean number of records per such date
    """
    print("Analyzing dataset date coverage...")

    def _records_per_day(timestamps):
        # Number of records on each calendar date, in date order
        return timestamps.dt.date.value_counts().sort_index()

    # (dataset name, key for the daily mean, records per day)
    daily_counts = [
        ('shifts', 'avg_shifts_per_day', _records_per_day(shifts_df['Start'])),
        ('cancellations', 'avg_cancels_per_day', _records_per_day(cancellations_df['Created At'])),
        ('bookings', 'avg_bookings_per_day', _records_per_day(bookings_df['Created At'])),
    ]

    date_ranges = {}
    for dataset_name, mean_key, per_day in daily_counts:
        date_ranges[dataset_name] = {
            'start': per_day.index.min(),
            'end': per_day.index.max(),
            'total_days': len(per_day),
            mean_key: per_day.mean(),
        }

    return date_ranges

def analyze_missing_data_impact():
    """
    Compare shifts that lack an 'Agent ID' with shifts that have one.

    Reads the notebook-level shifts_df and cancellations_df.

    Returns:
    - Dictionary with 'missing_agent' and 'complete_data' entries, each
      holding the row count, mean 'Charge', mean 'Time', number of shifts
      with a cancellation record, that number as a fraction of the rows
      (0 when there are no rows) and the mean of 'Verified'
    """
    print("\nAnalyzing impact of missing data...")

    agent_is_missing = shifts_df['Agent ID'].isnull()
    cancelled_shift_ids = set(cancellations_df['Shift ID'])

    def _profile(subset):
        # Metrics for one slice of shifts_df
        n_rows = len(subset)
        n_cancelled = len(set(subset['ID']) & cancelled_shift_ids)
        return {
            'count': n_rows,
            'avg_charge': subset['Charge'].mean(),
            'avg_duration': subset['Time'].mean(),
            'cancellation_count': n_cancelled,
            'cancellation_rate': n_cancelled / n_rows if n_rows > 0 else 0,
            'verified_rate': subset['Verified'].mean(),
        }

    comparison = {
        'missing_agent': _profile(shifts_df[agent_is_missing]),
        'complete_data': _profile(shifts_df[~agent_is_missing]),
    }

    return comparison

# Run both analyses
# Both functions read the notebook-level frames directly and take no arguments.
print("Running additional data quality analyses...\n")

# Analyze data coverage
coverage_results = analyze_data_coverage()
print("\n=== Dataset Coverage Analysis ===")
for dataset, info in coverage_results.items():
    print(f"\n{dataset.upper()} Coverage:")
    for metric, value in info.items():
        print(f"{metric}: {value}")

# Analyze missing data impact
# Metrics whose name contains 'rate' are fractions and are printed as percentages;
# everything else is printed as a number with two decimals.
impact_results = analyze_missing_data_impact()
print("\n=== Missing Data Impact Analysis ===")
for category, metrics in impact_results.items():
    print(f"\n{category.replace('_', ' ').title()}:")
    for metric, value in metrics.items():
        if 'rate' in metric:
            print(f"{metric}: {value:.2%}")
        else:
            print(f"{metric}: {value:,.2f}")

# Store results in summary
summary.add_summary('data_quality', 'coverage_analysis', coverage_results)
summary.add_summary('data_quality', 'missing_data_impact', impact_results)

In [None]:
def analyze_data_completeness(shifts_df, cancellations_df, bookings_df, verbose=True):
    """
    Analyze dataset completeness and coverage periods.

    Parameters:
    ----------
    shifts_df : pandas.DataFrame
        Shift data; needs datetime 'Start' and 'End', 'Agent ID', 'Verified'
    cancellations_df : pandas.DataFrame
        Cancellation data; needs datetime 'Created At' and 'Worker ID'
    bookings_df : pandas.DataFrame
        Booking data; needs datetime 'Created At' and 'Worker ID'
    verbose : bool, default=True
        If True, prints detailed analysis results

    Returns:
    --------
    dict
        'coverage': per dataset, the 'date_range' tuple, 'total_records'
        and 'daily_average' (records per distinct calendar date).
        'completeness': per dataset, a count and percentage for each
        tracked field.

    Notes:
    ------
    An empty frame yields a daily average and percentages of 0 instead of
    raising ZeroDivisionError or producing NaN. This matches the
    `... if len(x) > 0 else 0` guards in analyze_missing_data_impact.
    """
    def _coverage(frame, first_col, last_col):
        # Date span, record count and records per distinct date in first_col
        active_days = frame[first_col].dt.date.nunique()
        return {
            'date_range': (frame[first_col].min(), frame[last_col].max()),
            'total_records': len(frame),
            'daily_average': len(frame) / active_days if active_days else 0
        }

    def _count_and_share(count, frame):
        # A count plus its share of the frame's rows in percent
        return {
            'count': count,
            'percentage': (count / len(frame)) * 100 if len(frame) else 0
        }

    results = {}

    # Dataset Coverage Analysis (shifts span from first 'Start' to last 'End')
    coverage = {
        'shifts': _coverage(shifts_df, 'Start', 'End'),
        'cancellations': _coverage(cancellations_df, 'Created At', 'Created At'),
        'bookings': _coverage(bookings_df, 'Created At', 'Created At')
    }
    results['coverage'] = coverage

    # Data Completeness Analysis
    completeness = {
        'shifts': {
            'missing_agent_id': _count_and_share(shifts_df['Agent ID'].isnull().sum(), shifts_df),
            'verified_shifts': _count_and_share(shifts_df['Verified'].sum(), shifts_df)
        },
        'cancellations': {
            'missing_worker_id': _count_and_share(
                cancellations_df['Worker ID'].isnull().sum(), cancellations_df)
        },
        'bookings': {
            'missing_worker_id': _count_and_share(
                bookings_df['Worker ID'].isnull().sum(), bookings_df)
        }
    }
    results['completeness'] = completeness

    if verbose:
        print("=== Dataset Coverage Analysis ===")
        print("\nTime Periods:")
        for dataset, info in coverage.items():
            print(f"\n{dataset.upper()}:")
            print(f"Date Range: {info['date_range'][0].date()} to {info['date_range'][1].date()}")
            print(f"Total Records: {info['total_records']:,}")
            print(f"Daily Average: {info['daily_average']:.2f}")

        print("\n=== Data Completeness Analysis ===")
        for dataset, metrics in completeness.items():
            print(f"\n{dataset.upper()} Completeness:")
            for field, values in metrics.items():
                print(f"{field}:")
                print(f"  Count: {values['count']:,}")
                print(f"  Percentage: {values['percentage']:.2f}%")

    return results

# Run the analysis
# verbose defaults to True, so the coverage and completeness report is printed as well
completeness_results = analyze_data_completeness(shifts_df, cancellations_df, bookings_df)

# Business Critical Questions

In [None]:
def analyze_missing_agent_patterns(shifts_df):
    """
    Analyze patterns in shifts with missing Agent IDs

    Splits the shifts into rows with and without an ``Agent ID``, collects
    temporal, verification, facility and charge breakdowns for both groups,
    prints a short report and returns the raw results.

    Parameters:
    shifts_df: DataFrame containing shift data. Must provide the columns
        'Agent ID', 'Start' (datetime), 'Shift Type', 'Verified',
        'Facility ID' and 'Charge'.

    Returns:
    Dictionary containing analysis results, keyed by 'temporal_patterns',
    'verification_status', 'facility_patterns' and 'charge_comparison'.
    """
    # Separate shifts with/without Agent IDs
    agent_missing = shifts_df['Agent ID'].isnull()
    missing_agent = shifts_df[agent_missing]
    has_agent = shifts_df[~agent_missing]

    total_shifts = len(shifts_df)

    def _share(count):
        # An empty dataset has no meaningful share; report 0% instead of
        # raising ZeroDivisionError in the summary below.
        return count / total_shifts if total_shifts else 0.0

    analysis = {
        'temporal_patterns': {
            'missing_by_month': missing_agent['Start'].dt.to_period('M').value_counts().sort_index(),
            'missing_by_dow': missing_agent['Start'].dt.day_name().value_counts(),
            'missing_by_shift_type': missing_agent['Shift Type'].value_counts()
        },

        'verification_status': {
            'missing_verified': missing_agent['Verified'].value_counts(),
            'has_agent_verified': has_agent['Verified'].value_counts()
        },

        'facility_patterns': {
            'facilities_missing': missing_agent['Facility ID'].value_counts(),
            # Facilities without any missing-agent shift come out as NaN here
            # (no matching numerator), and sort after the real rates.
            'missing_rate_by_facility': (
                missing_agent.groupby('Facility ID').size() /
                shifts_df.groupby('Facility ID').size()
            ).sort_values(ascending=False)
        },

        'charge_comparison': {
            'missing_charges': missing_agent['Charge'].describe(),
            'has_agent_charges': has_agent['Charge'].describe()
        }
    }

    print("=== Analysis of Shifts with Missing Agent IDs ===\n")
    print(f"Total Shifts: {total_shifts:,}")
    print(f"Shifts Missing Agent ID: {len(missing_agent):,} ({_share(len(missing_agent)):.1%})")
    print(f"Shifts with Agent ID: {len(has_agent):,} ({_share(len(has_agent)):.1%})")

    print("\n=== Verification Status ===")
    print("\nShifts Missing Agent ID:")
    print(analysis['verification_status']['missing_verified'])
    print("\nShifts with Agent ID:")
    print(analysis['verification_status']['has_agent_verified'])

    print("\n=== Shift Type Distribution (Missing Agent ID) ===")
    print(analysis['temporal_patterns']['missing_by_shift_type'])

    print("\n=== Top 5 Facilities with Missing Agent IDs ===")
    print("Count:")
    print(analysis['facility_patterns']['facilities_missing'].head())
    print("\nRate:")
    print(analysis['facility_patterns']['missing_rate_by_facility'].head())

    return analysis

"""This analysis should help us:

Identify patterns in missing Agent IDs
See if certain facilities have more missing IDs
Compare verification rates
Understand if missing IDs are random or systematic """
# Run the missing-Agent-ID analysis on the full shifts dataset; the function
# prints its report and the returned dict is kept for later inspection.
missing_agent_analysis = analyze_missing_agent_patterns(shifts_df)

In [None]:
# First, let's verify the data structure
overlap_start = shifts_df['Start'].dt.date.min()
overlap_end = shifts_df['Start'].dt.date.max()

# Build the overlap-period subsets here so this cell does not depend on a
# later cell having been executed first (otherwise the prints below fail
# with NameError when the notebook is run top to bottom).
overlap_shifts = shifts_df[shifts_df['Start'].dt.date.between(overlap_start, overlap_end)]
overlap_cancels = cancellations_df[
    cancellations_df['Created At'].dt.date.between(overlap_start, overlap_end)
]
overlap_bookings = bookings_df[
    bookings_df['Created At'].dt.date.between(overlap_start, overlap_end)
]

# Print shapes and types for debugging
print("Overlap period:", overlap_start, "to", overlap_end)
print("\nDataset shapes:")
print(f"Shifts: {overlap_shifts.shape}")
print(f"Cancellations: {overlap_cancels.shape}")
print(f"Bookings: {overlap_bookings.shape}")

# Let's check a simpler version of the worker calculation first
def analyze_worker_basic(shifts, cancels):
    """Minimal per-worker reliability table used to debug the core calculation.

    Returns a DataFrame indexed by every non-null 'Agent ID' found in
    ``shifts`` with the columns 'total_shifts', 'cancellations' and
    'reliability' (1 - cancellations / total_shifts).
    """
    # Only shifts that were assigned to a worker are considered
    assigned = shifts[shifts['Agent ID'].notnull()]
    worker_ids = assigned['Agent ID'].unique()

    # Per-worker tallies; workers without any cancellation get 0
    shifts_per_worker = assigned.groupby('Agent ID')['ID'].count()
    cancels_per_worker = cancels.groupby('Worker ID').size().reindex(worker_ids).fillna(0)

    # Column assignment aligns each Series on the worker index
    table = pd.DataFrame(index=worker_ids)
    table['total_shifts'] = shifts_per_worker
    table['cancellations'] = cancels_per_worker
    table['reliability'] = 1 - (table['cancellations'] / table['total_shifts'])

    return table

# Try the simplified version on the overlap-period subsets and show the first
# rows of the resulting per-worker table as a sanity check.
test_metrics = analyze_worker_basic(overlap_shifts, overlap_cancels)
print("\nTest metrics head:")
print(test_metrics.head())

In [None]:
def analyze_business_patterns(start_date='2021-10-01', end_date='2022-01-31'):
    """
    Analyze business patterns within the core overlapping period

    Reads the module-level ``shifts_df``, ``cancellations_df`` and
    ``bookings_df`` frames, restricts each to the requested period and
    gathers time, worker and facility level patterns.

    Parameters:
    - start_date: Beginning of analysis period (anything ``pd.to_datetime``
      accepts; inclusive)
    - end_date: End of analysis period (anything ``pd.to_datetime`` accepts;
      inclusive)

    Returns:
    Dictionary with the keys 'time_patterns', 'worker_patterns' and
    'facility_patterns'.
    """
    # Normalize both bounds to datetime.date once. The previous lambda wrote
    # `a >= x & b <= y`; `&` binds tighter than the comparisons, so it tried
    # to AND a date with a Series and raised TypeError. The shift filter also
    # compared date objects against the raw string bounds.
    period_start = pd.to_datetime(start_date).date()
    period_end = pd.to_datetime(end_date).date()

    def in_period(timestamps):
        # Inclusive calendar-date window, same idiom as the other cells
        return timestamps.dt.date.between(period_start, period_end)

    # Filter to overlapping period
    shifts_filtered = shifts_df[in_period(shifts_df['Start'])]
    cancels_filtered = cancellations_df[in_period(cancellations_df['Created At'])]
    bookings_filtered = bookings_df[in_period(bookings_df['Created At'])]

    # Time-based patterns
    time_patterns = {
        'hourly_patterns': pd.DataFrame({
            'cancellations': cancels_filtered['Created At'].dt.hour.value_counts().sort_index(),
            'bookings': bookings_filtered['Created At'].dt.hour.value_counts().sort_index()
        }).fillna(0),

        'daily_patterns': pd.DataFrame({
            'shifts': shifts_filtered['Start'].dt.day_name().value_counts(),
            'cancellations': cancels_filtered['Created At'].dt.day_name().value_counts(),
            'bookings': bookings_filtered['Created At'].dt.day_name().value_counts()
        }).fillna(0),

        # Lead times of bookings whose shift never shows up in the
        # cancellation log for the period
        'lead_time_success': bookings_filtered[
            ~bookings_filtered['Shift ID'].isin(cancels_filtered['Shift ID'])
        ]['Lead Time'].describe()
    }

    # Worker reliability - separating by data completeness
    # NOTE(review): analyze_worker_patterns only looks at shifts with a
    # non-null Agent ID, so the 'missing_data' call works on an empty worker
    # set; confirm whether that entry is still wanted.
    worker_patterns = {
        'complete_data': analyze_worker_patterns(
            shifts_filtered[shifts_filtered['Agent ID'].notnull()],
            cancels_filtered,
            bookings_filtered
        ),
        'missing_data': analyze_worker_patterns(
            shifts_filtered[shifts_filtered['Agent ID'].isnull()],
            cancels_filtered,
            bookings_filtered
        )
    }

    # Facility analysis
    facility_patterns = {
        'cancel_rates': calculate_facility_metrics(
            shifts_filtered, cancels_filtered, bookings_filtered
        ),
        'ncns_impact': analyze_ncns_impact(
            shifts_filtered, cancels_filtered
        )
    }

    return {
        'time_patterns': time_patterns,
        'worker_patterns': worker_patterns,
        'facility_patterns': facility_patterns
    }

def analyze_worker_patterns(shifts, cancels, bookings):
    """
    Build per-worker success metrics and booking-preference tables.

    Parameters:
    -----------
    shifts : pd.DataFrame
        Shift rows; uses 'ID', 'Agent ID', 'Verified', 'Charge', 'Time',
        'Shift Type' and 'Facility ID'
    cancels : pd.DataFrame
        Cancellation rows; uses 'Worker ID' and 'Action'
    bookings : pd.DataFrame
        Booking rows; uses 'Worker ID', 'Created At' and 'Lead Time'

    Returns:
    -----------
    dict
        'success_metrics' (DataFrame indexed by worker) and
        'booking_patterns' (nested dict of preference tables)
    """
    # Every metric is based on shifts that have a worker attached
    assigned = shifts[shifts['Agent ID'].notnull()]
    active_workers = assigned['Agent ID'].unique()
    per_worker = assigned.groupby('Agent ID')

    # Raw tallies from the cancellation log
    cancels_per_worker = cancels.groupby('Worker ID').size()
    ncns_per_worker = (
        cancels[cancels['Action'] == 'NO_CALL_NO_SHOW'].groupby('Worker ID').size()
    )

    # Columns are added in a fixed order; assignment aligns on the worker index
    success_metrics = pd.DataFrame(index=active_workers)
    success_metrics['total_shifts'] = per_worker['ID'].count()
    success_metrics['cancellations'] = cancels_per_worker.reindex(active_workers).fillna(0)
    success_metrics['ncns'] = ncns_per_worker.reindex(active_workers).fillna(0)

    # Derived ratios; a zero cancellation count is swapped for 1 so the
    # no-show rate of a worker without cancellations is 0 rather than NaN
    cancel_col = success_metrics['cancellations']
    success_metrics['reliability'] = 1 - (cancel_col / success_metrics['total_shifts'])
    success_metrics['ncns_rate'] = success_metrics['ncns'] / cancel_col.replace(0, 1)

    # Share of a worker's shifts that were verified
    success_metrics['completion_rate'] = per_worker['Verified'].mean()

    # Charge level and spread of shift duration
    charge_summary = per_worker['Charge'].agg(['mean', 'std'])
    time_summary = per_worker['Time'].agg(['mean', 'std'])
    success_metrics['avg_charge'] = charge_summary['mean']
    success_metrics['consistency'] = 1 - (
        time_summary['std'] / time_summary['mean'].replace(0, np.inf)
    )

    # Work on a copy of the bookings enriched with hour / weekday of creation
    bookings = bookings.assign(
        booking_hour=bookings['Created At'].dt.hour,
        booking_day=bookings['Created At'].dt.day_name(),
    )
    bookings_per_worker = bookings.groupby('Worker ID')

    time_preferences = {
        'booking_hours': bookings_per_worker['booking_hour'].value_counts(),
        'booking_days': bookings_per_worker['booking_day'].value_counts(),
        'lead_time_stats': bookings_per_worker['Lead Time'].describe()
    }
    shift_preferences = {
        'shift_types': assigned.groupby(['Agent ID', 'Shift Type']).size().unstack(fill_value=0),
        'facility_choices': assigned.groupby(['Agent ID', 'Facility ID']).size().unstack(fill_value=0)
    }
    booking_patterns = {
        'time_preferences': time_preferences,
        'shift_preferences': shift_preferences
    }

    # Weighted blend of the metrics, renormalized over the ones present
    weights = {
        'completion_rate': 0.4,
        'reliability': 0.3,
        'consistency': 0.2,
        'avg_charge': 0.1
    }
    available_metrics = [name for name in weights if name in success_metrics.columns]
    weight_sum = sum(weights[name] for name in available_metrics)

    score = 0
    for name in available_metrics:
        score = score + success_metrics[name] * (weights[name] / weight_sum)
    success_metrics['overall_score'] = score

    return {
        'success_metrics': success_metrics,
        'booking_patterns': booking_patterns
    }
"""Worker Profiles:

Creates a baseline profile for each worker using shift data
Captures basic metrics like total shifts, verification rates, and pricing patterns


Booking Patterns:

Analyzes when workers prefer to book shifts (time of day, day of week)
Examines lead time patterns
Identifies preferences for shift types and facilities


Cancellation Patterns:

Studies when cancellations typically occur
Calculates cancellation rates and no-show rates
Analyzes lead times for cancellations


Success Metrics:

Combines multiple factors into an overall worker score
Uses weighted metrics for completion, reliability, consistency, and earnings
Allows for customization of weights based on business priorities"""

def calculate_facility_metrics(shifts, cancels, bookings):
    """Bundle the facility-level metrics into one dictionary.

    'cancel_rates' is the per-facility ratio of cancellation rows to shift
    rows; 'rebooking_success' and 'shift_fulfillment' are delegated to
    calculate_rebooking_rates and calculate_fulfillment_rates.
    """
    cancels_per_facility = cancels.groupby('Facility ID').size()
    shifts_per_facility = shifts.groupby('Facility ID').size()

    metrics = {'cancel_rates': cancels_per_facility / shifts_per_facility}
    metrics['rebooking_success'] = calculate_rebooking_rates(shifts, cancels, bookings)
    metrics['shift_fulfillment'] = calculate_fulfillment_rates(shifts, bookings)
    return metrics


def calculate_worker_reliability(shifts, cancels):
    """
    Calculate reliability scores for workers based on their history

    Parameters:
    -----------
    shifts : pd.DataFrame
        Shift data with worker information; uses 'Agent ID', 'ID',
        'Verified' and 'Charge'
    cancels : pd.DataFrame
        Cancellation data; uses 'Worker ID', 'Shift ID' and 'Action'

    Returns:
    -----------
    pd.DataFrame
        Worker reliability metrics indexed by 'Agent ID', sorted by
        'reliability_score' (highest first)
    """
    # Only analyze workers with valid IDs
    valid_workers = shifts[shifts['Agent ID'].notnull()]

    # Calculate basic metrics per worker
    # (the earlier placeholder `worker_metrics = pd.DataFrame()` was dead code:
    # it was overwritten here before ever being read)
    worker_metrics = valid_workers.groupby('Agent ID').agg({
        'ID': 'count',  # Total shifts
        'Verified': 'mean',  # Verification rate
        'Charge': 'mean'  # Average charge rate
    }).rename(columns={
        'ID': 'total_shifts',
        'Verified': 'verification_rate',
        'Charge': 'avg_charge'
    })

    # Add cancellation metrics: number of cancellation rows per worker and
    # the share of those rows that are no-call-no-shows
    cancellation_rates = (
        cancels.groupby('Worker ID')
        .agg({
            'Shift ID': 'count',
            'Action': lambda x: (x == 'NO_CALL_NO_SHOW').mean()
        })
        .rename(columns={
            'Shift ID': 'cancellations',
            'Action': 'ncns_rate'
        })
    )

    # Left join keeps every worker with shifts; workers absent from the
    # cancellation log get 0 cancellations and a 0 no-show rate.
    worker_metrics = worker_metrics.join(
        cancellation_rates,
        how='left'
    ).fillna(0)

    # Calculate reliability score (you can adjust the formula)
    worker_metrics['reliability_score'] = (
        worker_metrics['verification_rate'] * 0.4 +
        (1 - worker_metrics['ncns_rate']) * 0.4 +
        (1 - worker_metrics['cancellations']/worker_metrics['total_shifts']) * 0.2
    )

    return worker_metrics.sort_values('reliability_score', ascending=False)

def analyze_cancel_timing(cancels):
    """
    Summarize when cancellations happen.

    Parameters:
    -----------
    cancels : pd.DataFrame
        Cancellation rows; uses 'Created At' (datetime), 'Lead Time' and
        'Action'

    Returns:
    -----------
    dict : hour-of-day and weekday counts of 'Created At', descriptive
        statistics of 'Lead Time', counts per lead-time bucket, and a
        bucket-by-action cross tabulation
    """
    created_at = cancels['Created At']
    lead_time = cancels['Lead Time']

    # One shared bucketing of the lead time, reused for counts and crosstab
    bucket_edges = [-float('inf'), 0, 4, 24, 72, float('inf')]
    bucket_names = ['After Start', 'Under 4hrs', '4-24hrs', '1-3 days', 'Over 3 days']
    lead_bucket = pd.cut(lead_time, bins=bucket_edges, labels=bucket_names)

    timing_analysis = {}

    # Time of day / day of week patterns
    timing_analysis['hourly_distribution'] = created_at.dt.hour.value_counts().sort_index()
    timing_analysis['daily_distribution'] = created_at.dt.day_name().value_counts()

    # Lead time analysis
    timing_analysis['lead_time_stats'] = lead_time.describe()
    timing_analysis['lead_time_buckets'] = lead_bucket.value_counts().sort_index()

    # Action type by timing
    timing_analysis['timing_by_action'] = pd.crosstab(lead_bucket, cancels['Action'])

    return timing_analysis

def calculate_rebooking_rates(shifts, cancels, bookings):
    """
    Calculate how successfully cancelled shifts get rebooked

    A cancelled shift counts as rebooked when at least one booking for the
    same 'Shift ID' was created after that shift's earliest cancellation.

    Parameters:
    -----------
    shifts : pd.DataFrame
        Shift data (currently unused; kept for a uniform call signature)
    cancels : pd.DataFrame
        Cancellation data; uses 'Shift ID', 'Created At' and 'Facility ID'
    bookings : pd.DataFrame
        Booking data; uses 'Shift ID', 'Created At', 'Lead Time' and
        'Facility ID'

    Returns:
    -----------
    dict : Dictionary containing rebooking analysis:
        'total_cancellations' (distinct cancelled shifts),
        'rebooked_count' (distinct cancelled shifts with a later booking),
        'rebooking_lead_times' / 'facility_rebooking_rates' (based on all
        bookings that reference a cancelled shift), and
        'overall_rebooking_rate' (NaN when nothing was cancelled)
    """
    # Get cancelled shifts
    cancelled_shifts = cancels['Shift ID'].unique()
    total_cancellations = len(cancelled_shifts)

    # All bookings that reference a shift which was cancelled at some point
    bookings_for_cancelled = bookings[bookings['Shift ID'].isin(cancelled_shifts)]

    if total_cancellations:
        # Earliest cancellation per shift, looked up per booking row via the
        # shift id. Comparing the two Series directly does not work: they are
        # indexed differently (booking rows vs. shift ids) and pandas refuses
        # to compare Series that are not identically labelled.
        first_cancel_at = cancels.groupby('Shift ID')['Created At'].min()
        cancel_time = bookings_for_cancelled['Shift ID'].map(first_cancel_at)
        booked_after_cancel = bookings_for_cancelled['Created At'] > cancel_time

        # Count shifts, not booking rows, so the rate stays within [0, 1]
        rebooked_count = int(
            bookings_for_cancelled.loc[booked_after_cancel, 'Shift ID'].nunique()
        )
        overall_rate = rebooked_count / total_cancellations
    else:
        # Nothing was cancelled: the rate is undefined rather than an error
        rebooked_count = 0
        overall_rate = float('nan')

    rebooking_analysis = {
        'total_cancellations': total_cancellations,
        'rebooked_count': rebooked_count,
        'rebooking_lead_times': bookings_for_cancelled['Lead Time'].describe(),

        # Facility level analysis
        'facility_rebooking_rates': pd.DataFrame({
            'cancellations': cancels.groupby('Facility ID').size(),
            'rebookings': bookings_for_cancelled.groupby('Facility ID').size()
        }).fillna(0)
    }

    # Calculate success rate
    rebooking_analysis['overall_rebooking_rate'] = overall_rate

    return rebooking_analysis

def calculate_fulfillment_rates(shifts, bookings):
    """
    Calculate shift fulfillment rates and patterns

    Parameters:
    -----------
    shifts : pd.DataFrame
        Shift data; uses 'Agent ID', 'Verified' (boolean), 'Shift Type'
        and 'Agent Req'
    bookings : pd.DataFrame
        Booking data (currently unused; kept for a uniform call signature)

    Returns:
    -----------
    dict : Dictionary containing fulfillment analysis. The two overall
        rates are NaN when ``shifts`` is empty.
    """
    total_shifts = len(shifts)
    # "Booked" here means the shift row carries an Agent ID
    booked = shifts[shifts['Agent ID'].notnull()]
    verified = shifts[shifts['Verified']]

    def _rate(count):
        # Avoid ZeroDivisionError on an empty period; the rate is undefined
        return count / total_shifts if total_shifts else float('nan')

    fulfillment_analysis = {
        # Overall fulfillment
        'total_shifts': total_shifts,
        'booked_shifts': len(booked),
        'verified_shifts': shifts['Verified'].sum(),

        # Fulfillment by type (groups with no booked/verified shift get 0)
        'fulfillment_by_type': pd.DataFrame({
            'total': shifts.groupby('Shift Type').size(),
            'booked': booked.groupby('Shift Type').size(),
            'verified': verified.groupby('Shift Type').size()
        }).fillna(0),

        # Fulfillment by role
        'fulfillment_by_role': pd.DataFrame({
            'total': shifts.groupby('Agent Req').size(),
            'booked': booked.groupby('Agent Req').size(),
            'verified': verified.groupby('Agent Req').size()
        }).fillna(0)
    }

    # Calculate rates
    fulfillment_analysis['overall_booking_rate'] = _rate(
        fulfillment_analysis['booked_shifts']
    )
    fulfillment_analysis['overall_verification_rate'] = _rate(
        fulfillment_analysis['verified_shifts']
    )

    return fulfillment_analysis

In [None]:
# Run the worker-level analysis on the overlap-period subsets.
# NOTE(review): overlap_shifts / overlap_cancels / overlap_bookings must
# already exist when this cell runs — confirm cell execution order.
worker_analysis = analyze_worker_patterns(
    overlap_shifts,
    overlap_cancels,
    overlap_bookings
)

# Display results: summary statistics of every per-worker metric
print("\n=== Worker Analysis ===")
print("\nMetrics Summary:")
print(worker_analysis['success_metrics'].describe())

# Five workers with the highest 'reliability' value
print("\nTop 5 Most Reliable Workers:")
print(worker_analysis['success_metrics'].nlargest(5, 'reliability')[
    ['total_shifts', 'cancellations', 'reliability', 'completion_rate']
])

In [None]:
# Example usage: the analysis window is the span of shift start dates
overlap_start = shifts_df['Start'].dt.date.min()
overlap_end = shifts_df['Start'].dt.date.max()

# Filter data to overlapping period (inclusive on both ends). Shifts are
# matched on their start date, cancellations and bookings on the date the
# log entry was created.
overlap_shifts = shifts_df[shifts_df['Start'].dt.date.between(overlap_start, overlap_end)]
overlap_cancels = cancellations_df[
    cancellations_df['Created At'].dt.date.between(overlap_start, overlap_end)
]
overlap_bookings = bookings_df[
    bookings_df['Created At'].dt.date.between(overlap_start, overlap_end)
]

# Run the analysis
worker_analysis = analyze_worker_patterns(
    overlap_shifts,
    overlap_cancels,
    overlap_bookings
)

# Display summary results: five workers with the highest overall score
print("\n=== Worker Pattern Analysis ===")
print("\nTop Performing Workers:")
print(worker_analysis['success_metrics'].nlargest(5, 'overall_score'))

# For each worker, the three hours of day with the most bookings
print("\nBooking Time Preferences (Top 3 Hours):")
print(worker_analysis['booking_patterns']['time_preferences']['booking_hours']
      .groupby(level=0).nlargest(3))

Let's think critically about this:

Understanding the Data Context
From the proposal:


They explicitly say booking data is a subset
They state it's "OK" because it's meant to show booking behavior patterns
However, they don't mention if cancellation data is complete or a subset
The shifts data appears to be the "source of truth" (Oct 2021 - Jan 2022)


Analysis Implications
We should split our analysis into two categories:

A. Full Period Analysis (Using Shifts Data)

In [None]:
"""Let's think critically about this:

Understanding the Data Context
From the proposal:


They explicitly say booking data is a subset
They state it's "OK" because it's meant to show booking behavior patterns
However, they don't mention if cancellation data is complete or a subset
The shifts data appears to be the "source of truth" (Oct 2021 - Jan 2022)


Analysis Implications
We should split our analysis into two categories: """
# A. Full Period Analysis (Using Shifts Data)
#  B: Behavioral Analysis (Using Overlap Period)

"""The key insight is that we should:

Use shifts data for absolute metrics
Use overlap periods for behavioral analysis
Be clear about limitations in our findings
Focus on patterns rather than absolute numbers for booking/cancellation behavior

This matches their intent while making the best use of available data."""

In [None]:


# A. Full Period Analysis (Using Shifts Data)
def analyze_shifts_complete():
    """
    Summarize overall marketplace health from the full shifts dataset.

    Reads the module-level ``shifts_df`` only.

    Note: the shifts dataset appears to be complete for
    Oct 2021 - Jan 2022.
    """
    shift_start = shifts_df['Start']

    # Per-facility volume, verification share and average charge
    per_facility = shifts_df.groupby('Facility ID').agg({
        'ID': 'count',
        'Verified': 'mean',
        'Charge': 'mean'
    })
    per_facility = per_facility.rename(columns={'ID': 'total_shifts'})

    # Start-time distribution by weekday and by hour of day
    start_distribution = {
        'by_day': shift_start.dt.day_name().value_counts(),
        'by_hour': shift_start.dt.hour.value_counts().sort_index()
    }

    shifts_analysis = {}
    # Basic marketplace metrics
    shifts_analysis['total_shifts'] = len(shifts_df)
    shifts_analysis['shifts_by_type'] = shifts_df['Shift Type'].value_counts()
    shifts_analysis['verification_rate'] = shifts_df['Verified'].mean()
    # Financial metrics
    shifts_analysis['charge_patterns'] = shifts_df.groupby('Agent Req')['Charge'].describe()
    # Time patterns
    shifts_analysis['shift_distribution'] = start_distribution
    # Facility metrics
    shifts_analysis['facility_patterns'] = per_facility
    return shifts_analysis

In [None]:
#  B: Behavioral Analysis (Using Overlap Period)
def analyze_booking_behavior(start_date=None, end_date=None):
    """
    Describe HCP booking and cancellation behavior across all three datasets.

    Reads the module-level ``shifts_df``, ``cancellations_df`` and
    ``bookings_df``. When both ``start_date`` and ``end_date`` are given,
    each frame is first limited to that inclusive date window; otherwise the
    full frames are used.

    Notes:
    - Intended for the period where the datasets overlap
    - Focus is on behavioral patterns rather than absolute numbers
    """
    shifts_subset = shifts_df
    cancels_subset = cancellations_df
    bookings_subset = bookings_df

    # Filter to overlap period if dates provided
    if start_date and end_date:
        def within_window(frame, column):
            return frame[frame[column].dt.date.between(start_date, end_date)]

        shifts_subset = within_window(shifts_df, 'Start')
        cancels_subset = within_window(cancellations_df, 'Created At')
        bookings_subset = within_window(bookings_df, 'Created At')

    # Cross-reference data: flag every shift that appears in either log
    shifts_with_outcomes = shifts_subset.assign(
        was_booked=shifts_subset['ID'].isin(bookings_subset['Shift ID']),
        was_cancelled=shifts_subset['ID'].isin(cancels_subset['Shift ID']),
    )
    booked_flag = shifts_with_outcomes['was_booked']
    cancelled_flag = shifts_with_outcomes['was_cancelled']

    # Booking patterns
    booking_behavior = {
        'lead_times': bookings_subset['Lead Time'].describe(),
        'booking_times': bookings_subset['Created At'].dt.hour.value_counts().sort_index()
    }

    # Cancellation patterns
    cancellation_behavior = {
        'cancel_types': cancels_subset['Action'].value_counts(),
        'lead_times': cancels_subset['Lead Time'].describe(),
        'cancel_times': cancels_subset['Created At'].dt.hour.value_counts().sort_index()
    }

    # Shift outcomes
    shift_outcomes = {
        'total_shifts': len(shifts_subset),
        'booked_count': booked_flag.sum(),
        'cancelled_count': cancelled_flag.sum(),
        'booking_rate': booked_flag.mean(),
        'cancellation_rate': cancelled_flag.mean()
    }

    return {
        'booking_behavior': booking_behavior,
        'cancellation_behavior': cancellation_behavior,
        'shift_outcomes': shift_outcomes
    }

In [None]:
# For Missing Agent IDs Let's cross-check with both datasets:


def analyze_missing_agents_behavior():
    """
    Compare outcomes of shifts with and without an Agent ID.

    Reads the module-level ``shifts_df``, ``bookings_df`` and
    ``cancellations_df``. For each group it reports the share of shifts that
    appear in the booking log, the share that appear in the cancellation
    log, and the mean of 'Verified'.
    """
    def outcome_rates(subset):
        shift_ids = subset['ID']
        return {
            'booked': shift_ids.isin(bookings_df['Shift ID']).mean(),
            'cancelled': shift_ids.isin(cancellations_df['Shift ID']).mean(),
            'verified': subset['Verified'].mean()
        }

    # Split on presence of the Agent ID
    agent_is_missing = shifts_df['Agent ID'].isnull()
    without_agent = shifts_df[agent_is_missing]
    with_agent = shifts_df[shifts_df['Agent ID'].notnull()]

    return {
        'missing_agent_outcomes': outcome_rates(without_agent),
        'has_agent_outcomes': outcome_rates(with_agent)
    }



In [None]:
# Run all analyses
print("Running comprehensive analyses...")

# 1. Full Shifts Analysis (uses the module-level shifts_df only)
print("\n=== COMPLETE SHIFTS ANALYSIS (Oct 2021 - Jan 2022) ===")
shifts_analysis = analyze_shifts_complete()
print("\nBasic Marketplace Metrics:")
print(f"Total Shifts: {shifts_analysis['total_shifts']:,}")
print("\nShift Types:")
print(shifts_analysis['shifts_by_type'])
print(f"\nOverall Verification Rate: {shifts_analysis['verification_rate']:.2%}")

# 2. Behavioral Analysis
# Using the overlap period (focusing on patterns rather than absolute numbers)
print("\n=== BEHAVIORAL ANALYSIS (Overlap Period) ===")
start_date = shifts_df['Start'].dt.date.min()  # earliest shift start date (expected Oct 1, 2021 — depends on the data)
end_date = shifts_df['Start'].dt.date.max()    # latest shift start date (expected Jan 31, 2022 — depends on the data)
behavior_analysis = analyze_booking_behavior(start_date, end_date)

print("\nBooking Patterns:")
print("Lead Times (hours):")
print(behavior_analysis['booking_behavior']['lead_times'])

print("\nCancellation Types:")
print(behavior_analysis['cancellation_behavior']['cancel_types'])

# Rates are shown as percentages, counts with thousands separators
print("\nShift Outcomes:")
for metric, value in behavior_analysis['shift_outcomes'].items():
    if 'rate' in metric:
        print(f"{metric}: {value:.2%}")
    else:
        print(f"{metric}: {value:,}")

# 3. Missing Agent ID Analysis
print("\n=== MISSING AGENT ID ANALYSIS ===")
agent_behavior = analyze_missing_agents_behavior()

print("\nShifts with Missing Agent IDs:")
for metric, value in agent_behavior['missing_agent_outcomes'].items():
    print(f"{metric}: {value:.2%}")

print("\nShifts with Agent IDs:")
for metric, value in agent_behavior['has_agent_outcomes'].items():
    print(f"{metric}: {value:.2%}")

# Store results in summary object for later use
# NOTE(review): `summary` is not created in this cell; it must already exist
# and provide add_summary(...) — confirm where it is defined.
summary.add_summary('complete_analysis', 'shifts', shifts_analysis)
summary.add_summary('complete_analysis', 'behavior', behavior_analysis)
summary.add_summary('complete_analysis', 'missing_agents', agent_behavior)


"""
This code:

Analyzes the complete shifts dataset first
Looks at booking/cancellation behavior in the overlap period
Specifically examines shifts with/without Agent IDs
Stores all results in our summary object

The output will help us understand:

Overall marketplace metrics from shifts data
Behavioral patterns where we have complete data
What missing Agent IDs might mean

Each section is clearly labeled, and results are formatted for easy reading. We can use these results to:

Identify key patterns
Support our findings
Guide additional analysis
"""


In [None]:
# PHASE 1: SUCCESS PATH ANALYSIS
#A. Define and Validate Success Metrics
#Deep dive into successful shifts 

class ShiftSuccessAnalysis:
    """
    Analyzes the complete lifecycle of shifts from posting to completion.
    
    Core metrics tracked:
    - Booking success: Did the shift get booked?
    - Retention success: Did the booking stick (no cancellation)?
    - Completion success: Was the shift verified as worked?
    """
    
    def __init__(self, shifts_df, bookings_df, cancellations_df):
        """Initialize with our three core datasets.

        Each frame is copied, so later changes made by this object do not
        touch the caller's data. Construction immediately validates the
        frames (which prints a coverage report) and builds the per-shift
        journey table (which prints the headline success metrics).
        """
        self.shifts_df = shifts_df.copy()
        self.bookings_df = bookings_df.copy()
        self.cancellations_df = cancellations_df.copy()
        # Filled in by _create_success_journey() below
        self.success_journey = None
        
        # Verify data compatibility
        self._validate_data()
        
        # Create enhanced dataset
        self._create_success_journey()
    
    def _validate_data(self):
        """
        Ensure data quality and compatibility across datasets.
        """
        # Check required columns
        required_columns = {
            'shifts': ['ID', 'Start', 'End', 'Verified', 'Agent ID', 
                      'Facility ID', 'Agent Req', 'Shift Type', 'Charge'],
            'bookings': ['Shift ID', 'Created At', 'Worker ID'],
            'cancellations': ['Shift ID', 'Created At', 'Action', 'Lead Time']
        }
        
        for df_name, columns in required_columns.items():
            df = getattr(self, f"{df_name}_df")
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in {df_name}: {missing_cols}")
        
        # Print coverage analysis
        self._analyze_coverage()
    
    def _analyze_coverage(self):
        """Analyze data coverage and overlap."""
        shifts_ids = set(self.shifts_df['ID'])
        booking_ids = set(self.bookings_df['Shift ID'])
        cancel_ids = set(self.cancellations_df['Shift ID'])
        
        print("\n=== Data Coverage Analysis ===")
        print(f"\nTotal Shifts: {len(shifts_ids):,}")
        print(f"Shifts with Bookings: {len(shifts_ids & booking_ids):,} "
              f"({len(shifts_ids & booking_ids)/len(shifts_ids):.1%})")
        print(f"Shifts with Cancellations: {len(shifts_ids & cancel_ids):,} "
              f"({len(shifts_ids & cancel_ids)/len(shifts_ids):.1%})")
        
        # Analyze potential data quality issues
        orphaned_bookings = len(booking_ids - shifts_ids)
        orphaned_cancels = len(cancel_ids - shifts_ids)
        
        if orphaned_bookings or orphaned_cancels:
            print("\nPotential Data Quality Issues:")
            print(f"Orphaned Bookings: {orphaned_bookings:,}")
            print(f"Orphaned Cancellations: {orphaned_cancels:,}")
    
    def _create_success_journey(self):
        """
        Creates enhanced dataset tracking complete shift lifecycle.
        """
        journey = self.shifts_df.copy()
        
        # Add booking information
        booking_times = self.bookings_df.groupby('Shift ID').agg({
            'Created At': ['first', 'count']
        }).reset_index()
        booking_times.columns = ['Shift ID', 'First Booking', 'Booking Count']
        
        journey = journey.merge(
            booking_times, 
            left_on='ID', 
            right_on='Shift ID', 
            how='left'
        )
        
        # Add cancellation information
        cancel_info = self.cancellations_df.groupby('Shift ID').agg({
            'Created At': 'first',
            'Action': 'first',
            'Lead Time': 'first'
        }).reset_index()
        
        journey = journey.merge(
            cancel_info,
            left_on='ID',
            right_on='Shift ID',
            how='left',
            suffixes=('_booking', '_cancel')
        )
        
        # Calculate success metrics
        journey['was_booked'] = journey['First Booking'].notnull()
        journey['was_cancelled'] = journey['Created At_cancel'].notnull()
        journey['was_completed'] = journey['Verified']
        
        # Calculate time to shift start (from booking)
        journey['lead_time'] = (
            journey['Start'] - journey['First Booking']
        ).dt.total_seconds() / 3600  # Convert to hours
        
        self.success_journey = journey
        
        # Print initial success metrics
        self._print_success_metrics()
    
    def _print_success_metrics(self):
        """Print key success metrics from the journey data."""
        metrics = self.success_journey.agg({
            'was_booked': 'mean',
            'was_cancelled': 'mean',
            'was_completed': 'mean'
        })
        
        print("\n=== Success Metrics ===")
        print(f"Booking Rate: {metrics['was_booked']:.1%}")
        print(f"Cancellation Rate: {metrics['was_cancelled']:.1%}")
        print(f"Completion Rate: {metrics['was_completed']:.1%}")
    
    def analyze_verification_discrepancy(self):
        """
        Investigates why shifts might be verified without appearing in booking logs.
        """
        verified_shifts = self.success_journey[self.success_journey['Verified']]
        unbooked_verified = verified_shifts[~verified_shifts['was_booked']]
        
        results = {
            'overview': {
                'total_shifts': len(self.success_journey),
                'verified_shifts': len(verified_shifts),
                'unbooked_verified': len(unbooked_verified),
                'verification_rate': len(verified_shifts) / len(self.success_journey),
                'unbooked_verified_rate': len(unbooked_verified) / len(verified_shifts)
            },
            'unbooked_verified_patterns': {
                'by_role': unbooked_verified['Agent Req'].value_counts(),
                'by_shift_type': unbooked_verified['Shift Type'].value_counts(),
                'by_facility': unbooked_verified['Facility ID'].value_counts().head()
            }
        }
        
        agent_patterns = self.success_journey.groupby(
            self.success_journey['Agent ID'].isnull()
        ).agg({
            'was_booked': 'mean',
            'Verified': 'mean',
            'was_cancelled': 'mean'
        }).round(3)
        
        results['agent_id_patterns'] = agent_patterns
        
        return results
    
    def get_success_patterns(self):
        """
        Analyzes patterns in successfully completed shifts.
        """
        successful = self.success_journey[
            (self.success_journey['Verified']) & 
            (~self.success_journey['was_cancelled'])
        ]
        
        patterns = {
            'timing': {
                'hour_distribution': successful['Start'].dt.hour.value_counts().sort_index(),
                'day_distribution': successful['Start'].dt.day_name().value_counts(),
                'lead_times': successful['lead_time'].describe()
            },
            'characteristics': {
                'role_distribution': successful['Agent Req'].value_counts(),
                'shift_types': successful['Shift Type'].value_counts(),
                'charge_rates': successful.groupby('Agent Req')['Charge'].agg(['mean', 'std'])
            },
            'facility_patterns': {
                'success_rates': (
                    self.success_journey.groupby('Facility ID')['Verified'].agg(['mean', 'count'])
                    .sort_values('mean', ascending=False)
                    .query('count >= 10')  # Only facilities with sufficient data
                )
            }
        }
        
        return patterns

In [None]:
# Build the analyzer from the three loaded datasets
analyzer = ShiftSuccessAnalysis(shifts_df, bookings_df, cancellations_df)

# Verified shifts that never show up in the booking logs
discrepancy_results = analyzer.analyze_verification_discrepancy()

print("\n=== Verification Discrepancy Analysis ===")
print("\nOverview:")
for metric, value in discrepancy_results['overview'].items():
    # Rates are rendered as percentages, counts with thousands separators
    print(f"{metric}: " + format(value, '.1%' if 'rate' in metric else ','))

print(
    "\nUnbooked Verified Shifts by Role:",
    discrepancy_results['unbooked_verified_patterns']['by_role'],
    sep="\n",
)

print("\nAgent ID Impact:", discrepancy_results['agent_id_patterns'], sep="\n")

# Patterns among verified, non-cancelled shifts
success_patterns = analyzer.get_success_patterns()

print("\n=== Success Patterns ===")
print(
    "\nMost Successful Shift Types:",
    success_patterns['characteristics']['shift_types'],
    sep="\n",
)

print(
    "\nAverage Charge Rates for Successful Shifts:",
    success_patterns['characteristics']['charge_rates'],
    sep="\n",
)

print(
    "\nTop 5 Facilities by Success Rate (min 10 shifts):",
    success_patterns['facility_patterns']['success_rates'].head(),
    sep="\n",
)

# Choosing a Path: Backup Pool

In [None]:
# 1. Backup Pool Estimation
# The goal is to calculate the size of the backup pool needed to cover a
# target share (75% by default) of late cancellations.

# Assumptions
BACKUP_COVERAGE_TARGET = 0.75  # Cover 75% of late cancellations

def estimate_backup_pool(shifts_df, cancellations_df, coverage_target=None,
                         late_threshold_hours=4):
    """
    Estimate the size of a backup pool needed to cover late cancellations.

    The estimate divides the number of late cancellations to cover by the
    average number of late cancellations per cancelling worker. Contextual
    insights are printed; only the pool size is returned.

    Parameters:
    - shifts_df (pd.DataFrame): Shift data. Currently unused; kept so existing
      callers keep working.
    - cancellations_df (pd.DataFrame): Cancellations data with 'Lead Time' and
      'Worker ID' columns.
    - coverage_target (float | None): Share of late cancellations to cover.
      None (default) uses the module-level BACKUP_COVERAGE_TARGET, looked up
      at call time.
    - late_threshold_hours (float): A cancellation is "late" when its
      'Lead Time' is strictly below this value. Defaults to 4.

    Returns:
    - int: Estimated pool size needed for target coverage (0 when there are
      no late cancellations attributable to a worker).
    """
    if coverage_target is None:
        # Resolve here rather than in the signature so that re-assigning the
        # constant in a later cell is still honoured
        coverage_target = BACKUP_COVERAGE_TARGET

    # Step 1: Focus on Late Cancellations
    late_cancellations = cancellations_df[
        cancellations_df['Lead Time'] < late_threshold_hours
    ]
    total_late_cancels = len(late_cancellations)

    print("=== Backup Pool Estimation for Late Cancellations ===")
    print(f"Total Late Cancellations (<{late_threshold_hours:g}hrs): {total_late_cancels:,}")

    # Step 2: Estimate coverage required (truncated to whole shifts)
    target_coverage = int(total_late_cancels * coverage_target)
    print(f"Target Coverage ({coverage_target:.0%}): {target_coverage:,} shifts")

    # Step 3: Calculate HCP Availability and Estimate Pool Size
    late_cancel_hcps = late_cancellations['Worker ID'].value_counts()
    if late_cancel_hcps.empty:
        # The mean of an empty Series is NaN; report a plain zero instead
        avg_shifts_per_hcp = 0.0
    else:
        avg_shifts_per_hcp = late_cancel_hcps.mean()

    if avg_shifts_per_hcp > 0:
        pool_size = int(np.ceil(target_coverage / avg_shifts_per_hcp))
    else:
        pool_size = 0

    print(f"Average Late Cancellations per HCP: {avg_shifts_per_hcp:.2f}")
    print(f"Estimated Backup Pool Size: {pool_size} HCPs")

    print("\nContext:")
    print(f"To meet {coverage_target:.0%} coverage of late cancellations, we estimate needing a pool of pre-vetted,")
    print(f"reliable HCPs who can cover approximately {avg_shifts_per_hcp:.2f} late cancellations on average.")
    print("This estimate assumes that reliable HCPs are distributed evenly across cancellations.")

    return pool_size

def identify_reliable_hcps(bookings_df, cancellations_df, threshold=0.1):
    """
    Identify reliable HCPs with cancellation rates below a given threshold.

    The cancellation rate of a worker is the number of rows they have in
    ``cancellations_df`` divided by the number of rows they have in
    ``bookings_df``. Workers that appear only in the cancellations data get an
    infinite rate and are therefore never classed as reliable.

    Parameters:
    - bookings_df (pd.DataFrame): Booking logs with Worker IDs.
    - cancellations_df (pd.DataFrame): Cancellations data with Worker IDs.
    - threshold (float): Maximum cancellation rate for reliability.

    Returns:
    - Reliable HCPs as a DataFrame indexed by worker, with 'Total Shifts',
      'Cancellations' and 'Cancellation Rate' columns, ordered by lowest
      cancellation rate and then by most shifts worked.
    """
    print("\n=== Reliable HCP Identification ===")

    # Step 1: Calculate Total Shifts and Cancellations per Worker
    total_shifts = bookings_df['Worker ID'].value_counts()
    total_cancellations = cancellations_df['Worker ID'].value_counts()

    # Step 2: Calculate Cancellation Rate
    # The two counts are aligned on worker ID; a worker missing from one side
    # gets a count of 0 there
    reliability_df = pd.DataFrame({
        'Total Shifts': total_shifts,
        'Cancellations': total_cancellations
    }).fillna(0)
    reliability_df['Cancellation Rate'] = reliability_df['Cancellations'] / reliability_df['Total Shifts']

    # Step 3: Identify Reliable Workers
    reliable_hcps = reliability_df[reliability_df['Cancellation Rate'] <= threshold]
    # Break ties on rate by shift volume so the ranking matches the stated
    # prioritisation instead of leaving equal-rate workers in arbitrary order
    reliable_hcps_sorted = reliable_hcps.sort_values(
        by=['Cancellation Rate', 'Total Shifts'],
        ascending=[True, False]
    )

    print(f"Total Workers Analyzed: {len(reliability_df):,}")
    print(f"Workers with Cancellation Rate ≤ {threshold*100:.0f}%: {len(reliable_hcps):,}")
    print("\nTop 5 Most Reliable Workers:")
    print(reliable_hcps_sorted.head())

    print("\nContext:")
    # Report the threshold actually applied rather than a fixed 10%
    print(f"Reliable HCPs are defined as those with a cancellation rate ≤ {threshold*100:.0f}%.")
    print("This pool represents our most dependable workers, making them ideal candidates")
    print("for participation in the backup program. They are prioritized based on:")
    print("1. Total shifts worked.")
    print("2. Low cancellation counts.")

    return reliable_hcps_sorted

# Size the backup pool, then find the workers eligible for it
backup_pool_size = estimate_backup_pool(shifts_df, cancellations_df)
reliable_hcps = identify_reliable_hcps(bookings_df, cancellations_df)

# Final summary, one line per headline figure
print(
    "\n=== Summary for WBD ===",
    f"Estimated Backup Pool Size (75% Late Cancel Coverage): {backup_pool_size} HCPs",
    f"Reliable HCPs Identified (Cancellation Rate ≤ 10%): {len(reliable_hcps)} workers",
    sep="\n",
)




In [None]:
# NOTE(review): `explore_summary` and `summary` are not defined in this part
# of the notebook — confirm the cells that create them run before this one.
explore_summary(summary)