# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

class DataCleaner:
    """Stateless helpers that parse timestamps and tidy the raw log tables.

    Every method is a ``staticmethod`` and works on a copy of the frame it
    receives, so the caller's DataFrame is never modified.
    """

    @staticmethod
    def load_and_clean_shifts(df):
        """Return a copy of the shifts table with its timestamp columns parsed.

        Args:
            df: Raw shifts DataFrame.

        Returns:
            A new DataFrame in which whichever of 'Start', 'End' and
            'Created At' are present have been converted to datetimes.
            Columns that are absent are skipped silently.
        """
        df = df.copy()
        datetime_cols = ['Start', 'End', 'Created At']
        for col in datetime_cols:
            if col in df.columns:
                # 'mixed' infers the format for each value individually.
                df[col] = pd.to_datetime(df[col], format='mixed')
        return df

    @staticmethod
    def load_and_clean_bookings(df):
        """Return a copy of the booking log with 'Created At' parsed.

        Args:
            df: Raw booking-log DataFrame; must contain 'Created At'.

        Returns:
            A new DataFrame with 'Created At' converted to datetimes.

        Raises:
            KeyError: If the 'Created At' column is missing.
        """
        # NOTE(review): unlike load_and_clean_shifts this does not pass
        # format='mixed' and does not tolerate a missing column - confirm
        # that the difference is intentional.
        df = df.copy()
        df['Created At'] = pd.to_datetime(df['Created At'])
        return df

    @staticmethod
    def load_and_clean_cancellations(df):
        """Return a copy of the cancellation log with timestamps parsed.

        Args:
            df: Raw cancellation-log DataFrame; must contain 'Created At'
                and 'Shift Start Logs'.

        Returns:
            A new DataFrame with both columns converted to datetimes.

        Raises:
            KeyError: If either column is missing.
        """
        df = df.copy()
        df['Created At'] = pd.to_datetime(df['Created At'])
        df['Shift Start Logs'] = pd.to_datetime(df['Shift Start Logs'])
        return df

    @staticmethod
    def categorize_lead_time(hours):
        """Map a lead time in hours to its cancellation category label.

        Buckets (lower bound inclusive, upper bound exclusive):
            hours < 0        -> 'No-Show'
            0 <= hours < 4   -> 'Late (<4hrs)'
            4 <= hours < 24  -> 'Same Day'
            24 <= hours < 72 -> 'Advance (<3 days)'
            hours >= 72      -> 'Early (3+ days)'

        NaN fails every comparison and therefore falls through to
        'Early (3+ days)'; filter missing values out before calling this.

        Args:
            hours: Lead time as a number.

        Returns:
            The category label as a string.
        """
        if hours < 0:
            return 'No-Show'
        elif hours < 4:
            return 'Late (<4hrs)'
        elif hours < 24:
            return 'Same Day'
        elif hours < 72:
            return 'Advance (<3 days)'
        return 'Early (3+ days)'

    @staticmethod
    def clean_lead_times(df):
        """Drop unusable lead times and categorize the remaining rows.

        Rows whose 'Lead Time' is null or infinite are removed. The surviving
        rows gain four columns: 'clean_lead_time' (a copy of 'Lead Time'),
        'cancellation_category', 'is_extreme_negative' (lead time < -72) and
        'is_extreme_positive' (lead time > 1000).

        Args:
            df: Cancellation DataFrame with a numeric 'Lead Time' column.

        Returns:
            A ``(cleaned_df, quality_stats)`` tuple. ``quality_stats`` is a
            Series with 'original_rows', 'null_lead_times',
            'infinite_values', 'final_rows' and 'removed_rows'.
        """
        df = df.copy()
        lead_time = df['Lead Time']

        is_null = lead_time.isnull()
        # np.isinf is False for NaN, so nulls are not counted a second time
        # here (``~np.isfinite`` would have included them).
        is_infinite = np.isinf(lead_time)

        quality_stats = {
            'original_rows': len(df),
            'null_lead_times': is_null.sum(),
            'infinite_values': is_infinite.sum()
        }

        # Take an explicit copy of the filtered rows so the column assignments
        # below write to an independent frame instead of a slice of ``df``.
        df = df[~(is_null | is_infinite)].copy()
        df['clean_lead_time'] = df['Lead Time']
        df['cancellation_category'] = df['clean_lead_time'].apply(DataCleaner.categorize_lead_time)

        df['is_extreme_negative'] = df['Lead Time'] < -72
        df['is_extreme_positive'] = df['Lead Time'] > 1000

        quality_stats['final_rows'] = len(df)
        quality_stats['removed_rows'] = quality_stats['original_rows'] - quality_stats['final_rows']

        return df, pd.Series(quality_stats)

class DataAnalyzer:
    """Run the cancellation, booking and revenue analyses.

    Each ``analyze_*`` method stores a plain-dict summary under its own key
    in ``self.summaries`` ('cancellations', 'bookings', 'economic').
    """

    def __init__(self):
        # Summary dicts keyed by analysis name, filled in by analyze_* calls.
        self.summaries = {}

    def analyze_cancellation_patterns(self, clean_cancellations, shifts_df):
        """Summarize cancellations by action, role and shift type.

        Args:
            clean_cancellations: Cancellation rows with 'Action', 'Shift ID'
                and 'cancellation_category' columns.
            shifts_df: Shifts table with 'ID', 'Agent Req', 'Shift Type' and
                'Charge' columns.

        Returns:
            The cancellations left-joined to those shift columns on
            ``Shift ID == ID``. Also stores ``self.summaries['cancellations']``
            with the action counts and, per role and per shift type, the
            percentage of cancellations falling in each category.
        """
        action_counts = clean_cancellations['Action'].value_counts()

        cancellations_with_shifts = pd.merge(
            clean_cancellations,
            shifts_df[['ID', 'Agent Req', 'Shift Type', 'Charge']],
            left_on='Shift ID',
            right_on='ID',
            how='left'
        )

        # normalize='index' turns each row into fractions that sum to 1;
        # multiplying by 100 expresses them as percentages.
        role_cancels = pd.crosstab(
            cancellations_with_shifts['Agent Req'],
            cancellations_with_shifts['cancellation_category'],
            normalize='index'
        ).round(3) * 100

        shift_cancels = pd.crosstab(
            cancellations_with_shifts['Shift Type'],
            cancellations_with_shifts['cancellation_category'],
            normalize='index'
        ).round(3) * 100

        self.summaries['cancellations'] = {
            'action_types': action_counts.to_dict(),
            'role_patterns': role_cancels.to_dict(),
            'shift_patterns': shift_cancels.to_dict()
        }

        return cancellations_with_shifts

    def analyze_booking_patterns(self, bookings_df, shifts_df, clean_cancellations):
        """Summarize how quickly shifts get booked, overall and per role.

        Args:
            bookings_df: Booking log with 'Shift ID' and 'Created At'.
            shifts_df: Shifts table with 'ID', 'Created At', 'Agent Req',
                'Shift Type' and 'Charge'.
            clean_cancellations: Not used by this method; kept so existing
                callers keep working.

        Returns:
            The bookings left-joined to the shift columns, plus a
            'time_to_fill' column holding the hours between the shift's and
            the booking's 'Created At'. Also stores
            ``self.summaries['bookings']``.
        """
        # Both frames have 'Created At'; the suffixes keep the two apart.
        bookings_with_shifts = pd.merge(
            bookings_df,
            shifts_df[['ID', 'Created At', 'Agent Req', 'Shift Type', 'Charge']],
            left_on='Shift ID',
            right_on='ID',
            how='left',
            suffixes=('_booking', '_shift')
        )

        bookings_with_shifts['time_to_fill'] = (
            pd.to_datetime(bookings_with_shifts['Created At_booking']) -
            pd.to_datetime(bookings_with_shifts['Created At_shift'])
        ).dt.total_seconds() / 3600

        role_bookings = bookings_with_shifts.groupby('Agent Req').agg({
            'Shift ID': 'count',
            'time_to_fill': 'mean',
            'Charge': 'mean'
        }).round(2)

        self.summaries['bookings'] = {
            'time_to_fill': bookings_with_shifts['time_to_fill'].describe().to_dict(),
            'role_patterns': role_bookings.to_dict()
        }

        return bookings_with_shifts

    def analyze_economic_impact(self, shifts_df, cancellations_with_shifts):
        """Compare revenue tied to cancelled shifts with total shift revenue.

        Revenue is computed as ``Charge * Time`` per row.

        Args:
            shifts_df: Shifts table with 'ID', 'Charge', 'Time' and, if it
                has to be looked up, 'Agent Req'.
            cancellations_with_shifts: Cancellation rows with 'Shift ID'.
                Any of 'Agent Req', 'Charge' and 'Time' that are missing are
                looked up from ``shifts_df``.

        Returns:
            A DataFrame indexed by 'Agent Req' with the cancellation count,
            mean and summed 'Charge', and summed 'Time'. Also stores
            ``self.summaries['economic']``.
        """
        required = ['Agent Req', 'Charge', 'Time']
        missing = [
            col for col in required
            if col not in cancellations_with_shifts.columns
        ]
        if missing:
            # Pull in only the absent columns. Re-merging ones that are
            # already present (e.g. 'Charge' from the cancellation-pattern
            # merge) makes pandas suffix them '_x'/'_y', after which the
            # plain 'Charge' lookup below raises KeyError. The join key is
            # renamed so it cannot clash with an existing 'ID' column.
            shift_lookup = shifts_df[['ID'] + missing].rename(
                columns={'ID': '_shift_key'}
            )
            cancellations_with_shifts = pd.merge(
                cancellations_with_shifts,
                shift_lookup,
                left_on='Shift ID',
                right_on='_shift_key',
                how='left'
            ).drop(columns='_shift_key')

        total_revenue = (shifts_df['Charge'] * shifts_df['Time']).sum()
        cancelled_revenue = (cancellations_with_shifts['Charge'] *
                             cancellations_with_shifts['Time']).sum()

        role_impact = cancellations_with_shifts.groupby('Agent Req').agg({
            'Shift ID': 'count',
            'Charge': ['mean', 'sum'],
            'Time': 'sum'
        }).round(2)

        self.summaries['economic'] = {
            'total_revenue': total_revenue,
            'cancelled_revenue': cancelled_revenue,
            'role_impact': role_impact.to_dict()
        }

        return role_impact

def main():
    """Load the three CSV exports, clean them and run every analysis.

    Reads the shifts, booking-log and cancellation-log files from the
    ``data/`` directory, cleans each with ``DataCleaner`` and feeds the
    results through the ``DataAnalyzer`` cancellation, booking and
    economic-impact analyses, in that order.

    Returns:
        The analyzer's ``summaries`` dict.
    """
    cleaner, analyzer = DataCleaner(), DataAnalyzer()

    # Read every input file before any cleaning starts.
    raw_shifts = pd.read_csv('data/cleveland_shifts_large.csv')
    raw_bookings = pd.read_csv('data/booking_logs_large.csv')
    raw_cancellations = pd.read_csv('data/cancel_logs_large.csv')

    # Parse timestamps, then drop cancellations without a usable lead time.
    shifts = cleaner.load_and_clean_shifts(raw_shifts)
    bookings = cleaner.load_and_clean_bookings(raw_bookings)
    cancellations = cleaner.load_and_clean_cancellations(raw_cancellations)
    usable_cancellations, _quality_stats = cleaner.clean_lead_times(cancellations)

    # Only the merged cancellation frame is reused downstream; the other
    # analyses are run for the summaries they record on the analyzer.
    merged_cancellations = analyzer.analyze_cancellation_patterns(
        usable_cancellations, shifts)
    analyzer.analyze_booking_patterns(bookings, shifts, usable_cancellations)
    analyzer.analyze_economic_impact(shifts, merged_cancellations)

    return analyzer.summaries

# Run the pipeline only when this file is executed as a script, not when it
# is imported; the summaries returned by main() are discarded here.
if __name__ == "__main__":
    main()