In [None]:
# === Configuration and Imports ===
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

# Notebook-wide pandas display settings: never truncate the column list and
# render every float with exactly three decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value

class DataLoader:
    """Read the project's CSV inputs from a single data directory."""

    def __init__(self, data_dir='data'):
        # Directory that every filename given to load_file is resolved against.
        self.data_dir = data_dir

    def load_file(self, filename):
        """Read ``filename`` from the data directory into a DataFrame.

        Any exception raised while reading is reported on stdout and
        swallowed, so one unreadable file does not abort the caller.

        Args:
            filename: Name of the CSV file, relative to ``self.data_dir``.

        Returns:
            The parsed DataFrame, or None when loading failed.
        """
        frame = None
        try:
            frame = pd.read_csv(os.path.join(self.data_dir, filename))
        except Exception as err:
            print(f"Error loading {filename}: {str(err)}")
        return frame

    def load_all_datasets(self):
        """Load the shifts, bookings and cancellations CSV files.

        Returns:
            A dict keyed by 'shifts', 'bookings' and 'cancellations'; each
            value is whatever load_file returned for that file (a DataFrame,
            or None if the file could not be read).
        """
        sources = (
            ('shifts', 'cleveland_shifts_large.csv'),
            ('bookings', 'booking_logs_large.csv'),
            ('cancellations', 'cancel_logs_large.csv'),
        )
        loaded = {}
        for label, csv_name in sources:
            loaded[label] = self.load_file(csv_name)
        return loaded

class DataCleaner:
    """Stateless cleaning helpers for the shifts, bookings and cancellations data.

    Every public method is a static method that works on a copy of its input
    (the caller's DataFrame is never modified) and passes a ``None`` input
    straight through.
    """

    @staticmethod
    def _parse_datetime_column(df, col, error_prefix, **kwargs):
        """Convert ``df[col]`` to datetimes in place, coercing on failure.

        A first, strict conversion is attempted with ``kwargs`` forwarded to
        ``pd.to_datetime``. If it raises, the error is printed as
        ``"<error_prefix>: <message>"`` and the conversion is retried with
        ``errors='coerce'`` so unparseable entries become NaT instead of
        leaving the whole column unconverted.

        Args:
            df: DataFrame to update; must already be a private copy.
            col: Name of the column to convert.
            error_prefix: Text printed in front of the exception message.
            **kwargs: Extra keyword arguments for the strict attempt only.
        """
        try:
            df[col] = pd.to_datetime(df[col], **kwargs)
        except Exception as e:
            # Deliberate best-effort: report, then fall back to coercion.
            print(f"{error_prefix}: {str(e)}")
            df[col] = pd.to_datetime(df[col], errors='coerce')

    @staticmethod
    def clean_shifts(df):
        """Return a copy of ``df`` with 'Start', 'End' and 'Created At' parsed.

        Each column is parsed with ``format='mixed'`` first and falls back to
        ``errors='coerce'`` if that raises.

        Returns:
            The cleaned copy, or None when ``df`` is None.
        """
        if df is None:
            return None

        df = df.copy()
        for col in ('Start', 'End', 'Created At'):
            DataCleaner._parse_datetime_column(
                df, col, f"Error converting {col}", format='mixed')
        return df

    @staticmethod
    def clean_bookings(df):
        """Return a copy of ``df`` with 'Created At' parsed as datetimes.

        Returns:
            The cleaned copy, or None when ``df`` is None.
        """
        if df is None:
            return None

        df = df.copy()
        DataCleaner._parse_datetime_column(
            df, 'Created At', "Error cleaning bookings")
        return df

    @staticmethod
    def clean_cancellations(df):
        """Return a copy of ``df`` with its timestamp columns parsed.

        'Created At' and, when present, 'Shift Start Logs' are converted
        independently, each with the same coerce-on-failure fallback used by
        the other cleaners, so one bad value no longer leaves both columns
        as raw strings. A missing 'Created At' column is reported but is not
        fatal.

        Returns:
            The cleaned copy, or None when ``df`` is None.
        """
        if df is None:
            return None

        df = df.copy()
        prefix = "Error cleaning cancellations"

        if 'Created At' in df.columns:
            DataCleaner._parse_datetime_column(df, 'Created At', prefix)
        else:
            # Same report a failed column lookup used to produce.
            print(f"{prefix}: {'Created At'!r}")

        if 'Shift Start Logs' in df.columns:
            DataCleaner._parse_datetime_column(df, 'Shift Start Logs', prefix)

        return df

    @staticmethod
    def clean_lead_times(df):
        """Drop unusable 'Lead Time' rows and add analysis columns.

        Rows whose 'Lead Time' is null or infinite are removed. The surviving
        rows gain four columns: 'clean_lead_time' (a copy of 'Lead Time'),
        'cancellation_category' (see categorize_lead_time),
        'is_extreme_negative' (lead time < -72) and 'is_extreme_positive'
        (lead time > 1000).

        Args:
            df: DataFrame with a numeric 'Lead Time' column, or None.

        Returns:
            ``(cleaned_df, quality_stats)`` where ``quality_stats`` is a
            Series with 'original_rows', 'null_lead_times',
            'infinite_values', 'final_rows' and 'removed_rows'.
            ``(None, None)`` when ``df`` is None.
        """
        if df is None:
            return None, None

        df = df.copy()
        lead_time = df['Lead Time']
        is_null = lead_time.isnull()
        # np.isinf (not ~np.isfinite) so that NaN rows are counted only as
        # nulls and not a second time as infinite values.
        is_infinite = np.isinf(lead_time)

        # Track data quality
        quality_stats = {
            'original_rows': len(df),
            'null_lead_times': is_null.sum(),
            'infinite_values': is_infinite.sum(),
        }

        # Remove invalid data; copy so the column assignments below write to
        # an independent frame rather than to a slice of the unfiltered one.
        df = df[~(is_null | is_infinite)].copy()

        # Add analysis columns
        df['clean_lead_time'] = df['Lead Time']
        df['cancellation_category'] = df['clean_lead_time'].apply(
            DataCleaner.categorize_lead_time)

        # Flag extreme values
        df['is_extreme_negative'] = df['Lead Time'] < -72
        df['is_extreme_positive'] = df['Lead Time'] > 1000

        quality_stats['final_rows'] = len(df)
        quality_stats['removed_rows'] = (
            quality_stats['original_rows'] - quality_stats['final_rows'])

        return df, pd.Series(quality_stats)

    @staticmethod
    def categorize_lead_time(hours):
        """Map a lead time to its cancellation category label.

        Thresholds: below 0 -> 'No-Show', below 4 -> 'Late (<4hrs)',
        below 24 -> 'Same Day', below 72 -> 'Advance (<3 days)',
        otherwise 'Early (3+ days)'.

        Note that NaN compares False against every threshold and therefore
        also yields 'Early (3+ days)'; clean_lead_times removes NaN rows
        before calling this.
        """
        if hours < 0:
            return 'No-Show'
        elif hours < 4:
            return 'Late (<4hrs)'
        elif hours < 24:
            return 'Same Day'
        elif hours < 72:
            return 'Advance (<3 days)'
        return 'Early (3+ days)'

class DataAnalyzer:
    """Run the shift, cancellation and revenue analyses on cleaned data.

    Each ``analyze_*`` method returns its result and also records it in
    ``self.summary``.
    """

    def __init__(self, shifts_df, bookings_df, cancellations_df):
        # Cleaned input frames; any of them may be None if loading failed.
        self.shifts_df = shifts_df
        self.bookings_df = bookings_df
        self.cancellations_df = cancellations_df
        # Collector for everything the analyze_* methods produce.
        self.summary = DataSummary()

    def analyze_shifts(self):
        """Describe 'Charge' and 'Time' and add time-of-shift columns.

        Side effect: 'Hour', 'Day' and 'Month' columns derived from 'Start'
        are added to ``self.shifts_df`` itself.

        Returns:
            The ``describe()`` table for 'Charge' and 'Time', or None when
            there is no shifts data.
        """
        shifts = self.shifts_df
        if shifts is None:
            return None

        stats_table = shifts[['Charge', 'Time']].describe()

        # Break the shift start timestamp into its calendar parts.
        start = shifts['Start']
        shifts['Hour'] = start.dt.hour
        shifts['Day'] = start.dt.day_name()
        shifts['Month'] = start.dt.month

        self.summary.add_summary('shifts', 'numeric_stats', stats_table)
        return stats_table

    def analyze_cancellations(self):
        """Count cancellations per lead-time category.

        Returns:
            A dict with 'total_cancellations', 'categories' and
            'quality_stats', or None when there is no cancellations data.
        """
        if self.cancellations_df is None:
            return None

        cleaned, quality = DataCleaner.clean_lead_times(self.cancellations_df)
        if cleaned is None:
            return None

        per_category = cleaned['cancellation_category'].value_counts()
        report = {
            'total_cancellations': len(cleaned),
            'categories': per_category.to_dict(),
            'quality_stats': quality.to_dict()
        }

        self.summary.add_summary('cancellations', 'analysis', report)
        return report

    def analyze_economic_impact(self):
        """Compute revenue figures from the shifts data.

        Requires both the shifts and the cancellations frames to be present,
        although only the shifts frame is read here.

        Returns:
            A dict with 'total_revenue' (sum of Charge * Time) and
            'average_rate' (mean Charge), or None when either frame is
            missing.
        """
        if self.shifts_df is None or self.cancellations_df is None:
            return None

        charge = self.shifts_df['Charge']
        revenue_per_shift = charge * self.shifts_df['Time']

        report = {
            'total_revenue': revenue_per_shift.sum(),
            'average_rate': charge.mean()
        }

        self.summary.add_summary('economic', 'impact', report)
        return report

class DataSummary:
    """Two-level store of analysis results: dataset name -> summary type -> data."""

    def __init__(self):
        # {dataset_name: {summary_type: data}}
        self.summaries = {}

    def add_summary(self, dataset_name, summary_type, data):
        """Store ``data`` under ``dataset_name`` / ``summary_type``.

        An existing entry with the same two keys is overwritten.
        """
        self.summaries.setdefault(dataset_name, {})[summary_type] = data

    def get_summary(self, dataset_name, summary_type=None):
        """Look up stored results.

        Args:
            dataset_name: First-level key.
            summary_type: Optional second-level key. When omitted (or falsy)
                the whole dict for ``dataset_name`` is returned.

        Returns:
            The requested entry, or None when it does not exist.
        """
        if not summary_type:
            return self.summaries.get(dataset_name)
        per_dataset = self.summaries.get(dataset_name, {})
        return per_dataset.get(summary_type)

    def print_summary(self, dataset_name):
        """Print every stored summary for ``dataset_name``; no-op if unknown."""
        if dataset_name not in self.summaries:
            return
        print(f"\nSummary for {dataset_name}:")
        for kind, payload in self.summaries[dataset_name].items():
            print(f"\n{kind}:")
            print(payload)

def main():
    """Load, clean and analyze all datasets.

    Returns:
        The DataSummary holding the results of the three analyses.
    """
    # Load the raw CSV data (entries are None for files that failed to load).
    raw = DataLoader().load_all_datasets()

    # Pair each dataset with its cleaner; keys double as DataAnalyzer kwargs.
    cleaners = {
        'shifts': DataCleaner.clean_shifts,
        'bookings': DataCleaner.clean_bookings,
        'cancellations': DataCleaner.clean_cancellations,
    }
    cleaned = {label: clean(raw[label]) for label, clean in cleaners.items()}

    analyzer = DataAnalyzer(
        shifts_df=cleaned['shifts'],
        bookings_df=cleaned['bookings'],
        cancellations_df=cleaned['cancellations'],
    )

    # Each analysis records its own result in analyzer.summary.
    analyzer.analyze_shifts()
    analyzer.analyze_cancellations()
    analyzer.analyze_economic_impact()

    return analyzer.summary

# Run the pipeline only when this file is executed directly (not on import);
# the object returned by main() is kept in the module-level name `summary`.
if __name__ == "__main__":
    summary = main()