In [None]:
# Check the kernel: print the path of the Python interpreter executing this code
import sys
print(sys.executable)


In [None]:
# Smoke test: each import raises ImportError if its library is missing,
# so reaching the print below means all three imported successfully.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
print("Libraries are properly installed!")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

# Configure pandas display: no cap on the number of columns shown,
# and floats rendered with exactly three decimal places
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value

def load_and_clean_bookings(df):
    """Return a cleaned copy of the booking logs dataset.

    The caller's frame is not modified. In the returned copy the
    'Created At' column is parsed with ``pd.to_datetime``.
    """
    bookings = df.copy()
    # Parse the creation timestamp into a datetime column
    created_at = pd.to_datetime(bookings['Created At'])
    bookings['Created At'] = created_at
    # Add any specific cleaning steps
    return bookings

def load_and_clean_cancellations(df):
    """Return a cleaned copy of the cancellation logs dataset.

    The caller's frame is not modified. In the returned copy the
    'Created At' and 'Shift Start Logs' columns are parsed with
    ``pd.to_datetime``.
    """
    cancellations = df.copy()
    # Parse each timestamp column in turn
    for column in ('Created At', 'Shift Start Logs'):
        cancellations[column] = pd.to_datetime(cancellations[column])
    # Add any specific cleaning steps
    return cancellations

def load_and_clean_shifts(df):
    """Return a cleaned copy of the shifts dataset.

    The caller's frame is not modified. In the returned copy the
    'Start', 'End' and 'Created At' columns are parsed with
    ``pd.to_datetime``; all three columns must be present.
    """
    shifts = df.copy()
    # Parse each timestamp column in turn
    for column in ('Start', 'End', 'Created At'):
        shifts[column] = pd.to_datetime(shifts[column])
    # Add any specific cleaning steps
    return shifts

class DataSummary:
    """Store and retrieve analysis results, grouped by dataset name.

    Results live in ``self.summaries``, a nested dict of the form
    ``{dataset_name: {summary_type: data}}``.
    """

    def __init__(self):
        """Create an empty summary store."""
        self.summaries = {}

    def add_summary(self, dataset_name, summary_type, data):
        """Store ``data`` under ``dataset_name`` / ``summary_type``.

        An existing entry with the same dataset name and summary type
        is overwritten.
        """
        self.summaries.setdefault(dataset_name, {})[summary_type] = data

    def get_summary(self, dataset_name, summary_type=None):
        """Retrieve stored summary statistics.

        With ``summary_type`` given, return that single entry, or
        ``None`` if the dataset or the entry does not exist. With
        ``summary_type`` omitted (``None``), return the whole dict of
        summaries for the dataset, or ``None`` if the dataset is unknown.
        """
        # Compare against None rather than relying on truthiness, so that
        # falsy-but-valid keys such as '' or 0 still select a single entry.
        if summary_type is not None:
            return self.summaries.get(dataset_name, {}).get(summary_type)
        return self.summaries.get(dataset_name)

    def print_summary(self, dataset_name):
        """Print every stored summary for ``dataset_name``.

        Prints nothing when the dataset has no stored summaries.
        """
        if dataset_name in self.summaries:
            print(f"\nSummary for {dataset_name}:")
            for summary_type, data in self.summaries[dataset_name].items():
                print(f"\n{summary_type}:")
                print(data)

# Initialize summary storage: a single module-level DataSummary instance
# that the analysis code below records its results into
summary = DataSummary()

In [None]:
import os
# Show where we are running from, then what the relative 'data' directory contains
working_dir = os.getcwd()
print("Current working directory:", working_dir)
data_files = os.listdir('data')
print("\nFiles in data directory:", data_files)

def load_and_clean_shifts(df):
    """Return a cleaned copy of the shifts dataset.

    The caller's frame is not modified. Whichever of the 'Start', 'End'
    and 'Created At' columns are present get parsed with
    ``pd.to_datetime(..., format='mixed')``; absent columns are skipped.
    """
    shifts = df.copy()

    # Only touch the datetime columns that actually exist in this frame
    datetime_columns = ('Start', 'End', 'Created At')
    present = [name for name in datetime_columns if name in shifts.columns]

    # format='mixed' lets each value be parsed with its own inferred format
    for name in present:
        shifts[name] = pd.to_datetime(shifts[name], format='mixed')

    return shifts

# Read the shifts sample from disk
shifts_df = pd.read_csv('data/Cleveland_shifts_Sample_100_rows.csv')

# Run it through the cleaning helper (datetime parsing)
shifts_df = load_and_clean_shifts(shifts_df)

# Compute the overview figures once so the printed and the stored values match
shifts_shape = shifts_df.shape
shifts_dtypes = shifts_df.dtypes
shifts_missing = shifts_df.isnull().sum()

# Report the overview
print("\nDataset Shape:", shifts_shape)
print("\nColumns:", shifts_df.columns.tolist())
print("\nData Types:\n", shifts_dtypes)
print("\nMissing Values:\n", shifts_missing)

# Record the same figures in the shared summary store
summary.add_summary('shifts', 'shape', shifts_shape)
summary.add_summary('shifts', 'dtypes', shifts_dtypes)
summary.add_summary('shifts', 'missing_values', shifts_missing)

# Preview the leading rows
print("\nFirst few rows:")
print(shifts_df.head())

In [None]:
# Descriptive statistics for the 'Charge' and 'Time' columns
numeric_stats = shifts_df[['Charge', 'Time']].describe()
print("\nNumerical Statistics:")
print(numeric_stats)

# Category frequencies (missing values excluded from the counts)
print("\nShift Type Distribution:")
shift_type_counts = shifts_df['Shift Type'].value_counts(dropna=True)
print(shift_type_counts)

print("\nAgent Requirement Distribution:")
agent_req_counts = shifts_df['Agent Req'].value_counts(dropna=True)
print(agent_req_counts)

# How many rows have no missing value in any column
complete_rows = len(shifts_df.dropna())
print(f"\nComplete rows: {complete_rows} out of {len(shifts_df)}")

# Derive hour-of-day and weekday name from each shift's start time
shift_starts = shifts_df['Start']
shifts_df['Hour'] = shift_starts.dt.hour
shifts_df['Day'] = shift_starts.dt.day_name()

print("\nShifts by hour of day:")
hourly_counts = shifts_df['Hour'].value_counts().sort_index()
print(hourly_counts)

print("\nShifts by day of week:")
daily_counts = shifts_df['Day'].value_counts()
print(daily_counts)

# Record the results in the shared summary store
summary.add_summary('shifts', 'numeric_stats', numeric_stats)
summary.add_summary('shifts', 'shift_types', shift_type_counts.to_dict())
summary.add_summary('shifts', 'agent_types', agent_req_counts.to_dict())