In [None]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import plotly.express as px
import re
import itertools
import warnings
from datetime import datetime, timedelta
from azure.identity import AzureCliCredential
from azure.storage.filedatalake import DataLakeFileClient, DataLakeServiceClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
warnings.filterwarnings("ignore")

In [None]:
def get_credential():
    """Return an ``AzureCliCredential``.

    The credential picks up the token that was created with ``az login``.
    """
    cli_credential = AzureCliCredential()
    return cli_credential


def read_file(credential, storage_account, container,
                  filepath):
    """Download one file from Azure Data Lake Storage into memory.

    Args:
        credential: Credential object handed to ``DataLakeFileClient``.
        storage_account: Storage account name, used to build the
            ``https://<account>.dfs.core.windows.net`` endpoint.
        container: Name of the file system (container).
        filepath: Path of the file inside the container.

    Returns:
        io.BytesIO: Buffer holding the complete file contents.
    """
    account_url = "https://{}.dfs.core.windows.net".format(storage_account)

    # Use the client as a context manager so its connection is released as
    # soon as the download is done; this function is called once per file.
    with DataLakeFileClient(account_url=account_url,
                            file_system_name=container,
                            file_path=filepath,
                            credential=credential) as file_client:
        downloaded_bytes = io.BytesIO(file_client.download_file().readall())
    return downloaded_bytes


def write_file(buffer, credential, storage_account, container,
                filepath):
    """Upload the contents of an in-memory buffer to Azure Data Lake Storage.

    An existing file at ``filepath`` is overwritten.

    Args:
        buffer: Object exposing ``getvalue()`` (e.g. ``io.BytesIO``) whose
            contents are uploaded.
        credential: Credential object handed to ``DataLakeFileClient``.
        storage_account: Storage account name, used to build the
            ``https://<account>.dfs.core.windows.net`` endpoint.
        container: Name of the file system (container).
        filepath: Destination path inside the container.
    """
    account_url = "https://{}.dfs.core.windows.net".format(storage_account)

    # Use the client as a context manager so its connection is released as
    # soon as the upload is done.
    with DataLakeFileClient(account_url=account_url,
                            file_system_name=container,
                            file_path=filepath,
                            credential=credential) as file_client:
        file_client.upload_data(buffer.getvalue(), overwrite=True)

In [None]:
# Pick up the token that was created with `az login`
credential = get_credential()

# Storage account, container and the folders this notebook reads from
storage_account_name = "prda007itweup01dapsts01"
container_name = "ait-analytics30-s"
account_url = f"https://{storage_account_name}.dfs.core.windows.net"
reports_folder = "426_rapportage/"
schedule_folder = "werkvoorraad_planning/"
schedule2_folder = "werkverdeling_acceptatie/"

In [None]:
# Read file
import os
import pyarrow.parquet as pq

def get_df(folder = "426_rapportage/", file_name = "426_werkvoorraad_MO_20230131_AMB.xlsx"):
    """Download ``folder/file_name`` from the configured storage container.

    Despite the name, no DataFrame is built here: the raw file contents are
    returned so the caller can pass them to a reader such as
    ``pd.read_excel``. The module-level ``credential``,
    ``storage_account_name`` and ``container_name`` select the storage
    location.

    Args:
        folder: Folder inside the container, with or without a trailing slash.
        file_name: Name of the file inside ``folder``.

    Returns:
        io.BytesIO: Buffer with the downloaded file contents.
    """
    # The default and all callers pass a folder that already ends in '/';
    # strip it so the joined path has one separator instead of "folder//file".
    file_path = folder.rstrip('/') + '/' + file_name

    return read_file(credential=credential,
                     storage_account=storage_account_name,
                     container=container_name,
                     filepath=file_path)

In [None]:
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``.

    ``text`` is returned unchanged when it does not start with ``prefix``.
    """
    has_prefix = text.startswith(prefix)
    return text[len(prefix):] if has_prefix else text


def get_all_file_names(account_url, credential, container_name, folder):
    """List the paths of the ``.xlsx`` files below ``folder`` in a container.

    Args:
        account_url: Endpoint URL of the storage account.
        credential: Credential object handed to ``DataLakeServiceClient``.
        container_name: Name of the file system (container) to list.
        folder: Path prefix a file must start with to be included.

    Returns:
        list: Full path names that start with ``folder`` and contain
        ``".xlsx"``.
    """
    file_system_client = DataLakeServiceClient(
        account_url=account_url, credential=credential
    ).get_file_system_client(container_name)

    # get_paths() is called without a path argument; the folder filter is
    # applied here on the returned names.
    return [
        path.name
        for path in file_system_client.get_paths()
        if path.name.startswith(folder) and ".xlsx" in path.name
    ]

In [None]:
# Function to extract filename with the specific date
def extract_filename_with_date(filenames, date):
    """Return the first filename that contains ``date`` as a substring.

    Returns ``None`` when no filename matches.
    """
    matching_names = (name for name in filenames if date in name)
    return next(matching_names, None)

# Function to convert YYYYMMDD to DD-MM-YYYY
def convert_date_format(yyyymmdd):
    """Rearrange a ``YYYYMMDD`` string into ``DD-MM-YYYY``."""
    year, month, day = yyyymmdd[0:4], yyyymmdd[4:6], yyyymmdd[6:8]
    return "{}-{}-{}".format(day, month, year)

# Function to extract filename with the specific date, considering the new format
def extract_filename_with_converted_date(filenames, date):
    """Return the first filename whose embedded date equals ``date``.

    The first run of eight digits in each filename is read as ``YYYYMMDD``,
    converted to ``DD-MM-YYYY`` with ``convert_date_format`` and compared to
    ``date``. Filenames without such a run are skipped.

    Returns ``None`` when no filename matches.
    """
    eight_digits = re.compile(r'\d{8}')
    for candidate in filenames:
        found = eight_digits.search(candidate)
        if found is None:
            continue
        if convert_date_format(found.group(0)) == date:
            return candidate
    return None

## Load data into environment

In [None]:
def transform_schedule(df):
    """Aggregate a wide roster sheet into total scheduled hours per team and day.

    Every column whose name contains ``'Team'`` starts a team section: the
    value in its first row is used as the team name, and the columns between
    it and the next team column (or the end of the frame) are treated as that
    team's members. Member cells are coerced to numbers (cells that cannot be
    parsed count as missing) and summed per row.

    Args:
        df: Roster with ``'Datum'`` and ``'Dag'`` columns plus the team
            sections described above.

    Returns:
        pd.DataFrame: Columns ``Datum``, ``Dag``, ``Team`` and ``TotalHours``,
        one row per input row and team, ordered by input row and then by
        team. The frame is empty (but has these columns) when ``df`` has no
        rows or no team columns.
    """
    output_columns = ['Datum', 'Dag', 'Team', 'TotalHours']

    # Team names are read from the first row, so a frame without rows cannot
    # be transformed; return an empty result instead of failing on .iloc[0].
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    columns = df.columns
    # Column labels read from Excel are not guaranteed to be strings.
    team_columns = [col for col in columns if isinstance(col, str) and 'Team' in col]
    section_starts = [columns.get_loc(col) for col in team_columns]
    section_ends = section_starts[1:] + [len(columns)]

    # Total hours per team, computed for all rows at once. A later section
    # with the same team name replaces an earlier one.
    hours_by_team = {}
    for team_col, start, end in zip(team_columns, section_starts, section_ends):
        members = columns[start + 1:end]
        team_name = df[team_col].iloc[0]
        # astype(object) makes to_numeric judge every cell on its own value,
        # so date/time cells become missing instead of being reinterpreted
        # as integers.
        numeric_hours = df[members].astype(object).apply(pd.to_numeric, errors='coerce')
        hours_by_team[team_name] = numeric_hours.sum(axis=1).to_numpy()

    records = [
        {'Datum': datum, 'Dag': dag, 'Team': team_name, 'TotalHours': hours[position]}
        for position, (datum, dag) in enumerate(zip(df['Datum'], df['Dag']))
        for team_name, hours in hours_by_team.items()
    ]
    return pd.DataFrame(records, columns=output_columns)

In [None]:
# Get the latest working schedules from both roster folders
schedule_file_names = get_all_file_names(account_url, credential, container_name, folder = schedule_folder)
schedule_file_names2 = get_all_file_names(account_url, credential, container_name, folder = schedule2_folder)

# Files from both folders are processed identically, so one loop handles them
# (first folder first, then the second).
weekly_schedules = []
for i in schedule_file_names + schedule_file_names2:
    split_name = i.rsplit('/',1)
    excel_file = get_df(folder=split_name[0] + '/', file_name=split_name[1])

    # Open the workbook once and reuse it for the sheet check and the read
    workbook = pd.ExcelFile(excel_file)
    if 'ROOSTER' not in workbook.sheet_names:
        print(f"Skipping file {i} because 'ROOSTER' sheet doesn't exist.")
        continue

    temp = pd.read_excel(workbook, sheet_name='ROOSTER')
    temp['Datum'] = pd.to_datetime(temp['Datum'], errors='coerce')
    temp = temp.dropna(subset=['Datum'])
    temp = temp.dropna(axis=1, how='all')
    if temp.empty:
        # Later cells read the first row of every weekly schedule
        # (df.loc[0, "Datum"]), so an empty schedule must not be collected.
        print(f"Skipping file {i} because 'ROOSTER' sheet has no rows with a valid 'Datum'.")
        continue

    transformed_temp = transform_schedule(temp)
    weekly_schedules.append(transformed_temp)

schedule = pd.concat(weekly_schedules)

In [None]:
# Load the task-group list and keep only the two columns used later on:
# 'Proces' (matched against the backlog's task name) and its 'Procesgroep'
task_groups = pd.read_excel(
    'Procesgroep taken acceptatie.xlsx',
    sheet_name='Lijstvorm',
)[['Procesgroep', 'Proces']]

In [None]:
# Load every backlog report and stamp its rows with the date from the filename
report_file_names = get_all_file_names(account_url, credential, container_name, folder = reports_folder)

reports = []
for i in report_file_names:
    split_name = i.rsplit('/',1)
    excel_file = get_df(folder=split_name[0] + '/', file_name=split_name[1])
    # The report date is the text between 'MO_' and '_A' in the filename,
    # parsed below as YYYYMMDD
    date = split_name[1].rsplit('MO_', 1)[1].rsplit('_A')[0]
    temp = pd.read_excel(excel_file)
    temp['datum'] = pd.to_datetime(date, format='%Y%m%d').strftime('%Y-%m-%d')
    reports.append(temp)

backlog = pd.concat(reports)

## Prepare data

In [None]:
# Clean the working backlog.
# Rows without a contract number are dropped so the column can be cast to int64.
backlog = backlog[backlog['contractnummer'].notna()]
backlog['contractnummer'] =  backlog['contractnummer'].astype(np.int64)
backlog['datum'] = pd.to_datetime(backlog['datum'])
# Attach the process group by matching the backlog's 'taaknaam' to 'Proces' in task_groups.
# NOTE(review): this reassigns `backlog`, so re-running the cell merges task_groups again.
backlog = backlog.merge(task_groups, left_on='taaknaam', right_on='Proces', how='left')
filtered_backlog = backlog.copy()
# filtered_backlog = backlog[['datum', 'contractnummer', 'Procesgroep_y', 'taaknaam', 'taakomschrijving_aangepast', 'teamcode', 'Uitvoeren voor']]
# The rename only has an effect when the merge produced a suffixed 'Procesgroep_y',
# i.e. when the reports already carry their own 'Procesgroep' column — TODO confirm.
filtered_backlog.rename(columns={'Procesgroep_y': 'Procesgroep'}, inplace=True)
# Exclude the 'Quion' process group
filtered_backlog = filtered_backlog[filtered_backlog['Procesgroep'] != 'Quion']
# ISO week number of the report date
filtered_backlog['week_nummer'] = filtered_backlog['datum'].dt.isocalendar().week

# Team codes to keep
team_codes_to_keep = ['Zuid', 'Midden', 'Noord', 'IMD']

# Keep only those teams, relabel 'Attens Hypotheek' rows to team 'Attens',
# then drop whatever is still labelled 'IMD'
filtered_backlog = filtered_backlog[filtered_backlog['teamcode'].isin(team_codes_to_keep)]
filtered_backlog.loc[filtered_backlog['productlijn'] == 'Attens Hypotheek', 'teamcode'] = 'Attens'
filtered_backlog = filtered_backlog[filtered_backlog["teamcode"] != "IMD"]
# Group by 'Procesgroep' and 'contractnummer', then count the distinct 'Proces' values in each group.
unique_tasks_per_group_and_contract = filtered_backlog.groupby(['Procesgroep', 'contractnummer'])['Proces'].nunique().reset_index(name='unique_tasks')

# Average number of distinct tasks per contract, for each Procesgroep.
average_unique_tasks_per_procesgroep = unique_tasks_per_group_and_contract.groupby('Procesgroep')['unique_tasks'].mean()

# Column names of the norm-time table
column_names = ['Procesgroep', 'Normtijd (in minuten)']

# Norm time per process group, entered by hand (minutes).
# NOTE(review): 'Afronding dossier' is entered as 10.0/2; the reason for halving is not visible here — confirm.
rows = [
    ["Aanvragen", 16.8],
    ["Rebound", 21.0],
    ["1e fiat", 55.8],
    ["2e fiat", 27.0],
    ['Afronding dossier', 10.0/2],
    ['Schoningstaken', 10.0]
]

# Create the norm-time dataframe
task_times = pd.DataFrame(rows, columns=column_names)

# Attach the norm time to every backlog row; process groups missing from task_times get NaN (left join)
filtered_backlog = filtered_backlog.merge(task_times, left_on="Procesgroep", right_on="Procesgroep", how="left")
# Convert minutes to hours (rounded to 2 decimals) and rename the column accordingly
filtered_backlog['Normtijd (in minuten)'] = (filtered_backlog['Normtijd (in minuten)'] / 60).round(2)
filtered_backlog.rename(columns={'Normtijd (in minuten)': 'Normtijd (in hours)'}, inplace=True)
# Shift every report date one day forward.
# NOTE(review): presumably a report describes the workload of the following day — confirm.
filtered_backlog['datum'] = filtered_backlog['datum'] + timedelta(days=1)

In [None]:
# Clean the combined schedule: drop exact duplicate rows, then zero-fill gaps
schedule = schedule.drop_duplicates().fillna(0)

# Identifier columns that must not be coerced to numbers
exclude_columns = ['Datum', 'Dag', 'Team']

# Every remaining column holds hours
employee_columns = [col for col in schedule.columns if col not in exclude_columns]

# Coerce the hour columns to numbers; values that cannot be parsed become 0
numeric_hours = schedule[employee_columns].apply(pd.to_numeric, errors='coerce')
schedule[employee_columns] = numeric_hours.fillna(0)

# Drop rows whose 'Dag' is 0, i.e. rows where the day name was missing before
# the zero-fill above (original note: "Remove weekends")
has_day_name = schedule['Dag'] != 0
schedule = schedule[has_day_name]

In [None]:
# Start date of the schedule/backlog selection, written as DD-MM-YYYY
# (add_working_days parses it with '%d-%m-%Y')
start_date_str = '19-02-2024'

In [None]:
class StopExecution(Exception):
    """Exception raised by this notebook to stop a cell early.

    NOTE(review): ``_render_traceback_`` returns nothing, presumably so that
    IPython/Jupyter shows no traceback when the exception ends the cell —
    confirm.
    """

    def _render_traceback_(self):
        return None

In [None]:
def add_working_days(start_date_str, working_days=10):
    """Return the start date and the date ``working_days`` weekdays after it.

    Args:
        start_date_str: Start date as a ``DD-MM-YYYY`` string.
        working_days: Number of Monday-to-Friday days to step forward.

    Returns:
        tuple: ``(start_date, end_date)`` as ``datetime`` objects. The start
        date itself is never counted; with ``working_days <= 0`` both values
        are equal.
    """
    start_date = datetime.strptime(start_date_str, '%d-%m-%Y')
    one_day = timedelta(days=1)

    end_date = start_date
    remaining = working_days
    while remaining > 0:
        end_date = end_date + one_day
        # weekday() is 5 for Saturday and 6 for Sunday
        if end_date.weekday() not in (5, 6):
            remaining -= 1

    return start_date, end_date

# Resolve the selection window: the start date and the date ten working days later
start_date, end_date = add_working_days(start_date_str)

# Schedule rows inside the window, and the backlog rows dated on the start date
specific_schedule = schedule[(schedule['Datum'] >= start_date) & (schedule['Datum'] <= end_date)]
specific_backlog = filtered_backlog[filtered_backlog['datum'] == (start_date).strftime('%Y-%m-%d')]

# Stop the run early when there is nothing to work with. The checks are
# ordered so that the most specific message is printed.
if specific_schedule.empty and specific_backlog.empty:
    print("Both 'schedule' and 'backlog' are empty.")
    raise StopExecution
elif specific_schedule.empty:
    print("'schedule' is empty.")
    raise StopExecution
elif specific_schedule.iloc[0]['Dag'] == 'zondag':
    print("Day of the week is sunday.")
    raise StopExecution
elif specific_backlog.empty:
    print("'backlog' is empty.")
    raise StopExecution

## Timeseries analysis

In [None]:
# Convert 'datum' to datetime and make sure 'Normtijd (in hours)' is numeric
filtered_backlog['datum'] = pd.to_datetime(filtered_backlog['datum'])
filtered_backlog['Normtijd (in hours)'] = pd.to_numeric(filtered_backlog['Normtijd (in hours)'], errors='coerce')

# Total norm hours per team and date
workload = filtered_backlog.groupby(['teamcode', 'datum'])['Normtijd (in hours)'].sum().reset_index()

# Sort chronologically and index by date so the series can be resampled
workload.sort_values(by='datum', inplace=True)
workload.set_index('datum', inplace=True)

# Get unique team codes
teamcodes = workload['teamcode'].unique()

# All figures below are written to Images/; create the folder up front so
# savefig cannot fail on a missing directory.
os.makedirs('Images', exist_ok=True)

# Analysis for each teamcode
for teamcode in teamcodes:
    print(f"Analysis for Team: {teamcode}")

    # Extract data for current team
    team_workload = workload[workload['teamcode'] == teamcode]
    # Build a continuous daily series; days without backlog count as 0 hours
    team_workload_daily = team_workload.resample('D').sum().fillna(0)
    # Seasonal Decomposition
    decomposition = seasonal_decompose(team_workload_daily['Normtijd (in hours)'], model='additive')

    # Plotting the decomposition
    print("Seasonal Decomposition:")
    fig = decomposition.plot()
    fig.suptitle(f'Seasonal Decomposition for {teamcode}', fontsize=16)
    axes = fig.get_axes()
    # Blank the title of the first panel; the figure title above replaces it
    axes[0].set_title("", fontsize=12)
    fig.savefig(f'Images/decomposition_{teamcode}.png')
    plt.show()

    # Plot ACF and PACF
    print("Autocorrelation Function (ACF) & Partial Autocorrelation Function (PACF):")
    # Pass an explicit axis: without one, plot_acf/plot_pacf draw on a figure
    # of their own, which left a blank 12x6 figure behind and ignored figsize.
    acf_fig, acf_ax = plt.subplots(figsize=(12, 6))
    plot_acf(team_workload_daily['Normtijd (in hours)'], lags=40, alpha=0.05, title=f'ACF for {teamcode}', ax=acf_ax)
    acf_fig.savefig(f'Images/ACF_{teamcode}.png')
    plt.show()

    pacf_fig, pacf_ax = plt.subplots(figsize=(12, 6))
    plot_pacf(team_workload_daily['Normtijd (in hours)'], lags=40, alpha=0.05, title=f'PACF for {teamcode}', ax=pacf_ax)
    pacf_fig.savefig(f'Images/PACF_{teamcode}.png')
    plt.show()
    

## Add columns

In [None]:
# Total norm hours per team and calendar day
backlog_day = filtered_backlog['datum'].dt.date
daily_work = (
    filtered_backlog
    .groupby(['teamcode', backlog_day])['Normtijd (in hours)']
    .sum()
    .reset_index(name='total_work_hours')
)
daily_work['datum'] = pd.to_datetime(daily_work['datum'])

# Wide layout: one row per date, one column per team; missing combinations become 0
pivot_daily_work = daily_work.pivot(
    index='datum',
    columns='teamcode',
    values='total_work_hours',
).fillna(0)

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [None]:
# Number of backlog rows per team, date and Procesgroep
procesgroep_counts = (
    filtered_backlog
    .groupby(['teamcode', 'datum', 'Procesgroep'])
    .size()
    .reset_index(name='count')
)

# One column per Procesgroep (0 where a group does not occur),
# ordered by team and date for the rolling windows that follow
procesgroep_pivot = (
    procesgroep_counts
    .pivot_table(index=['teamcode', 'datum'], columns='Procesgroep', values='count', fill_value=0)
    .reset_index()
    .sort_values(by=['teamcode', 'datum'])
)

# Apply the rolling sum function, avoiding 'teamcode' in the numeric operation
def calculate_rolling_sums(group):
    """Return 7-day trailing sums of the numeric columns of one team's rows.

    Each window covers the seven days before a row's ``datum`` and excludes
    the row's own date (``closed='left'``), so a row with nothing in the
    preceding week gets NaN.

    Args:
        group: Rows of a single team with a datetime ``datum`` column and a
            ``teamcode`` column.

    Returns:
        pd.DataFrame: ``datum``, the rolled numeric columns and ``teamcode``
        (taken from the first row of ``group``).
    """
    by_date = group.set_index('datum')
    # 'teamcode' is not numeric, so it is left out of the rolling sum
    counts = by_date.select_dtypes(include=[np.number])
    rolled = counts.rolling(window='7D', closed='left').sum().reset_index()

    # Put the team label back; every row of the group belongs to the same team
    rolled['teamcode'] = by_date['teamcode'].iloc[0]
    return rolled

# Compute the trailing sums team by team and flatten the result
rolling_sums = (
    procesgroep_pivot
    .groupby('teamcode', as_index=False)
    .apply(calculate_rolling_sums)
    .reset_index(drop=True)
)

# Make sure 'datum' is still a datetime column after the manipulations
rolling_sums['datum'] = pd.to_datetime(rolling_sums['datum'])

# Add the trailing sums to daily_work as candidate exogenous features;
# rows without a match (and NaN sums) become 0
daily_work_with_exog = daily_work.merge(rolling_sums, on=['teamcode', 'datum'], how='left').fillna(0)

In [None]:
# Number of backlog rows per team, date and type_regeling
type_regeling_counts = (
    filtered_backlog
    .groupby(['teamcode', 'datum', 'type_regeling'])
    .size()
    .reset_index(name='count')
)

# One column per type_regeling, ordered by team and date
type_regeling_pivot = (
    type_regeling_counts
    .pivot_table(index=['teamcode', 'datum'], columns='type_regeling', values='count', fill_value=0)
    .reset_index()
    .sort_values(by=['teamcode', 'datum'])
)

# Same 7-day trailing sums as for the Procesgroep counts
rolling_sums_type_regeling = (
    type_regeling_pivot
    .groupby('teamcode', as_index=False)
    .apply(calculate_rolling_sums)
    .reset_index(drop=True)
)
rolling_sums_type_regeling['datum'] = pd.to_datetime(rolling_sums_type_regeling['datum'])

# Add the new features; a column name that already exists in
# daily_work_with_exog gets the '_type_regeling' suffix on the new side
daily_work_with_exog = daily_work_with_exog.merge(
    rolling_sums_type_regeling,
    on=['teamcode', 'datum'],
    how='left',
    suffixes=('', '_type_regeling'),
).fillna(0)

# Wide layout: one row per date, columns keyed by (feature, team)
value_columns = [column for column in daily_work_with_exog.columns if column not in ('teamcode', 'datum')]
pivot = daily_work_with_exog.pivot(index='datum', columns='teamcode', values=value_columns).fillna(0)

In [None]:
# Accumulators filled by the per-team modelling loop further down in this cell:
#   results_df - one row per evaluated (order, seasonal order) combination with its AIC and BIC
#   metrics_df - validation error metrics per team
results_df = pd.DataFrame(columns=['Team', 'AIC', 'BIC', 'Order', 'Seasonal_Order'])
metrics_df = pd.DataFrame(columns=['Team', 'MSE', 'MAE', 'R2'])

# Updated time series cross-validation function
def time_series_cv(data, exog_data, pdq, seasonal_pdq, splits):
    """Grid-search SARIMAX orders by mean AIC over expanding training windows.

    For every ``(order, seasonal_order)`` pair a SARIMAX model is fitted on
    the training part of each ``TimeSeriesSplit`` fold, and the AIC and BIC of
    the fits are averaged. The test part of each fold is not used: selection
    rests on the information criteria of the training fits only.

    Args:
        data: Target series, positionally aligned with ``exog_data``.
        exog_data: Exogenous regressors for the same rows as ``data``.
        pdq: Iterable of ``(p, d, q)`` orders to try.
        seasonal_pdq: Iterable of ``(P, D, Q, m)`` seasonal orders to try.
        splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        tuple: ``(param_results, best_order, best_seasonal_order)`` where
        ``param_results`` is a list of dicts with keys ``Order``,
        ``Seasonal_Order``, ``AIC`` and ``BIC`` (one per combination with at
        least one successful fit) and the other two describe the combination
        with the lowest mean AIC.

    Raises:
        ValueError: If no combination could be fitted on any fold.
    """
    tscv = TimeSeriesSplit(n_splits=splits)
    param_results = []

    for param in pdq:
        for param_seasonal in seasonal_pdq:
            fold_aic = []
            fold_bic = []

            for train_index, _ in tscv.split(data):
                train = data.iloc[train_index]
                train_exog = exog_data.iloc[train_index]

                try:
                    model = SARIMAX(train,
                                    exog=train_exog,
                                    order=param,
                                    seasonal_order=param_seasonal,
                                    enforce_stationarity=False,
                                    enforce_invertibility=False)
                    model_fit = model.fit(disp=0)
                    fold_aic.append(model_fit.aic)
                    fold_bic.append(model_fit.bic)
                except Exception as e:
                    # Best effort: a fold that cannot be fitted is reported
                    # and skipped so the rest of the grid is still evaluated.
                    print(f"An error occurred with parameters {param}, {param_seasonal}: {e}")
                    continue

            if fold_aic:  # Ensure there's at least one valid result
                avg_aic = np.mean(fold_aic)
                avg_bic = np.mean(fold_bic)
                param_results.append({
                    'Order': param,
                    'Seasonal_Order': param_seasonal,
                    'AIC': avg_aic,
                    'BIC': avg_bic
                })
                print(f"Evaluated parameters {param} and {param_seasonal}: AIC = {avg_aic}, BIC = {avg_bic}")

    # Fail with a clear message rather than min()'s "empty sequence" error
    if not param_results:
        raise ValueError("No SARIMAX parameter combination could be fitted; cannot select a best model.")

    # Find the best parameters based on the lowest average AIC
    best_result = min(param_results, key=lambda x: x['AIC'])
    best_order = best_result['Order']
    best_seasonal_order = best_result['Seasonal_Order']

    return param_results, best_order, best_seasonal_order

# First day of the validation period; everything before it is used for model
# selection and fitting. Relies on `pivot`, `daily_work_with_exog`,
# `weekly_schedules` and `filtered_backlog` from the cells above.
validation_start_date = pd.Timestamp('2024-02-19')

# The diagnostics figure is written to Images/; make sure the folder exists
os.makedirs('Images', exist_ok=True)

# Loop over each team and process SARIMAX models
for team in ['Zuid']:
    print(f"Processing team: {team}")

    # Daily target series and exogenous features for this team (gaps filled with 0)
    team_series = pivot.loc[:, ('total_work_hours', team)].asfreq('D').fillna(0)
    exog_columns = [column for column in daily_work_with_exog.columns if column not in ('teamcode', 'datum', 'total_work_hours')]
    team_exog = pivot.loc[:, (exog_columns, team)].asfreq('D').fillna(0)

    # Split the data into training/testing and validation sets
    train_test_series = team_series[:validation_start_date - pd.Timedelta(days=1)]
    train_test_exog = team_exog[:validation_start_date - pd.Timedelta(days=1)]
    validate_series = team_series[validation_start_date:]
    validate_exog = team_exog[validation_start_date:]

    # Parameter ranges. 6 * 2 * 6 = 72 orders times 72 seasonal orders gives
    # 5184 combinations, each fitted on 3 folds, so this search is slow.
    p = q = P = Q = range(1, 7)
    d = D = range(0, 2)
    m = 7  # Weekly seasonality

    # Generate all different combinations of p, d, q and P, D, Q
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = [(x[0], x[1], x[2], m) for x in list(itertools.product(P, D, Q))]

    # Execute the cross-validation
    team_results, best_order, best_seasonal_order = time_series_cv(train_test_series, train_test_exog, pdq, seasonal_pdq, splits=3)

    # Label every result with the team (results_df has a 'Team' column) and
    # append all of them in one go
    for result in team_results:
        result['Team'] = team
    results_df = pd.concat([results_df, pd.DataFrame(team_results)], ignore_index=True)

    print(f"Completed processing for team: {team}")

    # Fit the model with the best parameters
    best_model = SARIMAX(train_test_series,
                         exog=train_test_exog,
                         order=best_order,
                         seasonal_order=best_seasonal_order,
                         enforce_stationarity=False,
                         enforce_invertibility=False)
    results = best_model.fit()

    # Summary of the model
    print(results.summary())

    # Diagnostic plots. The title goes on the figure: plt.title would only
    # set the title of the last of the four panels.
    diagnostics_fig = results.plot_diagnostics(figsize=(15, 12))
    diagnostics_fig.suptitle(f'SARIMAX Diagnostics for {team}')
    diagnostics_fig.savefig(f'Images/Diagnostics_{team}.png')
    plt.show()

    # Forecast the validation period
    forecast = results.get_forecast(steps=len(validate_series), exog=validate_exog)
    forecast_mean = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    # Weekly schedule whose first row falls on the validation start date.
    # Empty frames are skipped: they have no first row to inspect.
    target_df = next(
        (df for df in weekly_schedules
         if not df.empty and df["Datum"].iloc[0] == validation_start_date),
        None,
    )

    if target_df is None:
        print(f"No weekly schedule starts on {validation_start_date.date()}; skipping validation for team {team}.")
        continue

    # Drop weekend rows and the 'OD' team; .copy() so the 'datum' column added
    # below lands on an independent frame, not on a slice of the list entry
    target_df = target_df[~target_df["Dag"].isin(["zaterdag", "zondag"])]
    target_df = target_df[target_df["Team"] != "OD"].copy()

    # Actual norm hours per team and day, from the validation start up to 11 days later
    actual_hours_per_day = filtered_backlog.groupby(['teamcode', 'datum'])['Normtijd (in hours)'].sum().reset_index()
    actual_hours_per_day = actual_hours_per_day[(actual_hours_per_day['datum'] >= validation_start_date) & (actual_hours_per_day['datum'] <= validation_start_date + pd.Timedelta(days=11))]

    target_df['datum'] = pd.to_datetime(target_df['Datum'])  # Ensure 'Datum' in target_df is datetime
    # Weekday forecasts only (5 and 6 are Saturday and Sunday), rounded, negatives set to 0
    forecasts = round(forecast_mean[~forecast_mean.index.weekday.isin([5, 6])], 0).clip(0)

    team_target = target_df[target_df['Team'] == team]
    team_target = team_target.set_index('datum').sort_index()

    dates = team_target.index.strftime('%Y-%m-%d').tolist()

    # Align forecasts and intervals with the scheduled dates; dates without a forecast get 0
    team_forecasts = forecasts.reindex(team_target.index, fill_value=0)
    team_conf_int = forecast_ci.reindex(team_target.index, fill_value=0)

    # Filtering actual hours for the current team
    team_actual_hours = actual_hours_per_day[actual_hours_per_day['teamcode'] == team]
    team_actual_hours = team_actual_hours.set_index('datum').reindex(dates, fill_value=0)

    team_target_filtered = team_target[team_target.index.isin(dates)]

    lower_bounds = team_conf_int.iloc[:, 0].clip(lower=0)
    upper_bounds = team_conf_int.iloc[:, 1]

    # Calculate validation metrics (metrics_df declares an 'R2' column too)
    mse = mean_squared_error(team_actual_hours['Normtijd (in hours)'], team_forecasts)
    mae = mean_absolute_error(team_actual_hours['Normtijd (in hours)'], team_forecasts)
    r2 = r2_score(team_actual_hours['Normtijd (in hours)'], team_forecasts)
    metrics_df = pd.concat([metrics_df, pd.DataFrame([{
        'Team': team,
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }])], ignore_index=True)

    # Plotting for the current team
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.plot(dates, team_target_filtered['TotalHours'], color='red', marker='o', linestyle='-', linewidth=2, markersize=5, label='Available Working Hours')
    ax.plot(dates, team_forecasts, color='green', marker='x', linestyle='--', linewidth=2, markersize=5, label=f'Forecasted Hours for {team}')
    ax.bar(dates, team_actual_hours['Normtijd (in hours)'], width=0.4, label=f'Actual Hours for {team}', align='center', color='purple')
    ax.fill_between(dates, lower_bounds, upper_bounds, color='green', alpha=0.3, label='Confidence Interval')

    # Set the x-axis ticks to the dates and label them with the corresponding dates
    ax.set_xticks(dates)
    ax.set_xticklabels(dates, rotation=45)

    # Formatting
    ax.set_xlabel('Date')
    ax.set_ylabel('Hours')
    ax.set_title(f'Scheduled vs Target Hours for Team {team}')
    ax.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

# Save results to CSV
results_df.to_csv('model_selection_results.csv', index=False)
print("Results saved to model_selection_results.csv")

# Save validation metrics to CSV
metrics_df.to_csv('validation_metrics.csv', index=False)
print("Validation metrics saved to validation_metrics.csv")

## Plot predictions

In [None]:
# Get the specific schedule: the weekly schedule whose first row falls on start_date.
# Empty frames are skipped: they have no first row to inspect.
target_df = next(
    (df for df in weekly_schedules if not df.empty and df["Datum"].iloc[0] == start_date),
    None,
)

if target_df is None:
    print("DataFrame with the specified date not found.")
    # Stop here: the cells below cannot run without a schedule, and
    # continuing would only fail on subscripting None.
    raise StopExecution

# Drop weekend rows; .copy() so later cells can add columns to an independent frame
target_df = target_df[~target_df["Dag"].isin(["zaterdag", "zondag"])].copy()

In [None]:
# Actual norm hours per team and day, from start_date up to 11 days later
actual_hours_per_day = (
    filtered_backlog
    .groupby(['teamcode', 'datum'])['Normtijd (in hours)']
    .sum()
    .reset_index()
)
window_end = start_date + timedelta(days=11)
in_window = (actual_hours_per_day['datum'] >= start_date) & (actual_hours_per_day['datum'] <= window_end)
actual_hours_per_day = actual_hours_per_day[in_window]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot available hours, forecast (with interval) and actual hours per team.
#
# NOTE(review): this cell appears to depend on state that the visible cells do
# not create. `conf_int_forecasts` is not assigned anywhere above, and
# `forecasts` is last assigned in the modelling loop from a single team's
# forecast mean, while the lookups `forecasts[team]` / `conf_int_forecasts[team]`
# below expect one entry per team. Confirm where these come from before
# relying on Restart & Run All.

# Step 1: Prepare the Target Hours Data
target_df['datum'] = pd.to_datetime(target_df['Datum'])  # Ensure 'Datum' in target_df is datetime

# NOTE(review): `forecasts` is overwritten with a filtered version of itself, so re-running this cell is not a clean repeat.
forecasts = round(forecasts[~forecasts.index.weekday.isin([5, 6])], 0).clip(0)  # 5 and 6 correspond to Saturday and Sunday, round and set negative to 0

# Iterate over each team to plot
for team in actual_hours_per_day['teamcode'].unique():
    team_target = target_df[target_df['Team'] == team]

    # Index the team's schedule by date, in chronological order
    team_target = team_target.set_index('datum').sort_index()
    
    # Date labels (YYYY-MM-DD strings) used on the x-axis
    dates = team_target.index.strftime('%Y-%m-%d').tolist()

    # Align forecast and interval with the scheduled dates; dates without a value get 0
    team_forecasts = forecasts[team].reindex(team_target.index, fill_value=0)
    team_conf_int = conf_int_forecasts[team].reindex(team_target.index, fill_value=0)

    # Filtering actual hours for the current team
    team_actual_hours = actual_hours_per_day[actual_hours_per_day['teamcode'] == team]
    # NOTE(review): the index holds datetimes while `dates` holds strings; this relies on pandas matching the two — confirm.
    team_actual_hours = team_actual_hours.set_index('datum').reindex(dates, fill_value=0)
    
    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
        
    team_target_filtered = team_target[team_target.index.isin(dates)]

    # Target (Available) Hours Line Plot
    ax.plot(dates, team_target_filtered['TotalHours'], color='red', marker='o', linestyle='-', linewidth=2, markersize=5, label='Available Working Hours')
    ax.plot(dates, team_forecasts, color='green', marker='x', linestyle='--', linewidth=2, markersize=5, label='Forecasted Hours')
    
    # First interval column is used as the lower bound (floored at 0), second as the upper bound
    lower_bounds = team_conf_int.iloc[:, 0].clip(lower=0)
    upper_bounds = team_conf_int.iloc[:, 1]
    
    # Plot with adjusted confidence intervals
    ax.fill_between(dates, lower_bounds, upper_bounds, color='green', alpha=0.3, label='Confidence Interval')
    
    # The bars use integer positions while the lines use the date strings.
    # NOTE(review): presumably matplotlib places the string categories at 0..n-1 so both line up — confirm.
    x_positions = range(len(dates))  # Numeric x positions for the dates
    ax.bar([x for x in x_positions], team_actual_hours['Normtijd (in hours)'], width=0.4, label='Actual Hours', align='center', color='purple')
    
    # Set the x-axis ticks to the dates and label them with the corresponding dates
    ax.set_xticks(dates)
    ax.set_xticklabels(dates, rotation=45)

    # Formatting
    ax.set_xlabel('Date')
    ax.set_ylabel('Hours')
    ax.set_title(f'Scheduled vs Target Hours for Team {team}')
    ax.legend(loc='upper right')

    plt.tight_layout()

    # Show plot
    plt.show()
