In [1]:
from azure.identity import AzureCliCredential
from azure.storage.filedatalake import DataLakeFileClient
from azure.storage.filedatalake import DataLakeServiceClient
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, timedelta
import plotly.express as px
import pickle
import re
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pulp import LpProblem, LpMinimize, LpMaximize, LpVariable, lpSum, LpStatus, value, LpBinary

In [2]:
def get_credential():
    """Return a credential that uses the token created by `az login`."""
    cli_credential = AzureCliCredential()
    return cli_credential


def read_file(credential, storage_account, container, filepath):
    """Download one file from a Data Lake container into memory.

    Parameters
    ----------
    credential : credential object handed to the Data Lake client.
    storage_account : name of the storage account; used to build the account URL.
    container : name of the file system (container) holding the file.
    filepath : path of the file inside the container.

    Returns
    -------
    io.BytesIO
        Buffer holding the complete file content.
    """
    endpoint = "https://{}.dfs.core.windows.net".format(storage_account)
    client = DataLakeFileClient(
        account_url=endpoint,
        file_system_name=container,
        file_path=filepath,
        credential=credential,
    )
    # Pull the whole file into memory before wrapping it in a seekable buffer
    content = client.download_file().readall()
    return io.BytesIO(content)


def write_file(buffer, credential, storage_account, container, filepath):
    """Upload the content of an in-memory buffer to a file in a Data Lake container.

    Parameters
    ----------
    buffer : object with a ``getvalue()`` method (e.g. ``io.BytesIO``) holding the data.
    credential : credential object handed to the Data Lake client.
    storage_account : name of the storage account; used to build the account URL.
    container : name of the file system (container) to write to.
    filepath : destination path inside the container; an existing file is overwritten.
    """
    endpoint = "https://{}.dfs.core.windows.net".format(storage_account)
    client = DataLakeFileClient(
        account_url=endpoint,
        file_system_name=container,
        file_path=filepath,
        credential=credential,
    )
    payload = buffer.getvalue()
    client.upload_data(payload, overwrite=True)

In [3]:
# Fetch the token that was created with `az login`
credential = get_credential()

# Storage account and container to read from.
# Both are empty strings here and must be filled in before the notebook can reach any data.
storage_account_name = ""
container_name = ""
# Data Lake (dfs) endpoint of the storage account
account_url = "https://{}.dfs.core.windows.net".format(storage_account_name)
# Folder prefixes inside the container; each one ends with a slash
reports_folder = "426_rapportage/"
schedule_folder = "werkvoorraad_planning/"
schedule2_folder = "werkverdeling_acceptatie/"

In [4]:
# Read file
import os
import pyarrow.parquet as pq

def get_df(folder = "426_rapportage/", file_name = "426_werkvoorraad_MO_20230131_AMB.xlsx"):
    """Download a file from the configured container and return its raw bytes.

    Despite the name, no DataFrame is built here: the caller receives the buffer
    returned by ``read_file`` and parses it itself (e.g. with ``pd.read_excel``).

    Parameters
    ----------
    folder : str
        Folder inside the container, with or without trailing slashes.
    file_name : str
        Name of the file inside ``folder``.

    Returns
    -------
    Whatever ``read_file`` returns for the joined path.
    """
    # Callers pass folders both with and without a trailing '/'; strip it so the
    # joined path never contains a double slash.
    folder = folder.rstrip('/')
    file_path = f"{folder}/{file_name}" if folder else file_name

    return read_file(credential=credential,
                     storage_account=storage_account_name,
                     container=container_name,
                     filepath=file_path)

In [5]:
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``; unchanged when it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


def get_all_file_names(account_url, credential, container_name, folder):
    """List the paths in a container that start with ``folder`` and contain '.xlsx'.

    Every path returned by the container listing is inspected; the folder filter
    is applied to the path names afterwards.

    Returns
    -------
    list of str
        Matching path names, in listing order.
    """
    service = DataLakeServiceClient(account_url=account_url, credential=credential)
    container = service.get_file_system_client(container_name)

    return [
        entry.name
        for entry in container.get_paths()
        if entry.name.startswith(folder) and ".xlsx" in entry.name
    ]

In [6]:
def extract_filename_with_date(filenames, date):
    """Return the first filename that contains ``date`` as a substring, or None if none does."""
    return next((name for name in filenames if date in name), None)

def convert_date_format(yyyymmdd):
    """Rearrange a 'YYYYMMDD' string into 'DD-MM-YYYY' by slicing; no date validation is done."""
    year, month, day = yyyymmdd[0:4], yyyymmdd[4:6], yyyymmdd[6:8]
    return "-".join([day, month, year])

def extract_filename_with_converted_date(filenames, date):
    """Return the first filename whose embedded date equals ``date``, or None.

    The first run of eight digits in each filename is read as YYYYMMDD, converted
    to DD-MM-YYYY and compared with ``date``. Filenames without eight consecutive
    digits are skipped.
    """
    eight_digits = re.compile(r'\d{8}')
    for candidate in filenames:
        found = eight_digits.search(candidate)
        if found is None:
            continue
        if convert_date_format(found.group(0)) == date:
            return candidate
    return None

## Load data into environment

In [7]:
def transform_schedule(df, skill_lookup):
    """Aggregate a wide roster (one column per employee) into hours per day and team.

    The roster must contain 'Datum' and 'Dag' columns. Every column whose name
    contains 'Team' starts a team: the columns that follow it, up to the next
    'Team' column (or the last column), are that team's members. The team's name
    is read from the first row of its 'Team' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Roster sheet; it is copied, not modified.
    skill_lookup : dict
        Maps a member column name to a profile. Members mapped to 'Acceptant B'
        count towards 'NotAllSkillsHours'; all others, including names missing
        from the lookup, count towards 'AllSkillsHours'.

    Returns
    -------
    pandas.DataFrame
        One row per roster row and team with the columns 'Datum', 'Dag', 'Team',
        'TotalHours', 'AllSkillsHours' and 'NotAllSkillsHours'. Cells that cannot
        be read as a number count as 0 hours. An empty roster gives an empty frame
        with these columns.
    """
    output_columns = ['Datum', 'Dag', 'Team', 'TotalHours', 'AllSkillsHours', 'NotAllSkillsHours']
    weekly_schedule_df = df.copy()

    # Without rows there is no first row to read the team names from
    if weekly_schedule_df.empty:
        return pd.DataFrame(columns=output_columns)

    columns = weekly_schedule_df.columns
    # str() keeps non-text headers (e.g. numbers) from breaking the substring test
    team_columns = [col for col in columns if 'Team' in str(col)]
    team_positions = [columns.get_loc(col) for col in team_columns]
    # A team's members end where the next team column starts, or at the last column
    member_ends = team_positions[1:] + [len(columns)]

    # Map each team name to the columns of its members
    team_member_map = {}
    for team_col, start, end in zip(team_columns, team_positions, member_ends):
        team_name = weekly_schedule_df[team_col].iloc[0]
        team_member_map[team_name] = columns[start + 1:end]

    # Profiles do not depend on the row, so resolve them once per member
    limited_skill = {
        member: skill_lookup.get(member, 'Acceptant A') == 'Acceptant B'
        for members in team_member_map.values()
        for member in members
    }

    new_data = []
    for _, row in weekly_schedule_df.iterrows():
        for team_name, team_members in team_member_map.items():
            total_hours = row[team_members].apply(pd.to_numeric, errors='coerce').sum()
            all_skills_hours = 0
            not_all_skills_hours = 0

            for member in team_members:
                hours = pd.to_numeric(row[member], errors='coerce')
                if pd.isna(hours):
                    hours = 0  # Empty or non-numeric cells count as no availability

                if limited_skill[member]:
                    not_all_skills_hours += hours
                else:
                    all_skills_hours += hours

            new_data.append({
                'Datum': row['Datum'],
                'Dag': row['Dag'],
                'Team': team_name,
                'TotalHours': total_hours,
                'AllSkillsHours': all_skills_hours,
                'NotAllSkillsHours': not_all_skills_hours
            })

    return pd.DataFrame(new_data, columns=output_columns)

In [8]:
# Load the skill matrix workbook from the working directory.
# skiprows=2 skips the first two rows of the sheet, so the third row supplies the column names.
skill_matrix = pd.read_excel("Skillmatrix.xlsx", skiprows=2)

def extract_first_name(full_name):
    """Return the first whitespace-separated word of ``full_name``.

    Missing (None/NaN), non-text or blank names give '' instead of raising, so
    that an empty cell in the name column does not abort the whole load.
    """
    if not isinstance(full_name, str):
        return ''
    words = full_name.split()
    return words[0] if words else ''

# Step 1: derive each employee's first name from the full name in 'Naam'
skill_matrix['FirstName'] = skill_matrix['Naam'].map(extract_first_name)

# Step 2: build the first-name -> employee-profile lookup
# (when two employees share a first name, the later row wins)
skill_lookup = dict(zip(skill_matrix['FirstName'], skill_matrix['Medewerkerprofiel']))

In [9]:
# Load every weekly work schedule from both schedule folders and aggregate it per team
schedule_file_names = get_all_file_names(account_url, credential, container_name, folder = schedule_folder)
schedule_file_names2 = get_all_file_names(account_url, credential, container_name, folder = schedule2_folder)


def load_weekly_schedule(path):
    """Download one schedule workbook and return its aggregated 'ROOSTER' sheet.

    Rows without a parseable 'Datum' and columns that are completely empty are
    dropped before the sheet is handed to ``transform_schedule``.

    Returns None, after printing a notice, when the workbook has no 'ROOSTER' sheet.
    """
    folder, file_name = path.rsplit('/', 1)
    excel_file = get_df(folder=folder, file_name=file_name)

    # Open the workbook once and reuse it for the sheet check and the parsing
    with pd.ExcelFile(excel_file) as workbook:
        if 'ROOSTER' not in workbook.sheet_names:
            print(f"Skipping file {path} because 'ROOSTER' sheet doesn't exist.")
            return None
        roster = workbook.parse(sheet_name='ROOSTER')

    roster['Datum'] = pd.to_datetime(roster['Datum'], errors='coerce')
    roster = roster.dropna(subset=['Datum']).dropna(axis=1, how='all')
    return transform_schedule(roster, skill_lookup)


weekly_schedules = []
for schedule_path in schedule_file_names + schedule_file_names2:
    weekly_schedule = load_weekly_schedule(schedule_path)
    if weekly_schedule is not None:
        weekly_schedules.append(weekly_schedule)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not weekly_schedules:
    raise ValueError("No schedule workbook with a 'ROOSTER' sheet was found.")

schedule = pd.concat(weekly_schedules)

Skipping file werkvoorraad_planning/oktober/11-10-2023 Werkverdeling Acceptatie.xlsx because 'ROOSTER' sheet doesn't exist.


In [10]:
# Load the task -> process-group mapping; only these two columns are kept
task_group_columns = ['Procesgroep', 'Proces']
task_groups = pd.read_excel('Procesgroep taken acceptatie.xlsx', sheet_name ='Lijstvorm')[task_group_columns]

In [11]:
# Load every backlog report and stamp its rows with the report date taken from the file name
report_file_names = get_all_file_names(account_url, credential, container_name, folder = reports_folder)

reports = []
for report_path in report_file_names:
    report_folder, report_name = report_path.rsplit('/', 1)
    # The date is the text between the last 'MO_' and the first '_A' after it,
    # e.g. '20230131' in '426_werkvoorraad_MO_20230131_AMB.xlsx'
    date = report_name.rsplit('MO_', 1)[1].rsplit('_A')[0]
    report = pd.read_excel(get_df(folder=report_folder, file_name=report_name))
    report['datum'] = pd.to_datetime(date, format='%Y%m%d').strftime('%Y-%m-%d')
    reports.append(report)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not reports:
    raise ValueError(f"No .xlsx backlog reports were found under '{reports_folder}'.")

backlog = pd.concat(reports)

## Prepare data

In [12]:
# Clean the working backlog.
# Keep only rows that carry a contract number, then store that number as a 64-bit integer.
backlog = backlog[backlog['contractnummer'].notna()]
backlog['contractnummer'] =  backlog['contractnummer'].astype(np.int64)
backlog['datum'] = pd.to_datetime(backlog['datum'])
# Attach the process group of each task by matching 'taaknaam' to 'Proces' (left join, so
# tasks without a match are kept).
# NOTE(review): `backlog` is overwritten with the merged frame, so running this cell a second
# time merges `task_groups` again; re-create `backlog` before re-running.
backlog = backlog.merge(task_groups, left_on='taaknaam', right_on='Proces', how='left')
filtered_backlog = backlog.copy()
# 'Procesgroep_y' is the name pandas gives the right-hand column when both merged frames have a
# 'Procesgroep' column — presumably the raw reports already contain one; verify against the data.
filtered_backlog.rename(columns={'Procesgroep_y': 'Procesgroep'}, inplace=True)
# Drop tasks of the 'Quion' process group
filtered_backlog = filtered_backlog[filtered_backlog['Procesgroep'] != 'Quion']
# ISO week number of the report date
filtered_backlog['week_nummer'] = filtered_backlog['datum'].dt.isocalendar().week

# Team codes to keep at this stage; 'IMD' is kept only until the reassignment below has run
team_codes_to_keep = ['Zuid', 'Midden', 'Noord', 'IMD']

# Keep only rows of the listed teams
filtered_backlog = filtered_backlog[filtered_backlog['teamcode'].isin(team_codes_to_keep)]
# Tasks of the 'Attens Hypotheek' product line are moved to team 'Attens'. This has to happen
# before the remaining 'IMD' rows are dropped, otherwise IMD rows of that product line are lost.
filtered_backlog.loc[filtered_backlog['productlijn'] == 'Attens Hypotheek', 'teamcode'] = 'Attens'
filtered_backlog = filtered_backlog[filtered_backlog["teamcode"] != "IMD"]
# Count the distinct 'Proces' values per process group and contract.
unique_tasks_per_group_and_contract = filtered_backlog.groupby(['Procesgroep', 'contractnummer'])['Proces'].nunique().reset_index(name='unique_tasks')

# Average of that count per process group (not used further in the cells visible here).
average_unique_tasks_per_procesgroep = unique_tasks_per_group_and_contract.groupby('Procesgroep')['unique_tasks'].mean()

# Column names of the norm-time table
column_names = ['Procesgroep', 'Normtijd (in minuten)']

# Norm time per process group, entered by hand, in minutes.
# 'Afronding dossier' is written as 10.0/2 (= 5 minutes); the reason for halving is not recorded here.
rows = [
    ["Aanvragen", 16.8],
    ["Rebound", 21.0],
    ["1e fiat", 55.8],
    ["2e fiat", 27.0],
    ['Afronding dossier', 10.0/2],
    ['Schoningstaken', 10.0]
]

# Build the norm-time lookup table
task_times = pd.DataFrame(rows, columns=column_names)

# Attach the norm time to every task; process groups missing from the table get NaN (left join)
filtered_backlog = filtered_backlog.merge(task_times, left_on="Procesgroep", right_on="Procesgroep", how="left")
# Convert minutes to hours, rounded to two decimals, and rename the column to match
filtered_backlog['Normtijd (in minuten)'] = (filtered_backlog['Normtijd (in minuten)'] / 60).round(2)
filtered_backlog.rename(columns={'Normtijd (in minuten)': 'Normtijd (in hours)'}, inplace=True)
# Shift the report date forward by one day.
# NOTE(review): presumably a report's tasks become workable the day after the report — confirm.
filtered_backlog['datum'] = filtered_backlog['datum'] + timedelta(days=1)

In [14]:
# Clean the schedule: drop exact duplicate rows, then replace missing values with 0
schedule = schedule.drop_duplicates().fillna(0)

# Identifier columns that are left untouched by the numeric conversion
exclude_columns = ['Datum', 'Dag', 'Team']

# Every other column is treated as an hours column
employee_columns = [name for name in schedule.columns if name not in exclude_columns]

# Force the hours columns to numeric; values that cannot be parsed become 0
numeric_hours = schedule[employee_columns].apply(pd.to_numeric, errors='coerce')
schedule[employee_columns] = numeric_hours.fillna(0)

# Keep only the listed teams
team_codes_to_keep = ['Zuid', 'Midden', 'Noord', 'Attens']
is_kept_team = schedule['Team'].isin(team_codes_to_keep)
schedule = schedule[is_kept_team]

# One row per date, day and team: the first occurrence wins
schedule = schedule.drop_duplicates(subset=['Datum', 'Dag', 'Team'], keep='first')

In [15]:
# Planning window, both bounds inclusive.
# Both dates are written as ISO 8601 (YYYY-MM-DD) so they cannot be read as day-first or
# month-first; the start is 12 June 2023, which is how pandas read the former '06-12-2023'.
start_date = '2023-06-12'
end_date = '2023-06-16'

# Keep tasks whose 'datum' is on or after the window start and whose deadline
# ('Uitvoeren voor') is on or before the window end. Weekends are not filtered here.
filtered_backlog = filtered_backlog[
    (filtered_backlog['datum'] >= start_date) &
    (filtered_backlog['Uitvoeren voor'] <= end_date)
]

# Keep schedule days inside the window and drop weekends (weekday: Monday=0 ... Sunday=6)
schedule = schedule[
    (schedule['Datum'] >= start_date) &
    (schedule['Datum'] <= end_date) &
    (schedule['Datum'].dt.weekday < 5)
]

In [17]:
def _reserve_hours(task_hours, needs_all_skills, capacity):
    """Book ``task_hours`` against one day's remaining capacity; return True when it fits.

    ``capacity`` is a dict with the remaining 'total', 'all_skills' and
    'not_all_skills' hours. It is updated in place only when the task is booked.
    A task that needs all skills may only use the 'all_skills' pool; any other
    task uses the 'all_skills' pool when it fits there and 'not_all_skills' otherwise.
    A missing (NaN) duration never fits.
    """
    fits_all_skills = task_hours <= capacity['all_skills']
    if needs_all_skills:
        fits = fits_all_skills
        pool = 'all_skills'
    else:
        fits = task_hours <= capacity['total'] and (
            fits_all_skills or task_hours <= capacity['not_all_skills'])
        pool = 'all_skills' if fits_all_skills else 'not_all_skills'

    if fits:
        capacity[pool] -= task_hours
        capacity['total'] -= task_hours
    return fits


def schedule_tasks_greedily(backlog_df, schedule_df):
    """Assign tasks to team days greedily, in the row order of ``backlog_df``.

    For every team, the days are visited in chronological order. On each day all
    tasks of that team with 'datum' on or before the day are tried in turn:
    a task that fits is booked; a task that does not fit is left for a later day
    when its deadline ('Uitvoeren voor') is on or after the team's next day, and
    is otherwise recorded as unscheduled.

    Returns
    -------
    tuple
        (scheduled rows, unscheduled rows, remaining backlog). The first two are
        lists of row Series; each scheduled row carries the extra entry
        'scheduled_on' with the day it was booked on. The remaining backlog is a
        DataFrame of tasks that were neither booked nor rejected.
    """
    remaining = backlog_df.copy()
    scheduled_rows = []
    unscheduled_rows = []

    for team in schedule_df['Team'].unique():
        # Chronological order matters: a task deferred to "a later day" is lost
        # if that day has already been processed.
        team_days = schedule_df[schedule_df['Team'] == team].sort_values('Datum', kind='stable')

        for _, day_info in team_days.iterrows():
            date = day_info['Datum']
            capacity = {
                'total': day_info['TotalHours'],
                'all_skills': day_info['AllSkillsHours'],
                'not_all_skills': day_info['NotAllSkillsHours'],
            }

            # The next working day of this team, if any, decides whether a task may wait
            later_days = team_days.loc[team_days['Datum'] > date, 'Datum']
            next_date = later_days.iloc[0] if not later_days.empty else None

            candidates = remaining[(remaining['teamcode'] == team) &
                                   (remaining['datum'] <= date)]

            handled = []
            for task_index, task in candidates.iterrows():
                needs_all_skills = task['Procesgroep'] == '2e fiat'

                if _reserve_hours(task['Normtijd (in hours)'], needs_all_skills, capacity):
                    booked = task.copy()
                    booked['scheduled_on'] = date
                    scheduled_rows.append(booked)
                elif next_date is not None and task['Uitvoeren voor'] >= next_date:
                    # Still on time if done on the next day: leave it in the backlog
                    continue
                else:
                    unscheduled_rows.append(task)
                handled.append(task_index)

            # Booked and rejected tasks leave the backlog once the day is done
            remaining = remaining.drop(index=handled)

    return scheduled_rows, unscheduled_rows, remaining


# Sort tasks by deadline ('Uitvoeren voor') so that the most urgent tasks are tried first
filtered_backlog = filtered_backlog.sort_values(by='Uitvoeren voor')

scheduled_tasks_list, unscheduled_tasks_list, backlog_copy = schedule_tasks_greedily(filtered_backlog, schedule)

# Convert the row lists to DataFrames; an empty list still yields the expected columns
task_columns = list(filtered_backlog.columns)
if scheduled_tasks_list:
    scheduled_tasks = pd.DataFrame(scheduled_tasks_list)
else:
    scheduled_tasks = pd.DataFrame(columns=task_columns + ['scheduled_on'])

if unscheduled_tasks_list:
    unscheduled_tasks = pd.DataFrame(unscheduled_tasks_list)
else:
    unscheduled_tasks = pd.DataFrame(columns=task_columns)


In [None]:
HOURS_COLUMN = 'Normtijd (in hours)'


def hours_per_team_and_day(tasks):
    """Sum the norm hours of ``tasks`` per team and per calendar day of 'datum'.

    Returns a frame with the columns 'teamcode', 'datum' (datetime, time set to
    midnight) and 'Normtijd (in hours)'. It has no rows when ``tasks`` is empty.
    """
    if tasks.empty:
        return pd.DataFrame({
            'teamcode': pd.Series(dtype='object'),
            'datum': pd.Series(dtype='datetime64[ns]'),
            HOURS_COLUMN: pd.Series(dtype='float64'),
        })
    daily = tasks.assign(**{
        'datum': pd.to_datetime(tasks['datum']).dt.normalize(),
        HOURS_COLUMN: pd.to_numeric(tasks[HOURS_COLUMN], errors='coerce'),
    })
    return daily.groupby(['teamcode', 'datum'])[HOURS_COLUMN].sum().reset_index()


def team_hours_on(hours_per_day, team, dates):
    """Return the team's hours as an array aligned to ``dates``; days without tasks are 0."""
    team_rows = hours_per_day[hours_per_day['teamcode'] == team]
    aligned = team_rows.set_index('datum')[HOURS_COLUMN].reindex(dates, fill_value=0)
    return aligned.astype(float).to_numpy()


# Aggregate task hours by team and day.
# NOTE(review): hours are bucketed by the task's 'datum', not by the day the scheduler
# placed the task on — confirm this is the intended x-axis for the bars.
scheduled_hours_per_day = hours_per_team_and_day(scheduled_tasks)
unscheduled_hours_per_day = hours_per_team_and_day(unscheduled_tasks)

# savefig does not create missing folders
os.makedirs('Images', exist_ok=True)

# One figure per team that has scheduled tasks
for team in scheduled_hours_per_day['teamcode'].unique():
    team_schedule = schedule[schedule['Team'] == team].copy()
    team_schedule['Datum'] = pd.to_datetime(team_schedule['Datum'])
    team_schedule = team_schedule.sort_values('Datum')

    # Cover every day between the first and last date of the schedule and of the tasks,
    # so that gaps (e.g. weekends) show up and no task hours fall outside the axis
    team_task_days = pd.concat([
        scheduled_hours_per_day.loc[scheduled_hours_per_day['teamcode'] == team, 'datum'],
        unscheduled_hours_per_day.loc[unscheduled_hours_per_day['teamcode'] == team, 'datum'],
    ])
    covered_days = pd.to_datetime(pd.concat([team_schedule['Datum'], team_task_days])).dropna()
    all_dates = pd.date_range(start=covered_days.min(), end=covered_days.max())

    scheduled_hours = team_hours_on(scheduled_hours_per_day, team, all_dates)
    unscheduled_hours = team_hours_on(unscheduled_hours_per_day, team, all_dates)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Scheduled task hours
    ax.bar(all_dates, scheduled_hours, width=0.4, label='Scheduled Hours', align='center', color='tab:blue')

    # Unscheduled task hours, stacked on top of the scheduled hours
    if not unscheduled_hours_per_day.empty:
        ax.bar(all_dates, unscheduled_hours, width=0.4, label='Unscheduled Hours', align='center', color='orange', bottom=scheduled_hours)

    # Available hours; days without availability are left out of the line
    available_hours_mask = team_schedule['TotalHours'] > 0
    ax.plot(team_schedule.loc[available_hours_mask, 'Datum'], team_schedule.loc[available_hours_mask, 'TotalHours'], color='red', marker='o', linestyle='-', linewidth=2, markersize=8, label='Available Hours')

    # Formatting
    ax.set_xlabel('Date')
    ax.set_ylabel('Hours')
    ax.set_title(f'Task Schedule for Team: {team}')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'Images/HeuristicSchedule{team}.png')
    plt.show()
    # Release the figure so a long team list does not pile up open figures
    plt.close(fig)