In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import os
import ipywidgets as widgets
from IPython.display import display
import datetime

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Hyperparameters & locations
</div>

In [9]:
# Input locations, keyed by a short name; 'waiting_folder' is the folder the waiting-time files are read from
locs = dict(waiting_folder='../data/waiting times')

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Reading the data
</div>

In [None]:
# Load every per-attraction CSV from the waiting-times folder into one long table.
# Each file contributes its rows plus an 'attraction' column derived from the file name.
waiting_times = list()

# Only pick up CSV files (skips hidden files / sub-folders that would break read_csv);
# sorted() makes the row order reproducible, since os.listdir returns entries in arbitrary order.
csv_files = sorted(
    name for name in os.listdir(locs['waiting_folder'])
    if name.lower().endswith('.csv')
)

for attraction in tqdm(csv_files):
    filename = os.path.join(locs['waiting_folder'], attraction)
    df = pd.read_csv(filename)
    if df.empty:
        # Nothing to contribute for this attraction
        continue
    # splitext strips only the extension, so attraction names containing dots stay intact
    df.insert(0, 'attraction', os.path.splitext(attraction)[0])
    waiting_times.append(df)

# Fail with an explicit message instead of pandas' generic "No objects to concatenate"
if not waiting_times:
    raise ValueError(f"No non-empty CSV files found in {locs['waiting_folder']!r}")

df_wait_raw = pd.concat(waiting_times, ignore_index=True)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Cleaning the data
</div>

In [None]:
# Clean the combined table. A row is kept only when both of its readings pass their filter;
# a missing (NaN) reading always passes.

# Actual waits: keep values in [-1000, 360) minutes.
# NOTE(review): the -1000 lower bound also lets negative actual waits through — confirm this is intended.
actual_ok = df_wait_raw.SACTMIN.isnull() | (
    (df_wait_raw.SACTMIN >= -1000) & (df_wait_raw.SACTMIN < 360)
)
# Posted waits: drop values below -998.99 (attraction closed at -999).
posted_ok = df_wait_raw.SPOSTMIN.isnull() | (df_wait_raw.SPOSTMIN >= -998.99)

# .copy() yields an independent frame, so the column assignments below do not write into
# a slice of df_wait_raw (which can raise SettingWithCopyWarning).
df_wait = df_wait_raw[actual_ok & posted_ok].copy()

# Parse the text columns into proper timestamps
df_wait['date'] = pd.to_datetime(df_wait.date, format='%m/%d/%Y')
df_wait['datetime'] = pd.to_datetime(df_wait.datetime, format='%Y-%m-%d %H:%M:%S')

print(f"Removed {len(df_wait_raw) - len(df_wait)} rows")

# Split the dataset into two separate dataframes (the pluses and the minuses :-)):
# rows with an actual reading and rows with a posted reading.
df_wait_act = df_wait[df_wait.SACTMIN.notna()].drop(columns='SPOSTMIN')
df_wait_post = df_wait[df_wait.SPOSTMIN.notna()].drop(columns='SACTMIN')

attractions = df_wait.attraction.unique()

In [12]:
# Minutes since midnight for every posted reading, taken from its timestamp
posted_time = df_wait_post['datetime'].dt
df_wait_post['minute'] = posted_time.hour * 60 + posted_time.minute

In [None]:
# Distinct attractions that have at least one posted reading
df_wait_post.attraction.unique()

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    SPOSTMIN: resampling to a 30-minute index with interpolation
</div>

In [None]:
## This code starts from the posted waiting times.
# For every (date, attraction) pair: resample to one datapoint every 30 minutes,
# fill the slots before the first reading with zeros and fill gaps with interpolated values.
# Slots after the last reading carry that last value forward (pandas' default forward fill
# direction for interpolate).

groups_post = list()

# Target grid: 0 to 1620 minutes in 30-minute intervals; identical for every group
new_index = np.arange(0, 27 * 60 + 1, 30)

for (date, attraction), group in tqdm(df_wait_post.groupby(['date', 'attraction'])):
    # Time shift by 3 hours: readings between 00:00 and 02:59 map to minutes 1440-1619
    # instead of 0-179
    time_shifted = group['datetime'] - datetime.timedelta(hours=3)
    minute = 3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute

    # Mean posted time per minute (collapses duplicate minutes), indexed by minute.
    # assign() returns a new frame instead of writing into the groupby slice.
    per_minute = group.assign(minute=minute).groupby('minute')[['SPOSTMIN']].mean()

    # Add the grid points to the observed minutes and interpolate.
    # method='index' weights by the actual minute distance; method='linear' would treat the
    # unevenly spaced rows as equally spaced.
    all_minutes = np.unique(np.concatenate([new_index, per_minute.index]))
    reindexed = per_minute.reindex(all_minutes).interpolate(method='index')

    # Keep only the required time indices
    resampled = reindexed.loc[new_index].reset_index()

    # Slots that are still empty become 0, then round half up to the nearest 5 minutes
    resampled['SPOSTMIN'] = resampled['SPOSTMIN'].fillna(0)
    resampled['SPOSTMIN'] = (resampled['SPOSTMIN'] + 2.5) // 5 * 5

    # Insert date and attraction columns
    resampled.insert(0, 'date', date)
    resampled.insert(0, 'attraction', attraction)

    # Append to final list
    groups_post.append(resampled)

In [15]:
# Stack the per-(date, attraction) frames into one table with a fresh 0..n-1 row index
df_post_itp = pd.concat(objs=groups_post, axis=0, ignore_index=True)

In [None]:
# Spot-check five random rows of the resampled posted times
df_post_itp.sample(n=5)

In [17]:
# Save the dataset with interpolated posted times to csv.
# Create the output folder first so the write does not fail when it does not exist yet.
posted_csv = "../data/clean/posted_interpolated.csv"
os.makedirs(os.path.dirname(posted_csv), exist_ok=True)
df_post_itp.to_csv(posted_csv, index=False)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    SACTMIN: time shift for the minute column (no resampling or interpolation)
</div>

In [None]:
## This code starts from the actual waiting times.
# Apply a time shift to get the minute column similar to the dataset with posted waiting times.

# The minute depends only on each row's own timestamp, so it is computed once for the whole
# table rather than per group. assign() returns a new frame, so neither df_wait_act nor a
# groupby slice is written to.
# Time shift by 3 hours: readings between 00:00 and 02:59 map to minutes 1440-1619.
time_shifted = df_wait_act['datetime'] - datetime.timedelta(hours=3)
act_with_minute = df_wait_act.assign(
    minute=3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute
)

# One frame per (date, attraction), in groupby key order
groups_act = [
    group
    for _key, group in tqdm(act_with_minute.groupby(['date', 'attraction']))
]

In [19]:
# Stack the per-(date, attraction) frames of actuals into one table with a fresh row index
df_act_timeshift = pd.concat(objs=groups_act, axis=0, ignore_index=True)

In [None]:
# Preview the shifted actuals without the raw timestamp column.
# NOTE(review): the result is not assigned, so df_act_timeshift itself still contains
# 'datetime' — confirm whether the column was meant to be dropped for good.
df_act_timeshift.drop(columns='datetime')

In [21]:
# Save the dataset with time-shifted actuals to csv.
# Create the output folder first so the write does not fail when it does not exist yet.
actuals_csv = "../data/clean/actuals_shifted.csv"
os.makedirs(os.path.dirname(actuals_csv), exist_ok=True)
df_act_timeshift.to_csv(actuals_csv, index=False)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Interpolating and preparing training dataframe
</div>

In [None]:
## This code starts from the entire dataset of posted and actual waiting times.
# For every row where there is an actual value, interpolate the posted waiting time.
# Calculate the actual-over-posted ratio and prepare the training dataset.
# Each resulting frame has the columns: minute, attraction, date, actual_over_posted.

# Rows without a timestamp cannot be placed on the minute axis (and a NaN in the index
# is not supported by index-based interpolation), so they are left out.
timed = df_wait[df_wait['datetime'].notna()]

# Time shift by 3 hours: readings between 00:00 and 02:59 map to minutes 1440-1619.
# Computed once for the whole table; assign() returns a new frame, df_wait is untouched.
time_shifted = timed['datetime'] - datetime.timedelta(hours=3)
timed = timed.assign(minute=3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute)

groups = list()

# Passing ngroups as total gives tqdm a progress total without materialising every group
grouped = timed.groupby(['date', 'attraction'])
for _key, group in tqdm(grouped, total=grouped.ngroups):
    # Order the rows in time (stable sort) and use the minute as the index
    group = group.sort_values('minute', kind='mergesort').set_index('minute')

    # Interpolate SPOSTMIN to fill blanks. method='index' weights by the minute distance
    # between readings; method='linear' would treat the rows as equally spaced.
    posted_interp = group['SPOSTMIN'].interpolate(method='index')

    # Calculate the actual over posted ratio. A posted time of 0 is masked to NaN first,
    # so the division cannot produce inf; those rows are dropped below.
    ratio = group['SACTMIN'] / posted_interp.where(posted_interp != 0)

    # Keep only the features and targets for training.
    # reset_index() turns 'minute' back into a column so it is still present after the
    # groups are concatenated with a fresh row index.
    group = (
        group.assign(actual_over_posted=ratio)
        .drop(columns=['datetime', 'SACTMIN', 'SPOSTMIN'])
        .dropna(subset=['actual_over_posted'])
        .reset_index()
    )

    groups.append(group)

In [23]:
# Stack the per-(date, attraction) training frames into one table with a fresh 0..n-1 row index
training_dataset = pd.concat(objs=groups, axis=0, ignore_index=True)

In [None]:
# Spot-check five random rows of the training dataset
training_dataset.sample(n=5)

In [25]:
# Save the training dataset to csv.
# Create the output folder first so the write does not fail when it does not exist yet.
training_csv = "../data/clean/training_dataset.csv"
os.makedirs(os.path.dirname(training_csv), exist_ok=True)
training_dataset.to_csv(training_csv, index=False)