In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import os
import ipywidgets as widgets
from IPython.display import display
import datetime

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Hyperparameters & locations
</div>

In [27]:
locs = {'waiting_folder' : '../data/waiting times'}

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Reading the data
</div>

In [28]:
waiting_times = list()
for attraction in tqdm(os.listdir(locs['waiting_folder'])):
    filename = os.path.join(locs['waiting_folder'], attraction)
    df = pd.read_csv(filename)
    if df.empty:
        continue
    df.insert(0, 'attraction', attraction.split('.')[0])
    waiting_times.append(df)

df_wait_raw = pd.concat(waiting_times, ignore_index=True)

100%|██████████| 52/52 [00:17<00:00,  2.96it/s]


<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Cleaning the data
</div>

In [52]:
df_wait = df_wait_raw.copy()
# Cleaning the actual waiting times
# Removing outliers from actuals
df_wait = df_wait[((df_wait.SACTMIN >= -1000) & (df_wait.SACTMIN < 360)) | (df_wait.SACTMIN.isnull())]
# Removing outliers from posted (attraction closed at -999)
df_wait = df_wait[(df_wait.SPOSTMIN >= -998.99) | (df_wait.SPOSTMIN.isnull())]

df_wait['date'] = pd.to_datetime(df_wait.date, format = '%m/%d/%Y')
df_wait['datetime'] = pd.to_datetime(df_wait.datetime, format = '%Y-%m-%d %H:%M:%S')

print(f"Removed {len(df_wait_raw) - len(df_wait)} rows")

# You could split the dataset into two seperate dataframes (plusjes & minnetjes :-))
df_wait_act = df_wait[~df_wait.SACTMIN.isnull()].drop('SPOSTMIN', axis = 1)
df_wait_post = df_wait[~df_wait.SPOSTMIN.isnull()].drop('SACTMIN', axis = 1)

attractions = df_wait.attraction.unique()

Removed 1318703 rows


In [30]:
# Extract "minutes from df_wait" from the datetime column
df_wait_post['minute'] = df_wait_post['datetime'].dt.hour * 60 + df_wait_post['datetime'].dt.minute

In [31]:
df_wait_post['attraction'].unique()

array(['country_bears', '7_dwarfs_train', 'pirates_of_caribbean',
       'astro_orbiter', 'laugh_floor', 'regal_carrousel',
       'big_thunder_mtn', 'spaceship_earth', 'splash_mountain',
       'hall_of_presidents', 'toy_story_mania', 'space_mountain',
       'sorcerers_of_the_mk', 'jungle_cruise', 'mad_tea_party',
       'princess_hall__cinderella_elena', 'dumbo', 'tom_land_speedway',
       'swiss_family_tree', 'magic_carpets', 'tom_sawyer_island',
       'soarin', 'peoplemover', 'philharmagic', 'it_s_a_small_world',
       'kilimanjaro_safaris', 'expedition_everest', 'town_sq_mickey',
       'rock_n_rollercoaster', 'carousel_of_progress', 'under_the_sea',
       'dinosaur', 'barnstormer', 'flight_of_passage', 'winnie_the_pooh',
       'navi_river', 'enchanted_tiki_rm', 'princess_hall__rapunzel_tiana',
       'pirate_s_adventure', 'liberty_sq_riverboat', 'peter_pan_s_flight',
       'haunted_mansion', 'alien_saucers', 'buzz_lightyear', 'slinky_dog'],
      dtype=object)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    SPOSTMIN Resampling index to every 30 mins with interpolation
</div>

In [76]:
## This code starts from the posted waiting times. 
# Reindex and resample to a datapoint every 30 minutes. 
# Add leading and trailing zeros and fill gaps with interpolated values.

groups_post = list()

for (date, attraction), group in tqdm(df_wait_post.groupby(['date', 'attraction'])):
    # Time shift by 3 hours
    time_shifted = group['datetime'] - datetime.timedelta(hours=3)
    group['minute'] = 3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute

    # Drop unnecessary columns and compute the mean for duplicate minutes
    group = group.drop(columns=['datetime', 'attraction', 'date'], errors='ignore')  # Avoid KeyError
    group = group.groupby('minute', as_index=False)[['SPOSTMIN']].mean()

    # Define the new index (0 to 1620 minutes in 30-minute intervals)
    new_index = np.arange(0, 27 * 60 + 1, 30)

    # Set 'minute' as the index
    group = group.set_index('minute')
    
    # Find first and last valid indices in SPOSTMIN
    first_valid_index = group['SPOSTMIN'].first_valid_index()
    last_valid_index = group['SPOSTMIN'].last_valid_index()
    
    # Reindex while keeping all unique indices
    reindexed = group.reindex(np.unique(np.concatenate([new_index, group.index])))
    
    # Apply interpolation only between the first and last valid indices
    if first_valid_index is not None and last_valid_index is not None:
        reindexed.loc[first_valid_index:last_valid_index] = reindexed.loc[first_valid_index:last_valid_index].interpolate(method='linear')
    
    # Fill values before first and after last valid index with zeros
    reindexed.loc[:first_valid_index] = reindexed.loc[:first_valid_index].fillna(0)
    reindexed.loc[last_valid_index:] = reindexed.loc[last_valid_index:].fillna(0)

    # Keep only the required time indices
    resampled = reindexed.loc[new_index].reset_index()

    # Round to nearest 5-minute interval
    resampled['SPOSTMIN'] = (resampled['SPOSTMIN'] + 2.5) // 5 * 5

    # Insert date and attraction columns
    resampled.insert(0, 'date', date)
    resampled.insert(0, 'attraction', attraction)

    # Append to final list
    groups_post.append(resampled)


100%|██████████| 74858/74858 [30:57<00:00, 40.29it/s]   


In [81]:
# Combine
df_post_itp = pd.concat(groups_post, ignore_index=True)

In [82]:
df_post_itp.sample(5)

Unnamed: 0,attraction,date,minute,SPOSTMIN
628267,magic_carpets,2016-02-09,60,0.0
1206689,laugh_floor,2017-02-13,1320,0.0
912960,magic_carpets,2016-08-09,450,0.0
1773551,space_mountain,2018-01-04,630,110.0
2475863,big_thunder_mtn,2019-01-12,1140,50.0


In [83]:
# Save dataset with interpolates posted times to csv
df_post_itp.to_csv("../data/clean/posted_interpolated.csv", index=False)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    SACTMIN timeshift for minute column. No resampling or interpolation
</div>

In [35]:
## This code starts from the actual waiting times.
# Apply a time shift to get the minute column similar to the dataset with posted waiting times.

groups_act = list()

for (date, attraction), group in tqdm(df_wait_act.groupby(['date', 'attraction'])):
    # Time shift by 3 hours
    time_shifted = group['datetime'] - datetime.timedelta(hours=3)
    group['minute'] = 3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute

    # Append to final list
    groups_act.append(group)

100%|██████████| 57576/57576 [01:43<00:00, 558.52it/s]


In [36]:
df_act_timeshift = pd.concat(groups_act, ignore_index=True)

In [37]:
df_act_timeshift.drop(columns = ['datetime'])

Unnamed: 0,attraction,date,SACTMIN,minute
0,7_dwarfs_train,2015-01-01,54.0,485
1,7_dwarfs_train,2015-01-01,55.0,502
2,barnstormer,2015-01-01,18.0,639
3,big_thunder_mtn,2015-01-01,37.0,895
4,buzz_lightyear,2015-01-01,69.0,624
...,...,...,...,...
192367,toy_story_mania,2021-12-28,62.0,690
192368,toy_story_mania,2021-12-28,19.0,1242
192369,under_the_sea,2021-12-28,16.0,1155
192370,winnie_the_pooh,2021-12-28,16.0,530


In [21]:
# Safe dataset with timeshifted actuals to csv
df_act_timeshift.to_csv("../data/clean/actuals_shifted.csv", index=False)

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Interpolating and preparing training dataframe
</div>

In [53]:
## This code starts from the entire dataset of posted and actual waiting times.
# For every row where there is an actual value, interpolate the posted waiting time.
# Calculate the actual-over-posted ratio and prepare the training dataset.

groups = list()

for (date, attraction), group in tqdm(list(df_wait.groupby(['date', 'attraction']))):
    time_shifted = group.datetime - datetime.timedelta(hours = 3)
    group['minute'] = 3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute

    # Reindex the DataFrame
    # Set the 'minute' column as the index
    group = group.set_index('minute')

    # Interpolate SPOSTMIN to fill blanks
    group['SPOSTMIN_interp'] = group['SPOSTMIN'].interpolate(method='linear')

    # Calculate the actual over posted ratio
    group['actual_over_posted'] =  group['SACTMIN'] / group['SPOSTMIN_interp']

    # Keep only the features and targets for training
    group = group.drop(columns=['datetime', 'SACTMIN', 'SPOSTMIN', 'SPOSTMIN_interp']).dropna(subset=['actual_over_posted'])

    groups.append(group)

100%|██████████| 77543/77543 [12:02<00:00, 107.25it/s] 


In [77]:
group.to_csv("../data/clean/example_interpolation.csv", index=True)

In [78]:
# Combine
training_dataset = pd.concat(groups, ignore_index=False)

In [79]:
training_dataset.tail(5)

Unnamed: 0_level_0,attraction,date,actual_over_posted
minute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
690,toy_story_mania,2021-12-28,0.729412
1242,toy_story_mania,2021-12-28,0.690909
1155,under_the_sea,2021-12-28,0.533333
530,winnie_the_pooh,2021-12-28,0.64
818,winnie_the_pooh,2021-12-28,0.5


In [61]:
# Save training dataset to csv
training_dataset.to_csv("../data/clean/training_dataset.csv", index=True)