In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import ipywidgets as widgets
from IPython.display import display
import datetime

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Hyperparameters & locations
</div>

In [4]:
locs = {'waiting_folder' : '../data/waiting times'}

<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Reading the data
</div>

In [5]:
waiting_times = list()
for attraction in tqdm(os.listdir(locs['waiting_folder'])):
    filename = os.path.join(locs['waiting_folder'], attraction)
    df = pd.read_csv(filename)
    if df.empty:
        continue
    df.insert(0, 'attraction', attraction.split('.')[0])
    waiting_times.append(df)

df_wait_raw = pd.concat(waiting_times, ignore_index=True)

100%|██████████| 52/52 [00:06<00:00,  7.81it/s]


<div style="background-color: rgba(0, 176, 240, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Cleaning the data
</div>

In [6]:
df_wait = df_wait_raw.copy()
# Cleaning the actual waiting times
# Removing outliers from actuals
df_wait = df_wait[((df_wait.SACTMIN >= -1000) & (df_wait.SACTMIN < 360)) | (df_wait.SACTMIN.isnull())]
# Removing outliers from posted (attraction closed at -999)
df_wait = df_wait[(df_wait.SPOSTMIN >= -998.99) | (df_wait.SPOSTMIN.isnull())]

df_wait['date'] = pd.to_datetime(df_wait.date, format = '%m/%d/%Y')
df_wait['datetime'] = pd.to_datetime(df_wait.datetime, format = '%Y-%m-%d %H:%M:%S')

print(f"Removed {len(df_wait_raw) - len(df_wait)} rows")

# You could split the dataset into two seperate dataframes (plusjes & minnetjes :-))
df_wait_act = df_wait[~df_wait.SACTMIN.isnull()].drop('SPOSTMIN', axis = 1)
df_wait_post = df_wait[~df_wait.SPOSTMIN.isnull()].drop('SACTMIN', axis = 1)

attractions = df_wait.attraction.unique()

Removed 1318703 rows


In [7]:
# Extract "minutes from df_wait" from the datetime column
df_wait_post['minute'] = df_wait_post['datetime'].dt.hour * 60 + df_wait_post['datetime'].dt.minute

In [13]:
df_wait_post['attraction'].unique()

array(['7_dwarfs_train', 'alien_saucers', 'astro_orbiter', 'barnstormer',
       'big_thunder_mtn', 'buzz_lightyear', 'carousel_of_progress',
       'country_bears', 'dinosaur', 'dumbo', 'enchanted_tiki_rm',
       'expedition_everest', 'flight_of_passage', 'hall_of_presidents',
       'haunted_mansion', 'it_s_a_small_world', 'jungle_cruise',
       'kilimanjaro_safaris', 'laugh_floor', 'liberty_sq_riverboat',
       'mad_tea_party', 'magic_carpets', 'navi_river', 'peoplemover',
       'peter_pan_s_flight', 'philharmagic', 'pirates_of_caribbean',
       'pirate_s_adventure', 'princess_hall__cinderella_elena',
       'princess_hall__rapunzel_tiana', 'regal_carrousel',
       'rock_n_rollercoaster', 'slinky_dog', 'soarin',
       'sorcerers_of_the_mk', 'spaceship_earth', 'space_mountain',
       'splash_mountain', 'swiss_family_tree', 'tom_land_speedway',
       'tom_sawyer_island', 'town_sq_mickey', 'toy_story_mania',
       'under_the_sea', 'winnie_the_pooh'], dtype=object)

In [10]:
from tqdm import tqdm
groups = list()
for date, group in tqdm(list(df_wait_post.groupby('date'))):
    time_shifted = group.datetime - datetime.timedelta(hours = 3)
    group['minute'] = 3 * 60 + time_shifted.dt.hour * 60 + time_shifted.dt.minute

    # Average out duplicate minutes
    group = group.drop(['datetime', 'attraction', 'date'], axis = 1).groupby(['minute'])['SPOSTMIN'].mean().reset_index()

    new_index = np.arange(0, 27 * 60 + 1, 30)  # Range from 0 to 1620 with a step of 30

    # Step 2: Reindex the DataFrame
    # Set the 'minute' column as the index
    group = group.set_index('minute')

    # Step 3: Reindex to the new index and interpolate
    resampled = group.reindex(np.unique(np.concatenate([new_index, group.index]))).interpolate(method='linear')
    resampled = resampled.loc[new_index]

    # Step 4: Add zeros at the endpoints
    #resampled.loc[0] = 0  # Set the first value to zero
    #resampled.loc[1620] = 0  # Set the last value to zero

    # Step 5: Reset index if needed
    resampled = resampled.reset_index()

    resampled['SPOSTMIN'] = resampled['SPOSTMIN'].fillna(0)
    resampled['SPOSTMIN'] = (resampled['SPOSTMIN'] + 2.5) // 5 * 5
    resampled.insert(0, 'date', date)
    resampled.insert(0, 'attraction', attraction)
    groups.append(resampled)

100%|██████████| 2335/2335 [00:11<00:00, 210.71it/s]


In [11]:
df_netjes = pd.concat(groups, ignore_index=True)

In [12]:
df_netjes.to_csv('dataset_Disney_clean.csv')