In [38]:
import os
import glob
import pandas as pd
import numpy as np
import scipy.stats
import plotly.express as px

In [39]:
file_path = r'C:/Users/Ryan/Documents/BootCamp/Final_Project/Disney_wait_times/Wait_Time_Data/'

In [40]:
# Find all CSV files containing data from file_path and add to a list

all_files = glob.glob(os.path.join(file_path, '*.csv'))

In [41]:
# Concatenate CSV files from list into a single dataframe

wait_time_df = pd.concat((pd.read_csv(f) for f in all_files),)

In [42]:
wait_time_df.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN,ride_name
0,1/1/2015,1/1/2015 7:51,,45.0,7_dwarfs_train
1,1/1/2015,1/1/2015 8:02,,60.0,7_dwarfs_train
2,1/1/2015,1/1/2015 8:05,54.0,,7_dwarfs_train
3,1/1/2015,1/1/2015 8:09,,60.0,7_dwarfs_train
4,1/1/2015,1/1/2015 8:16,,60.0,7_dwarfs_train


In [43]:
wait_time_df.dtypes

date          object
datetime      object
SACTMIN      float64
SPOSTMIN     float64
ride_name     object
dtype: object

In [44]:
wait_time_df = wait_time_df.rename(columns={'SACTMIN': 'sactmin', 'SPOSTMIN': 'spostmin'})

In [45]:
wait_time_est = wait_time_df.dropna(subset=['spostmin'])
wait_time_est.head()

Unnamed: 0,date,datetime,sactmin,spostmin,ride_name
0,1/1/2015,1/1/2015 7:51,,45.0,7_dwarfs_train
1,1/1/2015,1/1/2015 8:02,,60.0,7_dwarfs_train
3,1/1/2015,1/1/2015 8:09,,60.0,7_dwarfs_train
4,1/1/2015,1/1/2015 8:16,,60.0,7_dwarfs_train
6,1/1/2015,1/1/2015 8:23,,60.0,7_dwarfs_train


In [46]:
wait_time_actuals = wait_time_df.dropna(subset=['sactmin'])
wait_time_actuals.head()

Unnamed: 0,date,datetime,sactmin,spostmin,ride_name
2,1/1/2015,1/1/2015 8:05,54.0,,7_dwarfs_train
5,1/1/2015,1/1/2015 8:22,55.0,,7_dwarfs_train
177,1/2/2015,1/2/2015 12:20,160.0,,7_dwarfs_train
264,1/2/2015,1/2/2015 21:49,65.0,,7_dwarfs_train
281,1/2/2015,1/3/2015 0:44,19.0,,7_dwarfs_train


In [47]:
wait_time_est.count()

date         2345606
datetime     2345606
sactmin            0
spostmin     2345606
ride_name    2345606
dtype: int64

In [48]:
wait_time_actuals.count()

date         69883
datetime     69883
sactmin      69883
spostmin         0
ride_name    69883
dtype: int64

In [49]:
# Dropping columns in each dataframe that at this point contain all null values

wait_time_est = wait_time_est.drop(columns='sactmin')
wait_time_actuals = wait_time_actuals.drop(columns='spostmin')

In [50]:
# Rows coded as -999 in estimated wait times dropped (ride not in service)

wait_time_est = wait_time_est.drop(wait_time_est.loc[wait_time_est['spostmin'] == -999].index)

In [51]:
# Dropping extreme values that appear to be data entry mistakes/unrealistic wait times

wait_time_actuals = wait_time_actuals.drop(wait_time_actuals.loc[wait_time_actuals['sactmin'] == -92918].index)
wait_time_actuals = wait_time_actuals.drop(wait_time_actuals.loc[wait_time_actuals['sactmin'] == 1511].index)
wait_time_actuals = wait_time_actuals.drop(wait_time_actuals.loc[wait_time_actuals['sactmin'] == 952].index)


In [52]:
wait_time_est['spostmin'].describe()

count    1.412810e+06
mean     4.445371e+01
std      3.047277e+01
min      0.000000e+00
25%      2.000000e+01
50%      4.000000e+01
75%      6.000000e+01
max      3.300000e+02
Name: spostmin, dtype: float64

In [53]:
wait_time_actuals['sactmin'].describe()

count    69879.000000
mean        21.333290
std         16.842076
min          0.000000
25%          9.000000
50%         18.000000
75%         29.000000
max        217.000000
Name: sactmin, dtype: float64

In [54]:
# Converting datetime column to datetime data type

wait_time_actuals['datetime'] = pd.to_datetime(wait_time_actuals['datetime'])

In [55]:
wait_time_actuals.dtypes

date                 object
datetime     datetime64[ns]
sactmin             float64
ride_name            object
dtype: object

In [56]:
# Splitting year and month off of datetime for further EDA

wait_time_actuals['year'] = pd.DatetimeIndex(wait_time_actuals['datetime']).year
wait_time_actuals['month'] = pd.DatetimeIndex(wait_time_actuals['datetime']).month

In [57]:
wait_time_actuals.head()

Unnamed: 0,date,datetime,sactmin,ride_name,year,month
2,1/1/2015,2015-01-01 08:05:00,54.0,7_dwarfs_train,2015,1
5,1/1/2015,2015-01-01 08:22:00,55.0,7_dwarfs_train,2015,1
177,1/2/2015,2015-01-02 12:20:00,160.0,7_dwarfs_train,2015,1
264,1/2/2015,2015-01-02 21:49:00,65.0,7_dwarfs_train,2015,1
281,1/2/2015,2015-01-03 00:44:00,19.0,7_dwarfs_train,2015,1


In [58]:
wait_time_actuals.shape

(69879, 6)

In [59]:
wait_time_actuals.groupby('year')['sactmin'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015,9517.0,18.861301,15.834835,0.0,8.0,15.0,25.0,160.0
2016,8164.0,19.095664,16.166044,0.0,8.0,16.0,26.0,192.0
2017,7428.0,20.982633,17.842144,0.0,9.0,17.0,28.0,160.0
2018,7872.0,22.07279,17.963936,0.0,10.0,18.0,29.0,188.0
2019,7527.0,22.885213,19.071661,0.0,10.0,19.0,30.0,217.0
2020,4684.0,24.743809,18.281827,0.0,12.0,21.0,32.0,193.0
2021,24687.0,21.775671,15.477076,0.0,10.0,19.0,30.0,141.0


In [60]:
wait_time_actuals.groupby('ride_name')['sactmin'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ride_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7_dwarfs_train,7634.0,36.166623,22.950125,0.0,21.0,31.0,46.0,217.0
big_thunder_mtn,11729.0,18.251513,12.913191,0.0,9.0,16.0,25.0,125.0
haunted_mansion,10886.0,16.264193,12.53142,0.0,7.0,13.0,22.0,108.0
it_s_a_small_world,7105.0,13.604785,10.813506,0.0,5.0,12.0,19.0,90.0
peter_pan_s_flight,5356.0,24.087005,15.670876,0.0,14.0,21.0,31.0,192.0
pirates_of_caribbean,11589.0,18.167141,13.013274,0.0,8.0,16.0,26.0,101.0
space_mountain,8705.0,24.409075,17.81183,0.0,12.0,21.0,33.0,193.0
splash_mountain,6875.0,25.430836,18.749842,0.0,11.0,22.0,36.0,156.0


In [61]:
wait_time_actuals['ride_name'].value_counts()

big_thunder_mtn         11729
pirates_of_caribbean    11589
haunted_mansion         10886
space_mountain           8705
7_dwarfs_train           7634
it_s_a_small_world       7105
splash_mountain          6875
peter_pan_s_flight       5356
Name: ride_name, dtype: int64

In [62]:
# Distribution of actual wait time data from 2015 to 2021 after dropping three impossible values above

fig = px.histogram(wait_time_actuals, x='sactmin', title='Distribution of Wait Times for Eight Magic Kingdom Rides(2015-2021)')
fig.show()

In [63]:
normal_test = scipy.stats.shapiro
norm_test_act = wait_time_actuals.groupby('year')['sactmin'].apply(normal_test)
norm_test_act2 = norm_test_act.apply(pd.Series, index=['stat', 'p'])
norm_test_act2['normal'] = np.where(norm_test_act2['p']<0.05, 'not normal', 'normal')

norm_test_act2



p-value may not be accurate for N > 5000.



Unnamed: 0_level_0,stat,p,normal
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,0.860712,0.0,not normal
2016,0.843426,0.0,not normal
2017,0.829126,0.0,not normal
2018,0.83317,0.0,not normal
2019,0.834634,0.0,not normal
2020,0.879533,0.0,not normal
2021,0.925189,0.0,not normal


In [64]:
Q1 = wait_time_actuals['sactmin'].quantile(0.25)
Q3 = wait_time_actuals['sactmin'].quantile(0.75)
IQR = Q3 - Q1

In [65]:
outliers = wait_time_actuals['sactmin'] > (Q3 + 1.5 * IQR)
outliers.value_counts()


False    67577
True      2302
Name: sactmin, dtype: int64