In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [3]:
## These are for the interactive plot in this notebook.
# You may have to install the packages first (use pip or conda install)
# Also fine to skip it, if you don't want to run the plot.

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

<div style="background-color: rgba(0, 176, 176, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Load data
</div>

In [None]:
## Load the original dataset with interpolated SPOSTMIN values
df_itp = pd.read_csv(filepath_or_buffer='../data/clean/posted_interpolated.csv')
# Print the number of rows, then show the first rows as the cell output
print(df_itp.shape[0])
df_itp.head()

In [None]:
## Load the original SACTMIN dataset
df_act = pd.read_csv(filepath_or_buffer='../data/clean/actuals_shifted.csv')
# Print the number of rows, then show the first rows as the cell output
print(df_act.shape[0])
df_act.head()

In [None]:
## Load the training dataset
df = pd.read_csv(filepath_or_buffer='../data/clean/training_dataset.csv')
# Print the number of rows, then show the first rows as the cell output
print(df.shape[0])
df.head()


In [None]:
## Load the metadata, reading only the date-related columns listed in usecols
df_meta = pd.read_csv(
    '../data/overview data/metadata.csv',
    sep=',',
    usecols=['DATE', 'DAYOFWEEK', 'DAYOFYEAR', 'WEEKOFYEAR', 'MONTHOFYEAR', 'YEAR', 'SEASON'],
)
# Show five randomly drawn rows
df_meta.sample(n=5)

<div style="background-color: rgba(0, 176, 176, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Cleaning - Merging
</div>

In [None]:
## Check quality of actual_over_posted values
print(f"NaN count: {df['actual_over_posted'].isna().sum()}")
# Compare against 0 explicitly: isnull() is only an alias of isna() and would repeat the NaN count.
print(f"Zero count: {(df['actual_over_posted'] == 0).sum()}")
# np.isinf is True only for +inf / -inf; ~np.isfinite would also count the NaN rows.
print(f"Inf count: {np.isinf(df['actual_over_posted']).sum()}")

In [None]:
## Cleaning: keep only the rows whose actual_over_posted is finite.
## np.isfinite is False for +inf, -inf and NaN, so all of those rows are dropped.
pre = len(df)
is_finite = np.isfinite(df['actual_over_posted'])
df = df.loc[is_finite]
print(f"Removed {pre - len(df)} rows.")

In [10]:
# ## Fill NaN in SEASON with the value 'NO_SEASON'
# df_meta['SEASON']=df_meta['SEASON'].fillna(value='NO_SEASON')

In [11]:
# # Merge df_meta into df
# merged_df = pd.merge(df, df_meta, left_on='date', right_on='DATE', how='inner')
# print(len(df))
# len(merged_df)

In [12]:
# Parse the 'date' column of each data frame into pandas datetime values
for _frame in (df, df_itp, df_act):
    _frame['date'] = pd.to_datetime(_frame['date'])
# The metadata dates are parsed with an explicit month/day/year format
df_meta['DATE'] = pd.to_datetime(df_meta['DATE'], format='%m/%d/%Y')

In [13]:
# Function to extract calendar features from the 'date' column
def extract_features(df):
    """Add calendar feature columns derived from ``df['date']``.

    The columns ``year``, ``month``, ``day``, ``day_of_week``,
    ``week_of_year`` (ISO calendar week), ``quarter``, ``day_of_year`` and
    ``is_weekend`` (1 when ``day_of_week`` is 5 or 6, otherwise 0) are
    written onto ``df`` itself, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that supports the ``.dt`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying the new columns.
    """
    stamps = df['date'].dt
    weekday = stamps.dayofweek

    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['day_of_week'] = weekday
    df['week_of_year'] = stamps.isocalendar().week
    df['quarter'] = stamps.quarter
    df['day_of_year'] = stamps.dayofyear
    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    df['is_weekend'] = weekday.isin([5, 6]).astype(int)
    return df

In [None]:
# Add the calendar feature columns to the training frame (extract_features
# writes them onto df and returns that same frame), then show five random rows
extract_features(df).sample(n=5)

In [None]:
# Add the calendar feature columns to the interpolated frame (extract_features
# writes them onto df_itp and returns that same frame), then show five random rows
extract_features(df_itp).sample(n=5)

In [None]:
# Add the calendar feature columns to the actuals frame (extract_features
# writes them onto df_act and returns that same frame), then show five random rows
extract_features(df_act).sample(n=5)

<div style="background-color: rgba(0, 176, 176, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Basic visualization
</div>

In [None]:
## How many target datapoints do we have per attraction?
# Count the non-null ratio values per attraction and date ...
posted_count = (
    df.groupby(["attraction", "date"])["actual_over_posted"]
    .count()
    .reset_index()
)

# ... then average those per-date counts for each attraction
avg_posted_count = (
    posted_count.groupby("attraction")["actual_over_posted"]
    .mean()
    .reset_index()
)

# One bar per attraction; the x labels are rotated so the attraction names stay readable
bar_fig, bar_ax = plt.subplots(figsize=(15, 6))
bar_ax.bar(avg_posted_count["attraction"], avg_posted_count["actual_over_posted"])
bar_ax.set_xlabel("Attracties")
bar_ax.set_ylabel("Gegevenspunten")
bar_ax.tick_params(axis="x", labelrotation=90)

plt.show()

<div style="background-color: rgba(0, 176, 176, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Model training
</div>

In [18]:
# Features: every column except the target and the raw date, with the
# categorical columns one-hot encoded (first level of each dropped)
feature_frame = df.drop(columns=['actual_over_posted', 'date'])
X = pd.get_dummies(feature_frame, drop_first=True)
# Target: the actual/posted ratio
y = df['actual_over_posted']

In [None]:
# Preview the first ten rows of the one-hot encoded feature matrix
X.head(10)

In [20]:
# Hold out 10% of the rows as the test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
)

In [21]:
# Function to fit a model and print metrics
def run_model(model,X_train,y_train,X_test,y_test):
    """Fit ``model`` on the training data and print its test-set MAE and RMSE.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Features and targets passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the targets the
        predictions are scored against.

    Returns
    -------
    None
        The two metrics are only printed.
    """
    # Fit the model
    model.fit(X_train, y_train)

    # Compute the metrics on the held-out data
    y_pred = model.predict(X_test)
    scores = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
    }
    for label, score in scores.items():
        print(f'{label} : {score}')

In [None]:
## Let's try linear regression
model_lr = LinearRegression()
run_model(
    model=model_lr,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
## Let's try random forest
# The forest is built from random bootstrap samples; fixing random_state (same
# value as the train/test split) makes the printed metrics reproducible.
model_rf = RandomForestRegressor(n_estimators=10, random_state=101)
run_model(model_rf,X_train,y_train,X_test,y_test)

In [None]:
## Let's try Adaboost
# Boosting resamples the training data randomly; fixing random_state (same
# value as the train/test split) makes the printed metrics reproducible.
model_ada = AdaBoostRegressor(random_state=101)
run_model(model_ada,X_train,y_train,X_test,y_test)

In [25]:
## Choose which of the models fitted above the remaining cells should use.
model = model_lr        # Adapt as you prefer (e.g. model_rf or model_ada).

<div style="background-color: rgba(0, 176, 176, 1); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Check results
</div>

In [26]:
# Build the prediction features from df_itp with exactly the columns, and the
# column order, that the model was trained on
df_itp_dummies = pd.get_dummies(df_itp)
feature_names = X.columns
# Report training columns that df_itp cannot provide (for example a dummy level
# that never occurs in df_itp). Silently leaving them out would hand the model
# fewer columns than it was fitted on and make model.predict fail.
missing_features = [col for col in feature_names if col not in df_itp_dummies.columns]
if missing_features:
    print(f"Filling {len(missing_features)} missing feature column(s) with 0: {missing_features}")
# reindex keeps the training column order and creates any missing column filled with 0
df_features = df_itp_dummies.reindex(columns=feature_names, fill_value=0)


In [27]:
# Predict the actual/posted ratio for every row of the interpolated frame
ratio_predictions = model.predict(df_features)
df_itp['predicted_ratio'] = ratio_predictions
# Scale the posted time (SPOSTMIN) by the predicted ratio to get the predicted actual time
df_itp['predicted_actual_time'] = df_itp['predicted_ratio'].mul(df_itp['SPOSTMIN'])

In [None]:
# Stack the posted/predicted rows (df_itp) and the actuals rows (df_act, without
# its 'datetime' column) into one long frame.
# ignore_index=True already gives the result a fresh 0..n-1 index, so no
# separate reset_index call is needed afterwards.
concatenated_df = pd.concat(
    [df_itp, df_act.drop(columns=['datetime'])],
    ignore_index=True,
    sort=False,
)

concatenated_df.sample(5)

In [None]:
# Initialize Dash app
app = dash.Dash(__name__)


def _labelled_dropdown(label_text, dropdown_id, column):
    """Return ``[html.Label, dcc.Dropdown]`` for one filter of the app.

    The dropdown offers the unique values of ``concatenated_df[column]``,
    starts on the first of them and cannot be cleared.
    """
    choices = concatenated_df[column].unique()
    return [
        html.Label(label_text),
        dcc.Dropdown(
            id=dropdown_id,
            options=[{'label': choice, 'value': choice} for choice in choices],
            value=choices[0],  # Default to the first value found in the column
            clearable=False,
        ),
    ]


# App layout: title, one labelled dropdown per filter, then the graph
app.layout = html.Div([
    html.H1("Predicted Waiting Times", style={'textAlign': 'center'}),
    *_labelled_dropdown("Select year:", 'year-dropdown', 'year'),
    *_labelled_dropdown("Select month:", 'month-dropdown', 'month'),
    *_labelled_dropdown("Select day:", 'day-dropdown', 'day'),
    *_labelled_dropdown("Select Attraction:", 'attraction-dropdown', 'attraction'),
    # Graph output
    dcc.Graph(id='scatter-plot'),
])

# Callback to update graph based on selections
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('year-dropdown', 'value'),
     Input('month-dropdown', 'value'),
     Input('day-dropdown', 'value'),
     Input('attraction-dropdown', 'value')]
)
def update_graph(selected_year, selected_month, selected_day, selected_attraction):
    """Build the figure for the selected date and attraction.

    Rows of ``concatenated_df`` matching all four dropdown values are
    plotted against ``minute``: ``SPOSTMIN`` and ``SACTMIN`` as scatter
    points and ``predicted_actual_time`` as a green line.

    Returns
    -------
    The plotly figure shown in the 'scatter-plot' graph.
    """
    # Keep only the rows that match every selection
    frame = concatenated_df
    row_mask = (
        (frame['year'] == selected_year)
        & (frame['month'] == selected_month)
        & (frame['day'] == selected_day)
        & (frame['attraction'] == selected_attraction)
    )
    filtered_df = frame[row_mask]

    # Scatter of the posted and actual wait times
    axis_labels = {'value': "Wait Time (Minutes)", 'minute': "Minute of the Day"}
    fig = px.scatter(
        filtered_df,
        x='minute',
        y=['SPOSTMIN', 'SACTMIN'],
        labels=axis_labels,
        title=f"Wait Time for {selected_attraction} on {selected_year} - {selected_month} - {selected_day}",
    )

    # Overlay the predicted actual wait time as a line
    fig.add_scatter(
        x=filtered_df['minute'],
        y=filtered_df['predicted_actual_time'],
        mode='lines',
        name='Predicted',
        line={'color': 'green', 'width': 2},
    )

    return fig

# Run the app
if __name__ == '__main__':
    # Dash replaced app.run_server with app.run. Prefer run when the installed
    # Dash provides it and only fall back to run_server on versions that do not.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)


In [34]:
## Prepare export dataset
# Keep the identifying columns, the posted and actual wait times, and the prediction
export_columns = ['attraction', 'date', 'minute', 'SPOSTMIN', 'SACTMIN', 'predicted_actual_time']
export_df = concatenated_df[export_columns]

In [35]:
# Export as csv. The index is not disabled, so the row index is written as the first column.
export_df.to_csv(path_or_buf='../data/clean/waiting_times_with_prediction.csv')