In [3]:
import pandas as pd
import numpy as np
import sqlalchemy
import matplotlib.pyplot as plt
import plotly.express as px
import csv
from pathlib import Path

In [10]:
# Project root: the parent of the directory the notebook is run from.
BASE = Path().resolve().parent

# Raw inputs and cleaned outputs live in sibling folders under the root.
DATA_RAW = BASE / "data_raw"
DATA_CLEAN = BASE / "data_clean"

# Sanity check: report whether each data folder is present, and where.
for label, folder in (("RAW exists:", DATA_RAW), ("CLEAN exists:", DATA_CLEAN)):
    print(label, folder.exists(), "→", folder)

RAW exists: True → /Users/milan91/Theme_Parks/data_raw
CLEAN exists: True → /Users/milan91/Theme_Parks/data_clean


In [12]:
# Helper to detect delimiter
def sniff_sep(path, sample_size=2048, default=","):
    """Guess the field delimiter of a delimited text file.

    Reads at most ``sample_size`` characters from the start of ``path``
    (decoded as UTF-8, with undecodable bytes replaced) and lets
    ``csv.Sniffer`` choose between comma, semicolon, tab and pipe.

    Parameters
    ----------
    path : str or os.PathLike
        File to inspect.
    sample_size : int, default 2048
        Maximum number of characters to read for sniffing.
    default : str, default ","
        Delimiter returned when the sniffer cannot decide.

    Returns
    -------
    str
        The detected delimiter, or ``default`` if detection fails.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # Read only the leading sample rather than loading the whole file.
    with open(path, encoding="utf-8", errors="replace") as handle:
        sample = handle.read(sample_size)

    try:
        return csv.Sniffer().sniff(sample, delimiters=",;\t|").delimiter
    except csv.Error:
        # Raised when the sniffer cannot determine a delimiter
        # (e.g. empty or single-column sample); fall back instead of failing.
        return default

In [11]:
# Load the cleaned dimension and fact tables from Parquet.
# Files are read in the order listed and unpacked to one frame per table.
dim_park, dim_ride, fact_attendance, fact_wait = (
    pd.read_parquet(DATA_CLEAN / filename)
    for filename in (
        "dim_park.parquet",
        "dim_ride.parquet",
        "fact_attendance.parquet",
        "fact_wait.parquet",
    )
)

In [35]:
# Load the raw weather export, detecting its delimiter first.
weather_path = DATA_RAW / "weather_data.csv"
sep = sniff_sep(weather_path)
weather = pd.read_csv(weather_path, sep=sep, encoding="utf-8")

# Normalise header names: trim surrounding whitespace and upper-case them.
weather.columns = [column.strip().upper() for column in weather.columns]

In [37]:
# DT_ISO values look like "1999-01-01 00:00:00 +0000 UTC"; the first 19
# characters are the "YYYY-MM-DD HH:MM:SS" part.
weather["DT_CLEAN"] = weather["DT_ISO"].str[:19]

# Parse the trimmed text into timestamps; values that do not match the
# format become NaT instead of raising.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
weather["DATETIME"] = pd.to_datetime(weather["DT_CLEAN"], format=TIMESTAMP_FORMAT, errors="coerce")

# Calendar date of each timestamp, used as the key for later joins.
weather["DATE"] = weather["DATETIME"].dt.date

In [54]:
# Preview the first rows to check the derived DT_CLEAN, DATETIME and DATE columns.
weather.head()

Unnamed: 0,DT,DT_ISO,TIMEZONE,CITY_NAME,LAT,LON,TEMP,VISIBILITY,DEW_POINT,FEELS_LIKE,...,SNOW_1H,SNOW_3H,CLOUDS_ALL,WEATHER_ID,WEATHER_MAIN,WEATHER_DESCRIPTION,WEATHER_ICON,DT_CLEAN,DATETIME,DATE
0,915148800,1999-01-01 00:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.33,,3.39,5.28,...,,,8,800,Clear,sky is clear,01n,1999-01-01 00:00:00,1999-01-01 00:00:00,1999-01-01
1,915152400,1999-01-01 01:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,3.54,5.18,...,,,6,800,Clear,sky is clear,01n,1999-01-01 01:00:00,1999-01-01 01:00:00,1999-01-01
2,915156000,1999-01-01 02:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,4.11,5.38,...,,,14,801,Clouds,few clouds,02n,1999-01-01 02:00:00,1999-01-01 02:00:00,1999-01-01
3,915159600,1999-01-01 03:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,7.31,,3.73,4.42,...,,,39,802,Clouds,scattered clouds,03n,1999-01-01 03:00:00,1999-01-01 03:00:00,1999-01-01
4,915163200,1999-01-01 04:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,6.91,,3.53,4.0,...,,,52,803,Clouds,broken clouds,04n,1999-01-01 04:00:00,1999-01-01 04:00:00,1999-01-01


In [46]:
# Attach park attributes to each attendance row (left join on PARK_ID, so
# every attendance record is kept). Weather is not joined in this step.
att_pd = pd.merge(fact_attendance, dim_park, how="left", on="PARK_ID")

In [55]:
# Preview the attendance table after joining park attributes on PARK_ID.
att_pd.head()

Unnamed: 0,PARK_ID,DATE,ATTENDANCE,PARK
0,2,2018-06-01,46804,PortAventura World
1,1,2018-06-01,20420,Tivoli Gardens
2,2,2018-06-02,57940,PortAventura World
3,1,2018-06-02,29110,Tivoli Gardens
4,2,2018-06-03,44365,PortAventura World


In [57]:
# Capacity utilization: guests carried as a fraction of ride capacity

In [58]:
# Utilisation per wait-time record = guests carried / capacity.
# Records with non-positive capacity are masked to NaN before dividing:
# a zero capacity would otherwise produce ±inf, and a single inf makes the
# ride's mean infinite and pushes it to the top of the ranking.
valid_capacity = fact_wait["CAPACITY"].where(fact_wait["CAPACITY"] > 0)
fact_wait["UTIL"] = fact_wait["GUEST_CARRIED"] / valid_capacity

# Mean utilisation per ride (NaN records are skipped by mean), with the
# ride name attached from dim_ride.
util = (
    fact_wait.groupby("RIDE_ID")["UTIL"]
    .mean()
    .reset_index()
    .merge(dim_ride[["RIDE_ID", "RIDE"]], on="RIDE_ID")
)

# Ten rides with the highest mean utilisation; rides whose mean is NaN
# (no record with positive capacity) sort last.
top10 = util.sort_values("UTIL", ascending=False).head(10)