In [None]:
from datetime import datetime
from importlib.metadata import version

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from scipy.interpolate import UnivariateSpline
from scipy.spatial.distance import euclidean

from environment import dh, pio_renderer

In [None]:
# Use the renderer supplied by the local `environment` module when one is set;
# otherwise Plotly's default renderer is left unchanged.
if pio_renderer is not None:    
    pio.renderers.default = pio_renderer

In [None]:
# Name of the Digital Hub project from which the data items are loaded.
PROJECT_NAME = "AreaVerde"  

In [None]:
# NOTE: loading the data from the Digital Hub requires digitalhub version 0.8.1.
# Display the installed version so that it can be checked.
version('digitalhub')       

In [None]:
# ===========
# Load data
# ===========
project = dh.get_or_create_project(PROJECT_NAME)

gates = project.get_dataitem("gates").as_df()
data = project.get_dataitem("gate_data").as_df()

# Join the gate attributes (coordinates) onto every measurement row; each row
# may match at most one gate. The descriptive gate columns are dropped.
gate_data = (
    data
    .merge(gates, on="gate", how="left", validate="many_to_one")
    .drop(columns=["ID", "Indirizzo", "Settore", "Link google maps"])
)

# "Data" is converted back to datetime values
gate_data["Data"] = pd.to_datetime(gate_data["Data"])

## **Gate Similarity**

In [None]:
# Total count per timestamp and gate, reshaped to one column per gate;
# timestamp/gate pairs without a record are filled with 0.
hourly_total_vehicle = (
    gate_data[["Data", "gate", "count"]]
    .groupby(["Data", "gate"], as_index=False)
    .sum()
    .pivot_table(index="Data", columns="gate", values="count", fill_value=0)
    .astype(int)
)

# Mean count per hour of day for every gate, truncated to integers.
average_hourly_vehicle_flow = (
    hourly_total_vehicle
    .groupby(hourly_total_vehicle.index.hour)
    .mean()
    .astype(int)
)


In [None]:
# Gates for which we have no data in the selected period.
problematic_gates = [
    'Colombo', 'Della Pietra', 'Di Vittorio',
    'San Mamolo', 'Terrapieno', 'Togliatti', 'Toscana'
]

In [None]:
# Average hourly profiles of the gates listed as problematic.
problematic_hourly_vehicle_flow = average_hourly_vehicle_flow.loc[:, problematic_gates]

# All remaining gates are kept as the reference set.
average_hourly_vehicle_flow_ref = average_hourly_vehicle_flow.drop(problematic_gates, axis=1)



In [None]:
# For every problematic gate, find the reference gate whose average hourly
# profile is closest in Euclidean distance.
similarity = {}   # problematic gate -> closest reference gate
df_data = []      # one [problematic gate, reference gate, distance] row per problematic gate

n_problematic = len(problematic_hourly_vehicle_flow.columns)
n_ok = len(average_hourly_vehicle_flow_ref.columns)
# mat_distances[i, j] is the distance between problematic gate i and reference gate j
mat_distances = np.zeros(shape=(n_problematic, n_ok))

for i, problematic_gate in enumerate(problematic_hourly_vehicle_flow.columns):
    # Does not change in the inner loop, so it is extracted once per problematic gate.
    problematic_gate_vector = problematic_hourly_vehicle_flow[problematic_gate].values

    gate_distances = []
    for j, gate in enumerate(average_hourly_vehicle_flow_ref.columns):
        gate_vector = average_hourly_vehicle_flow_ref[gate].values
        dist = euclidean(gate_vector, problematic_gate_vector)
        gate_distances.append((gate, dist))
        mat_distances[i, j] = dist

    # On ties, min() keeps the first reference gate in column order.
    min_distance_gate = min(gate_distances, key=lambda x: x[1])
    similarity[problematic_gate] = min_distance_gate[0]
    df_data.append([problematic_gate, min_distance_gate[0], min_distance_gate[1]])

min_distances_df = pd.DataFrame(df_data, columns=['Problematic Gate', 'Reference Gate', 'Euclidean Distance'])



In [None]:
# Closest reference gate, and its distance, for each problematic gate
min_distances_df

In [None]:
# Heatmap of all the distances: one row per problematic gate, one column per
# reference gate. Tick labels, axis labels and a title make the figure
# readable on its own.
ax = sns.heatmap(
    mat_distances,
    xticklabels=list(average_hourly_vehicle_flow_ref.columns),
    yticklabels=list(problematic_hourly_vehicle_flow.columns),
)
ax.set(
    title="Euclidean distance between average hourly flow profiles",
    xlabel="Reference gate",
    ylabel="Problematic gate",
);

## **In-flow Estimation**

In [None]:
# ============
# Filter dates
# ============
# Bounds of the period used for the in-flow estimation.
# NOTE(review): end_date is midnight at the start of 2024-07-31, so a timestamp
# slice ending at it leaves out the remaining hours of that day — confirm that
# this is intended.
start_date = datetime(2024,6,1)
end_date = datetime(2024,7,31)

In [None]:
# Total count per timestamp and gate, one column per gate (missing pairs -> 0).
hourly_total_vehicle = (
    gate_data[["Data", "gate", "count"]]
    .groupby(["Data", "gate"], as_index=False)
    .sum()
    .pivot_table(index="Data", columns="gate", values="count", fill_value=0)
    .astype(int)
)

# Restrict to the rows between start_date and end_date.
hourly_vehicle_data = hourly_total_vehicle.loc[start_date:end_date]

# Mean count per hour of day within that window, truncated to integers.
average_hourly_vehicle_flow = (
    hourly_vehicle_data
    .groupby(hourly_vehicle_data.index.hour)
    .mean()
    .astype(int)
)

In [None]:
# NOTE: for some gates we do not have data in the selected period.
# Count, per gate, the hours of the day whose average flow is 0.
(average_hourly_vehicle_flow==0).sum()

In [None]:
def predict_missing_gate(df, similar_gates, smoothness=1.5):
    """Fill the columns of gates without data from their most similar gates.

    For every ``target -> source`` pair, a smoothing spline is fitted to the
    24 hourly values of the source gate, and the spline evaluated at the same
    24 hours is stored under the target gate's name.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per gate and 24 rows, taken in order as hours 0..23.
    similar_gates : dict
        Maps each gate to fill (key) to the gate whose profile is used (value).
    smoothness : float, default 1.5
        Smoothing factor, passed to ``UnivariateSpline`` as ``s``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which one column per key of ``similar_gates`` has
        been added or overwritten. The frame passed in is not modified.
    """
    time = np.arange(0, 24, 1)
    # Work on a copy: writing the new columns into the caller's frame would
    # silently change a variable that other notebook cells still refer to.
    predicted = df.copy()
    for target_gate, source_gate in similar_gates.items():
        # Read from the working copy so that a source gate filled in by an
        # earlier pair is picked up, as it was when the input was modified.
        data = predicted[source_gate].values.flatten()
        spline = UnivariateSpline(time, data, s=smoothness)
        predicted[target_gate] = spline(time)
    return predicted

In [None]:
# ===================================================
# Resample an hourly profile to 5-minute resolution
# ===================================================
def extend_to_5min(df):
    """Interpolate a 24-value hourly profile onto 288 five-minute slots.

    The value of hour 0 is repeated at hour 24 to close the day, the hours are
    mapped to slots 0, 12, ..., 288 and the gaps are filled by quadratic
    interpolation. If the interpolated curve goes below zero, the whole curve
    is shifted up so that its minimum is 0. The values are then divided by 12
    and the closing slot (288) is dropped.

    Parameters
    ----------
    df : pandas.Series
        Hourly values labelled 0..23. It is not modified.

    Returns
    -------
    numpy.ndarray
        The 288 five-minute values.
    """
    slots_per_hour = 12
    hours_per_day = 24

    hourly = df.copy()
    # Close the cycle: the end of the day takes the value of hour 0.
    hourly[hours_per_day] = hourly[0]
    hourly.index = hourly.index * slots_per_hour

    slot_grid = range(slots_per_hour * hours_per_day + 1)
    per_slot = hourly.reindex(slot_grid).interpolate(method='quadratic')

    lowest = per_slot.min()
    if lowest < 0:
        per_slot = per_slot - lowest

    per_slot = per_slot / slots_per_hour
    # Drop the closing slot, which duplicates slot 0.
    return per_slot.iloc[:-1].values

In [None]:
# `similarity` maps each gate with missing values (key) to its most similar gate (value).
# The similarity is based on the daily pattern of vehicle flow over all the available data,
# not only over the date range selected above.
predicted_average_hourly_vehicle_flow = predict_missing_gate(average_hourly_vehicle_flow,similarity)


In [None]:
# Truncate the predicted flows to integers, then add up all gates for each hour of the day.
average_hourly_vehicle_flow_filtered = predicted_average_hourly_vehicle_flow.astype(int)
average_hourly_vehicle_flow_filtered_sumed = average_hourly_vehicle_flow_filtered.sum(axis=1)

In [None]:
# ======================================
# Plotting the averaged in-flow traffic
# ======================================
total_flow = extend_to_5min(average_hourly_vehicle_flow_filtered_sumed)

fig = px.line(total_flow)
# One x-axis tick every 10 five-minute slots
fig.update_layout(xaxis={"dtick": 10})
fig.show()

In [None]:
# Dates that are labelled as holidays (year 2024).
holiday_list = ['2024-01-01', '2024-01-06', '2024-03-31', '2024-04-01',
                '2024-04-25', '2024-05-01', '2024-06-02', '2024-08-15',
                '2024-11-01', '2024-12-08', '2024-12-25', '2024-12-26']

holidays = pd.to_datetime(holiday_list).date
hourly_vehicle = hourly_vehicle_data.reset_index()
hourly_vehicle['Data'] = pd.to_datetime(hourly_vehicle['Data'])
hourly_vehicle['Hour'] = hourly_vehicle['Data'].dt.hour

# Label every row with its day type. The assignments are applied in order, so
# a later one overrides an earlier one: a holiday that falls on a Saturday or
# Sunday ends up labelled 'Holiday'.
hourly_vehicle['DayType'] = 'Weekday'
hourly_vehicle.loc[hourly_vehicle['Data'].dt.weekday == 5, 'DayType'] = 'Saturday'
hourly_vehicle.loc[hourly_vehicle['Data'].dt.weekday == 6, 'DayType'] = 'Sunday'
hourly_vehicle.loc[hourly_vehicle['Data'].dt.date.isin(holidays), 'DayType'] = 'Holiday'


In [None]:

day_types = ['Weekday', 'Saturday', 'Sunday', 'Holiday']

# Gate columns only. They do not depend on the day type, so they are computed once.
numeric_columns = hourly_vehicle.columns.difference(['Data', 'Hour', 'DayType'])

flows_by_day = {}   # day type -> Series with the 288 five-minute flow values
day_frames = []     # one long-format frame per day type, concatenated once below

for day in day_types:
    day_data = hourly_vehicle[hourly_vehicle['DayType'] == day]
    # Average per hour of day, truncated to integers
    hourly_avg = day_data.groupby('Hour')[numeric_columns].mean().astype(int)
    predicted_hourly_avg = predict_missing_gate(hourly_avg, similarity)
    to_extract = pd.Series(
        extend_to_5min(predicted_hourly_avg.sum(axis=1)),
        index=range(0, 12 * 24)
    )
    flows_by_day[day] = to_extract
    day_frames.append(pd.DataFrame({
        "Minute": to_extract.index,
        "Flow": to_extract.values,
        "DayType": day
    }))

# A single concat avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty DataFrame.
plot_df = pd.concat(day_frames, ignore_index=True)

# The "Minute" column holds the index of the 5-minute slot (0..287), so the
# axis is labelled accordingly.
fig = px.line(
    plot_df,
    x='Minute',
    y='Flow',
    color='DayType',
    title='Traffic Flow by Day Type',
    labels={'Minute': '5-min Intervals'},
)
fig.show()


In [None]:
# Weighted average of the day-type profiles: 5/7 weekday, 1/7 Saturday, 1/7 holiday.
# NOTE(review): 'Sunday' is one of the day types in flows_by_day but is not used
# here; the 'Holiday' profile takes the remaining 1/7 — confirm that this is intended.
combined_flow = (5/7) * flows_by_day['Weekday'] + (1/7) * flows_by_day['Saturday'] + (1/7) * flows_by_day['Holiday']


In [None]:
# Compare the day-type profiles with the overall average profile and with the
# weighted average, all on the same 5-minute grid.
x = list(range(0, 12 * 24))

fig = go.Figure()

# One solid line per day-type profile; 'Sunday' is not plotted.
for day_type in ('Weekday', 'Saturday', 'Holiday'):
    fig.add_trace(go.Scatter(x=x, y=flows_by_day[day_type], mode='lines', name=day_type))

fig.add_trace(go.Scatter(x=x, y=total_flow, mode='lines', name='total_flow'))
# Dashed so that the weighted average stands out from its components.
fig.add_trace(go.Scatter(x=x, y=combined_flow, mode='lines', name='WeightedAvg', line=dict(dash='dash')))

fig.update_layout(title='Traffic Flow Comparison', xaxis_title='5-min Intervals', yaxis_title='Flow')
fig.show()
