In [1]:
import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq
import os
import json
from io import StringIO

# --- Configuration ---
# Location of the uploaded Parquet file: it lives in the "Data" subfolder,
# so the path is assembled from the folder name and the file name.
_DATA_DIR = 'Data'
_PARQUET_NAME = '20250820163000_stream.tomtom.analyze-sail.parquet'
FILE_PATH = os.path.join(_DATA_DIR, _PARQUET_NAME)
# Fraction of the data to keep (0.1 = 10%); sampling avoids memory issues.
SAMPLE_FRACTION = 0.1

# --- Helper Function to Decode Data ---
def parse_tomtom_data(value_string):
    """
    Decode one raw '_value' payload into a DataFrame of traffic records.

    The payload is text containing a JSON object, possibly preceded by other
    characters. The JSON object's 'data' field holds the records as a
    CSV-formatted string, which is parsed with pandas.

    Args:
        value_string: The raw payload as ``str``. ``bytes``/``bytearray``
            values are decoded as UTF-8 first (undecodable bytes are
            replaced). Any other type, such as ``None`` or ``NaN`` from a
            null cell, is treated as undecodable.

    Returns:
        A ``pandas.DataFrame`` parsed from the embedded CSV, or ``None`` when
        the payload contains no JSON object, the JSON is invalid, the 'data'
        field is missing or not a string, or the CSV is empty or malformed.
    """
    # Accept binary payloads; errors='replace' keeps a non-UTF-8 prefix from
    # making the whole row undecodable.
    if isinstance(value_string, (bytes, bytearray)):
        value_string = value_string.decode('utf-8', errors='replace')
    # Null cells arrive as None/NaN; they have no .find() and carry no data.
    if not isinstance(value_string, str):
        return None

    # The JSON object may be preceded by other characters, so locate its start.
    start_index = value_string.find('{')
    if start_index == -1:
        return None  # No JSON object found

    try:
        data = json.loads(value_string[start_index:])
    except json.JSONDecodeError:
        return None

    # The actual traffic data is a CSV-formatted string in the 'data' field.
    csv_string = data.get('data') if isinstance(data, dict) else None
    if not isinstance(csv_string, str):
        return None

    try:
        # StringIO lets pandas read the in-memory string like a file.
        return pd.read_csv(StringIO(csv_string))
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # An empty or malformed CSV body is just another undecodable row; it
        # must not abort processing of the remaining rows.
        return None


# --- 1. Load the Data & Process ---
# End-to-end pipeline: sample the Parquet file batch by batch, decode the
# '_value' payloads into one DataFrame, and plot a histogram of traffic levels.
# Any failure is reported with a printed message instead of a traceback.
try:
    print(f"Loading a {SAMPLE_FRACTION*100}% sample from {FILE_PATH} to conserve memory...")
    parquet_file = pq.ParquetFile(FILE_PATH)

    # Sample each record batch as it is read, so only the sampled rows are
    # accumulated rather than every row of the file.
    # NOTE(review): only '_value' is used below. If no later step needs the
    # other columns, passing columns=['_value'] to iter_batches() would cut
    # memory further -- confirm before changing.
    list_of_sampled_dfs = []
    for batch in parquet_file.iter_batches():
        chunk_df = batch.to_pandas()
        list_of_sampled_dfs.append(chunk_df.sample(frac=SAMPLE_FRACTION))

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say
    # what actually went wrong instead.
    if not list_of_sampled_dfs:
        raise ValueError(f"'{FILE_PATH}' contains no row batches to sample.")

    df_raw = pd.concat(list_of_sampled_dfs, ignore_index=True)
    print("Raw data loaded and sampled successfully!")

    # --- 2. Parse the '_value' column ---
    print("Parsing traffic data from the '_value' column...")

    if '_value' not in df_raw.columns:
        raise ValueError(
            f"Expected a '_value' column but found: {list(df_raw.columns)}"
        )

    # Apply the parsing function. Each element is a small DataFrame, or None
    # when the row could not be decoded.
    parsed_dfs = df_raw['_value'].apply(parse_tomtom_data)

    # Drop any rows where parsing failed
    parsed_dfs = parsed_dfs.dropna()

    if parsed_dfs.empty:
        raise ValueError(
            f"None of the {len(df_raw):,} sampled rows contained parseable traffic data."
        )

    # Concatenate the list of DataFrames into one single, clean DataFrame
    df = pd.concat(parsed_dfs.tolist(), ignore_index=True)

    print(f"Successfully parsed {len(df):,} road segment data points.")
    print("\nFirst 5 rows of the parsed data:")
    print(df.head())
    print("\nData columns:")
    print(df.columns)


    # --- 3. Create the Visualization ---
    # Since we have traffic levels for road segments (not lat/lon points),
    # a histogram is used to visualize the overall congestion.
    print("\nGenerating traffic level histogram...")
    fig_histogram = px.histogram(
        df,
        x='traffic_level',
        nbins=50,  # Adjust the number of bins for more/less detail
        title='Distribution of Traffic Levels in Amsterdam (Aug 20th, 2025)',
        labels={'traffic_level': 'Traffic Level (0 = No congestion, 1 = Max congestion)'},
        height=600
    )

    fig_histogram.update_layout(
        bargap=0.1,
        xaxis_title="Traffic Level",
        yaxis_title="Number of Road Segments"
    )

    # --- 4. Show the Histogram ---
    # Render the interactive plot (in the notebook or the web browser,
    # depending on the active Plotly renderer).
    fig_histogram.show()

    print("\nVisualization generated. Check your web browser.")
    print("\nNOTE: To create a map, a separate file mapping 'id' to geographical shapes is needed.")


except FileNotFoundError:
    # Help the user locate the file: show where the script is running and
    # what that directory contains.
    print("--- ERROR: File Not Found ---")
    print(f"The script could not find the file: '{FILE_PATH}'")
    try:
        current_directory = os.getcwd()
        print(f"\nThe script is currently running in this directory: {current_directory}")
        files_in_directory = os.listdir(current_directory)
        print("\nFiles in directory:", files_in_directory)
    except Exception as e:
        print(f"\nCould not list directory contents: {e}")

except Exception as e:
    # Top-level boundary of the script: report the problem (including the
    # explicit ValueErrors raised above) rather than dumping a traceback.
    print(f"An unexpected error occurred: {e}")



Loading a 10.0% sample from Data\20250820163000_stream.tomtom.analyze-sail.parquet to conserve memory...
Raw data loaded and sampled successfully!
Parsing traffic data from the '_value' column...
Successfully parsed 193,356 road segment data points.

First 5 rows of the parsed data:
          id  traffic_level
0  212397051           0.43
1  213398007           0.47
2  210398096           0.44
3  210398119           0.50
4  211398030           0.50

Data columns:
Index(['id', 'traffic_level'], dtype='object')

Generating traffic level histogram...



Visualization generated. Check your web browser.

NOTE: To create a map, a separate file mapping 'id' to geographical shapes is needed.
