In [2]:
import polars as pl
import plotly.express as px
import os
#  Dataset has 10 unique customers & locations, 92 unique customer/location pairs
#  dropped the HOUR and MINUTE fields, data grouped by ID, DATE, LONG/LAT
#----- LOAD AND CLEAN THE DATASET


# Candidate values for the `map_style` argument of the map figure built later
# in this cell. Kept as a list so a different style can be swapped in quickly.
map_styles = [
    'basic',
    'carto-darkmatter',
    'carto-darkmatter-nolabels',
    'carto-positron',
    'carto-positron-nolabels',
    'carto-voyager',
    'carto-voyager-nolabels',
    'dark',
    'light',
    'open-street-map',
    'outdoors',
    'satellite',
    'satellite-streets',
    'streets',
    'white-bg',
]
# Select the style by name (position 9 in the list) rather than by bare index.
my_map_style = map_styles[map_styles.index('open-street-map')]

# Counter metadata (ID -> LOC / NEARBY). It is only joined in when the cache is
# rebuilt, but it is read unconditionally so `df_locations` is bound on both paths.
df_locations = pl.read_excel('df_locations.xlsx')

# The cleaned dataset is cached as parquet next to the notebook. Test that one
# path directly instead of listing the whole working directory and scanning it.
if os.path.exists('df.parquet'):
    print('reading dataset from parquet file')
    df = pl.read_parquet('df.parquet')
else:
    print('reading dataset from csv file')
    df = (
        pl.scan_csv('pistes-cyclables-2024.csv')
        .select(
            ID = pl.col('id_compteur').cast(pl.UInt32),
            DATE = pl.col('date').str.to_date(format='%m/%d/%Y'),
            # Average the coordinates per counter so every row of a given
            # counter carries one single LON/LAT pair.
            LON = pl.col('longitude').mean().over('id_compteur'),  # east-west location,   X
            LAT = pl.col('latitude').mean().over('id_compteur'),   # north-south location, Y
            PASSAGES = pl.col('nb_passages'),
        )
        .filter(pl.col('ID').is_not_null())
        # Collapse to one row per counter and day, summing the passages.
        .group_by(['ID', 'DATE', 'LON', 'LAT']).agg(pl.col('PASSAGES').sum())
        # Total passages of each counter over all of its dates.
        .with_columns(PASSAGES_BY_ID = pl.col('PASSAGES').sum().over('ID'))
        # Downcast both counters in one step. The per-counter total above was
        # computed before the downcast. NOTE: UInt16 tops out at 65,535, so a
        # daily total above that would not fit in PASSAGES.
        .with_columns(
            pl.col('PASSAGES').cast(pl.UInt16),
            pl.col('PASSAGES_BY_ID').cast(pl.UInt32),
        )
        .sort(['ID', 'DATE'])
        .collect()
        # Attach the descriptive location columns; a left join keeps counters
        # that have no entry in the locations sheet.
        .join(
            df_locations.select('ID', 'LOC', 'NEARBY'),
            on='ID',
            how='left'
        )
    )
    df.write_parquet('df.parquet')

print(df)
# create a dashboard to show:
#   slider to filter minimum passages value
# # Convert Polars DataFrame to a dictionary for Plotly
# heatmap_data = df.to_dict(as_series=False)

# Scatter map: one marker per counter, sized and coloured by its total passages.

# Centre the view on the midpoint of the longitude / latitude extents.
center_lon = (df['LON'].min() + df['LON'].max()) / 2
center_lat = (df['LAT'].min() + df['LAT'].max()) / 2

# Keep a single row per counter ID for plotting.
one_row_per_id = df.unique('ID')

# Hover text, one entry per line; indices refer to positions in `custom_data`.
hover_lines = [
    'Location: %{customdata[0]}',
    'Nearby: %{customdata[1]}',
    'Passages: %{customdata[2]:,d}',
    'ID: %{customdata[3]}',
]
# Every line is followed by <br>; the empty <extra> tag closes the template.
hover_template = ''.join(line + '<br>' for line in hover_lines) + '<extra></extra>'

fig = px.scatter_map(
    one_row_per_id,
    lon='LON',
    lat='LAT',
    color='PASSAGES_BY_ID',
    size='PASSAGES_BY_ID',
    custom_data=['LOC', 'NEARBY', 'PASSAGES_BY_ID', 'ID'],
    center={'lat': center_lat, 'lon': center_lon},
    zoom=9,
    opacity=0.75,
    map_style=my_map_style,
)
fig.update_traces(hovertemplate=hover_template)
# Set the title and hide the colour scale in a single layout update.
fig.update_layout(
    title={'text': 'Bicycle traffic by location'},
    coloraxis_showscale=False,
)
fig.show()

reading dataset from parquet file
shape: (10_446, 8)
┌───────────┬────────────┬────────────┬───────────┬──────────┬────────────┬────────────┬───────────┐
│ ID        ┆ DATE       ┆ LON        ┆ LAT       ┆ PASSAGES ┆ PASSAGES_B ┆ LOC        ┆ NEARBY    │
│ ---       ┆ ---        ┆ ---        ┆ ---       ┆ ---      ┆ Y_ID       ┆ ---        ┆ ---       │
│ u32       ┆ date       ┆ f64        ┆ f64       ┆ u16      ┆ ---        ┆ str        ┆ str       │
│           ┆            ┆            ┆           ┆          ┆ u32        ┆            ┆           │
╞═══════════╪════════════╪════════════╪═══════════╪══════════╪════════════╪════════════╪═══════════╡
│ 100001753 ┆ 2024-01-01 ┆ -73.544424 ┆ 45.530216 ┆ 65       ┆ 188124     ┆ Rue        ┆ Rue Notre │
│           ┆            ┆            ┆           ┆          ┆            ┆ Notre-Dame ┆ -Dame     │
│           ┆            ┆            ┆           ┆          ┆            ┆ Est, Ville ┆ Est, Vill │
│           ┆            ┆            

In [3]:
# Count how many rows carry each per-counter total, then show the smallest
# groups first.
passages_by_id_counts = df['PASSAGES_BY_ID'].value_counts()
passages_by_id_counts.sort('count')

PASSAGES_BY_ID,count
u32,u32
58450,15
53668,78
35605,160
872,162
114456,162
…,…
253512,222
127976,222
563878,222
245674,222


In [4]:
# Small demo frame built from the code points 65..90:
#   X - the character for each code point ('A'..'Z')
#   Y - the code point rendered as a string ('65'..'90')
#   Z - the code point as an integer
code_points = range(65, 91)
df_cat = pl.DataFrame({
    'X': list(map(chr, code_points)),
    'Y': list(map(str, code_points)),
    'Z': list(code_points),
})
df_cat

X,Y,Z
str,str,i64
"""A""","""65""",65
"""B""","""66""",66
"""C""","""67""",67
"""D""","""68""",68
"""E""","""69""",69
…,…,…
"""V""","""86""",86
"""W""","""87""",87
"""X""","""88""",88
"""Y""","""89""",89


In [5]:
# Display df_cat with column Y cast to Categorical. The result is only shown;
# df_cat itself is not reassigned.
df_cat.with_columns(Y=pl.col('Y').cast(pl.Categorical))


X,Y,Z
str,cat,i64
"""A""","""65""",65
"""B""","""66""",66
"""C""","""67""",67
"""D""","""68""",68
"""E""","""69""",69
…,…,…
"""V""","""86""",86
"""W""","""87""",87
"""X""","""88""",88
"""Y""","""89""",89


In [26]:
import pandas as pd
# One-off conversion of the raw CSV export to parquet via pandas.
# NOTE(review): this rebinds `df`, which earlier cells bind to the cleaned
# polars frame, to the raw pandas frame. Re-running the earlier polars cells
# after this one will not work as written; consider a distinct name.
raw_csv_path = 'pistes-cyclables-2024.csv'
raw_parquet_path = 'pistes-cyclables-2024.parquet'
df = pd.read_csv(raw_csv_path)
df.to_parquet(raw_parquet_path)
