In [116]:
import pandas as pd
import numpy as np
import json
import folium
from datetime import datetime
import os

import yaml

In [117]:
from data_cleaning.json_manipulation import load_json, trim_json_to_locations, gather_bajs_locations, get_bikes_in_stations, calculate_bike_changes, create_changes_columns
from visualization.station_map import populate_map_with_stations

In [118]:
# Load the project configuration. The context manager closes the file handle
# deterministically instead of leaving an anonymous open() to the garbage collector.
with open("../config.YAML") as config_file:
    config = yaml.safe_load(config_file)

In [119]:
# Parse the first test snapshot. The encoding is set explicitly because JSON
# is UTF-8 and the payload contains non-ASCII station names; the `with` block
# closes the file on exit, so no manual close() is needed.
with open("../data/test_JSON_data/bajs_10-11-2025_15-11-31.json", encoding="utf-8") as json_data:
    day_one = json.load(json_data)

In [120]:
day_one

{'countries': [{'lat': 45.8049,
   'lng': 15.9467,
   'zoom': 10,
   'name': 'Bajs Zagreb (Croatia)',
   'hotline': '0038515494477',
   'domain': 'hd',
   'language': 'hr',
   'email': 'bajs@zagreb.hr',
   'timezone': 'Europe/Berlin',
   'currency': 'EUR',
   'country_calling_code': '+385',
   'system_operator_address': 'Sustav javnih bicikala d.o.o., Prisavlje 2, 10000 Zagreb',
   'country': 'HR',
   'country_name': 'Croatia',
   'terms': 'https://bajs.zagreb.hr/hr/uvjetikoristenja/',
   'policy': 'https://bajs.zagreb.hr/hr/pravila-privatnosti/',
   'website': 'https://bajs.zagreb.hr',
   'show_bike_types': False,
   'show_bike_type_groups': False,
   'show_free_racks': False,
   'booked_bikes': 0,
   'set_point_bikes': 2000,
   'available_bikes': 1812,
   'capped_available_bikes': False,
   'no_registration': False,
   'pricing': 'https://bajs.zagreb.hr/hr/',
   'vat': '25',
   'faq_url': 'https://bajs.zagreb.hr',
   'store_uri_android': 'https://play.google.com/store/apps/details?id

In [121]:
trim_json_to_locations(day_one)

[{'uid': 556632908,
  'lat': 45.825299,
  'lng': 16.109747,
  'bike': False,
  'name': 'AUTOBUSNI TERMINAL SESVETE',
  'address': None,
  'spot': True,
  'number': 21200,
  'booked_bikes': 0,
  'bikes': 5,
  'bikes_available_to_rent': 5,
  'active_place': 1,
  'bike_racks': 8,
  'free_racks': 3,
  'special_racks': 0,
  'free_special_racks': 0,
  'maintenance': False,
  'terminal_type': 'free',
  'bike_numbers': ['800463', '800020', '801638', '801088', '801072'],
  'bike_types': {'196': 4, '409': 1},
  'place_type': '0',
  'rack_locks': False},
 {'uid': 556633495,
  'lat': 45.842797,
  'lng': 15.975421,
  'bike': False,
  'name': 'MIHALJEVAC OKRETIŠTE',
  'address': None,
  'spot': True,
  'number': 21201,
  'booked_bikes': 0,
  'bikes': 11,
  'bikes_available_to_rent': 11,
  'active_place': 1,
  'bike_racks': 12,
  'free_racks': 1,
  'special_racks': 0,
  'free_special_racks': 0,
  'maintenance': False,
  'terminal_type': 'free',
  'bike_numbers': ['800480',
   '800247',
   '800185',
 

In [122]:
# Build the station table from the trimmed snapshot and preview its first rows.
locations = gather_bajs_locations(trim_json_to_locations(day_one))
locations.head()

Unnamed: 0,uid,name,lat,lng,no_racks
0,556632908,AUTOBUSNI TERMINAL SESVETE,45.825299,16.109747,8
1,556633495,MIHALJEVAC OKRETIŠTE,45.842797,15.975421,12
2,556634009,ROTOR REMETINEC,45.776795,15.953339,10
3,556637365,ADMIRAL HOTEL,45.795083,15.919185,20
4,556689563,ZAGREBAČKI VELESAJAM,45.77756,15.969703,20


In [123]:
locations.shape
# 165 different stations in Zagreb

(165, 5)

Thunderforest.Neighbourhood
OSMBright
Esri.WorldTopoMap
CartoDB.Voyager

In [124]:
# Base map: fixed initial view position, zoom-out limit and Esri topo tiles.
m = folium.Map(location=(45.8109, 16.0097), min_zoom=11, tiles="Esri.WorldTopoMap")

# Host the map in a fixed-size figure; add_child is the last expression so
# the figure is what the cell displays.
fig = folium.Figure(width=1100, height=500)
fig.add_child(m)

In [125]:
m = populate_map_with_stations(m, locations)


In [126]:
m

## Individual bikes at individual times

In [127]:
# Timestamp assigned to the first test snapshot (same date and time as in
# the name of the file loaded into day_one).
first_dt = datetime(2025, 11, 10, 15, 11, 31)

first_dt

datetime.datetime(2025, 11, 10, 15, 11, 31)

In [128]:
# Bikes recorded at each station in the first snapshot, stamped with first_dt.
time1 = get_bikes_in_stations(time = first_dt,
    loc_list = trim_json_to_locations(day_one))

# Preview the first rows, ordered by station uid.
time1.sort_values("uid").head()

Unnamed: 0,uid,time,bikes_at_station
0,556632908,2025-11-10 15:11:31,"[800463, 800020, 801638, 801088, 801072]"
1,556633495,2025-11-10 15:11:31,"[800480, 800247, 800185, 800183, 800106, 80005..."
2,556634009,2025-11-10 15:11:31,"[800220, 800201, 800129, 800079, 801644, 80155..."
3,556637365,2025-11-10 15:11:31,"[801920, 801915, 800748, 800640, 800625]"
4,556689563,2025-11-10 15:11:31,"[801846, 801835, 801548, 801542, 801000, 80099..."


In [129]:
# For testing purposes, we'll use this data as a second timepoint even though it's been more than a day.
# The encoding is set explicitly because JSON is UTF-8; the `with` block closes
# the file on exit, so no manual close() is needed.
with open("../data/test_JSON_data/bajs_12-11-2025_09-30-11.json", encoding="utf-8") as json_data:
    day_two = json.load(json_data)

In [130]:
# NOTE: this timestamp is synthetic. The snapshot held in day_two was loaded
# from a file named bajs_12-11-2025_09-30-11.json, but it is labelled
# 2025-11-10 16:30:00 here so it can act as a second timepoint for testing.
second_dt = datetime.strptime("2025/11/10 16:30:00",
    "%Y/%m/%d %H:%M:%S")

# Bikes recorded at each station in the second snapshot, stamped with second_dt.
time2 = get_bikes_in_stations(time = second_dt,
    loc_list = trim_json_to_locations(day_two))

# Preview the first rows, ordered by station uid.
time2.sort_values("uid").head()

Unnamed: 0,uid,time,bikes_at_station
0,556632908,2025-11-10 16:30:00,"[800479, 800463, 800282, 801088]"
1,556633495,2025-11-10 16:30:00,"[800480, 800185, 800052, 801489, 801321]"
2,556634009,2025-11-10 16:30:00,"[800274, 801882, 801360, 801296, 801022]"
3,556637365,2025-11-10 16:30:00,"[800344, 800033, 801920, 801915, 801881, 80164..."
4,556689563,2025-11-10 16:30:00,"[800485, 800329, 800233, 800230, 800223, 80019..."


In [131]:
# Stack both timepoints into one long frame. ignore_index gives every row a
# unique label; a plain concat would repeat each input frame's own labels,
# which makes label-aligned column assignment fragile.
more_times = pd.concat([time1, time2], ignore_index=True)

# for each uid-time, what is the closest observed time before the one in this row?
# shift(1) takes the previous row within each uid, so this relies on the rows
# being in chronological order (true here: time1 is stacked before time2).
more_times["lag1_time"] = more_times.groupby("uid")["time"].shift(1)

# Self-merge: attach to every row the bike list observed at its lag1_time.
previous_bikes = more_times.merge(
    more_times.drop(columns=["lag1_time"]),  # so we don't get this as an additional column
    how="left",
    left_on=["uid", "lag1_time"],
    right_on=["uid", "time"],
    suffixes=(None, "_lag1"),
).drop(columns=["time_lag1"])

previous_bikes

Unnamed: 0,uid,time,bikes_at_station,lag1_time,bikes_at_station_lag1
0,556632908,2025-11-10 15:11:31,"[800463, 800020, 801638, 801088, 801072]",NaT,
1,556633495,2025-11-10 15:11:31,"[800480, 800247, 800185, 800183, 800106, 80005...",NaT,
2,556634009,2025-11-10 15:11:31,"[800220, 800201, 800129, 800079, 801644, 80155...",NaT,
3,556637365,2025-11-10 15:11:31,"[801920, 801915, 800748, 800640, 800625]",NaT,
4,556689563,2025-11-10 15:11:31,"[801846, 801835, 801548, 801542, 801000, 80099...",NaT,
...,...,...,...,...,...
326,585913527,2025-11-10 16:30:00,"[800278, 800101, 801656, 801228, 801046, 80077...",2025-11-10 15:11:31,[]
327,585913605,2025-11-10 16:30:00,"[801717, 801191, 801157, 800603, 800601, 800517]",2025-11-10 15:11:31,[801963]
328,585913702,2025-11-10 16:30:00,"[800347, 800115, 801866, 801826, 801795, 80166...",2025-11-10 15:11:31,"[800293, 800219, 800191, 800068, 801907, 80175..."
329,586256300,2025-11-10 16:30:00,"[800144, 801889, 801867, 801457, 801066, 80095...",2025-11-10 15:11:31,"[800318, 800144, 801889, 801867, 801169, 80116..."


In [132]:
calculate_bike_changes(previous_bikes["bikes_at_station"], previous_bikes["bikes_at_station_lag1"])

{'changes': 0       NaN
 1       NaN
 2       NaN
 3       NaN
 4       NaN
        ... 
 326     8.0
 327     7.0
 328    31.0
 329     5.0
 330     5.0
 Length: 331, dtype: float64,
 'incoming': 0       NaN
 1       NaN
 2       NaN
 3       NaN
 4       NaN
        ... 
 326     0.0
 327     1.0
 328    15.0
 329     3.0
 330     0.0
 Length: 331, dtype: float64,
 'outgoing': 0       NaN
 1       NaN
 2       NaN
 3       NaN
 4       NaN
        ... 
 326     8.0
 327     6.0
 328    16.0
 329     2.0
 330     5.0
 Length: 331, dtype: float64}

## Load all data from folder and calculate changes

In [133]:
raw_files_directory = "../data/" + config["data"]["raw_json_folder"]

# os.listdir returns entries in arbitrary order and includes every file in
# the folder. Keep only the JSON snapshots and order them chronologically:
# later cells pair files with timestamps by position, and the change
# calculation presumably relies on chronological row order (the prototype
# above uses groupby().shift) — confirm against create_changes_columns.
# The dd-mm-YYYY names do not sort correctly as plain strings across month
# boundaries, so the sort key is the parsed timestamp.
raw_files = sorted(
    (name for name in os.listdir(raw_files_directory) if name.endswith(".json")),
    key=lambda name: datetime.strptime(name[5:-5], "%d-%m-%Y_%H-%M-%S"),
)
raw_files

['bajs_10-11-2025_17-04-53.json',
 'bajs_10-11-2025_18-24-00.json',
 'bajs_10-11-2025_19-21-13.json',
 'bajs_10-11-2025_20-15-53.json',
 'bajs_10-11-2025_21-02-43.json',
 'bajs_10-11-2025_22-14-02.json',
 'bajs_10-11-2025_23-01-43.json',
 'bajs_11-11-2025_00-15-08.json',
 'bajs_11-11-2025_01-13-15.json',
 'bajs_11-11-2025_02-05-07.json',
 'bajs_11-11-2025_03-17-33.json',
 'bajs_11-11-2025_04-00-29.json',
 'bajs_11-11-2025_05-04-41.json',
 'bajs_11-11-2025_06-27-27.json',
 'bajs_11-11-2025_07-19-42.json',
 'bajs_11-11-2025_08-17-33.json',
 'bajs_11-11-2025_09-06-29.json',
 'bajs_11-11-2025_10-13-20.json',
 'bajs_11-11-2025_11-06-31.json',
 'bajs_11-11-2025_12-24-02.json',
 'bajs_11-11-2025_13-00-30.json',
 'bajs_11-11-2025_14-08-36.json',
 'bajs_11-11-2025_15-00-39.json',
 'bajs_11-11-2025_16-29-45.json',
 'bajs_11-11-2025_17-04-53.json',
 'bajs_11-11-2025_18-05-53.json',
 'bajs_11-11-2025_19-03-15.json',
 'bajs_11-11-2025_20-05-27.json',
 'bajs_11-11-2025_21-09-35.json',
 'bajs_11-11-2

In [134]:
raw_files[0][5:-5]

'10-11-2025_17-04-53'

In [135]:
#TODO - extracting from first to last number instead of absolute position?

datetime.strptime(raw_files[0][5:-5], "%d-%m-%Y_%H-%M-%S")

datetime.datetime(2025, 11, 10, 17, 4, 53)

In [136]:
# Parse the timestamp embedded in each file name: drop the first 5 and last 5
# characters, then read the remainder as day-month-year_hour-minute-second.
scraped_datetimes = list(
    map(
        lambda filename: datetime.strptime(filename[5:-5], "%d-%m-%Y_%H-%M-%S"),
        raw_files,
    )
)
scraped_datetimes

[datetime.datetime(2025, 11, 10, 17, 4, 53),
 datetime.datetime(2025, 11, 10, 18, 24),
 datetime.datetime(2025, 11, 10, 19, 21, 13),
 datetime.datetime(2025, 11, 10, 20, 15, 53),
 datetime.datetime(2025, 11, 10, 21, 2, 43),
 datetime.datetime(2025, 11, 10, 22, 14, 2),
 datetime.datetime(2025, 11, 10, 23, 1, 43),
 datetime.datetime(2025, 11, 11, 0, 15, 8),
 datetime.datetime(2025, 11, 11, 1, 13, 15),
 datetime.datetime(2025, 11, 11, 2, 5, 7),
 datetime.datetime(2025, 11, 11, 3, 17, 33),
 datetime.datetime(2025, 11, 11, 4, 0, 29),
 datetime.datetime(2025, 11, 11, 5, 4, 41),
 datetime.datetime(2025, 11, 11, 6, 27, 27),
 datetime.datetime(2025, 11, 11, 7, 19, 42),
 datetime.datetime(2025, 11, 11, 8, 17, 33),
 datetime.datetime(2025, 11, 11, 9, 6, 29),
 datetime.datetime(2025, 11, 11, 10, 13, 20),
 datetime.datetime(2025, 11, 11, 11, 6, 31),
 datetime.datetime(2025, 11, 11, 12, 24, 2),
 datetime.datetime(2025, 11, 11, 13, 0, 30),
 datetime.datetime(2025, 11, 11, 14, 8, 36),
 datetime.dateti

In [137]:
# Load every raw snapshot and trim it, keeping the same order as raw_files
# so that position i lines up with scraped_datetimes[i].
data_timepoints = [
    trim_json_to_locations(load_json(f"{raw_files_directory}/{json_file}"))
    for json_file in raw_files
]

len(data_timepoints)

113

In [138]:
# Build one frame per scraped snapshot and concatenate once. Calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
assert len(scraped_datetimes) == len(data_timepoints), "expected one timestamp per snapshot"

timepoint_frames = [
    get_bikes_in_stations(time = scraped_time, loc_list = station_list)
    for scraped_time, station_list in zip(scraped_datetimes, data_timepoints)
]

# pd.concat raises on an empty list, so keep the original None sentinel when
# there were no snapshots to load.
bike_data = pd.concat(timepoint_frames) if timepoint_frames else None

bike_data

Unnamed: 0,uid,time,bikes_at_station
0,556632908,2025-11-10 17:04:53,"[800463, 800020, 801638, 801088, 800874]"
1,556633495,2025-11-10 17:04:53,"[800480, 800247, 800185, 800183, 800106, 80005..."
2,556634009,2025-11-10 17:04:53,"[800220, 800201, 800129, 800079, 801882, 80164..."
3,556637365,2025-11-10 17:04:53,"[800345, 800193, 801920, 801915, 801881, 80164..."
4,556689563,2025-11-10 17:04:53,"[801846, 801548, 801347, 801000, 800817, 80062..."
...,...,...,...
163,585913605,2025-11-15 09:14:36,"[800360, 801796, 801587, 801486, 801278, 80113..."
164,585913702,2025-11-15 09:14:36,"[800169, 801866, 800910]"
165,586256300,2025-11-15 09:14:36,"[800452, 800144, 801889, 801867, 801840, 80112..."
166,586273004,2025-11-15 09:14:36,"[800199, 800119, 800061, 800044, 800034, 80195..."


In [139]:
# Add the lag and change columns (lag1_time, bikes_at_station_lag1, changes,
# incoming, outgoing — see the output below).
# NOTE: this overwrites bike_data, so re-running this cell on its own feeds the
# already-augmented frame back into create_changes_columns; re-run the previous
# cell first.
bike_data = create_changes_columns(bike_data)
bike_data

Unnamed: 0,uid,time,bikes_at_station,lag1_time,bikes_at_station_lag1,changes,incoming,outgoing
0,556632908,2025-11-10 17:04:53,"[800463, 800020, 801638, 801088, 800874]",NaT,,,,
1,556633495,2025-11-10 17:04:53,"[800480, 800247, 800185, 800183, 800106, 80005...",NaT,,,,
2,556634009,2025-11-10 17:04:53,"[800220, 800201, 800129, 800079, 801882, 80164...",NaT,,,,
3,556637365,2025-11-10 17:04:53,"[800345, 800193, 801920, 801915, 801881, 80164...",NaT,,,,
4,556689563,2025-11-10 17:04:53,"[801846, 801548, 801347, 801000, 800817, 80062...",NaT,,,,
...,...,...,...,...,...,...,...,...
18825,585913605,2025-11-15 09:14:36,"[800360, 801796, 801587, 801486, 801278, 80113...",2025-11-15 08:03:09,"[800360, 801796, 801587, 801486, 801278, 80113...",0.0,0.0,0.0
18826,585913702,2025-11-15 09:14:36,"[800169, 801866, 800910]",2025-11-15 08:03:09,"[800169, 801866, 800910]",0.0,0.0,0.0
18827,586256300,2025-11-15 09:14:36,"[800452, 800144, 801889, 801867, 801840, 80112...",2025-11-15 08:03:09,"[800452, 800144, 801889, 801867, 801840, 80112...",0.0,0.0,0.0
18828,586273004,2025-11-15 09:14:36,"[800199, 800119, 800061, 800044, 800034, 80195...",2025-11-15 08:03:09,"[800199, 800119, 800061, 800044, 800034, 80195...",2.0,0.0,2.0


## Total changes on map

In [140]:
# For each station, sum up all changes through time
# (weighting by rack size is done in the following cell).
# The columns are selected explicitly: the first positional argument of
# GroupBy.sum is `numeric_only`, so `.sum(["changes"])` never restricted the
# sum to "changes" — the list was only read as a truthy flag and every
# numeric column was summed.
total_changes = (
    bike_data
    .groupby("uid")[["changes", "incoming", "outgoing"]]
    .sum()
    .rename(columns={"changes": "total_changes"})
)
total_changes

Unnamed: 0_level_0,total_changes,incoming,outgoing
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
556632908,32.0,19.0,13.0
556633495,36.0,18.0,18.0
556634009,214.0,100.0,114.0
556637365,165.0,85.0,80.0
556689563,528.0,270.0,258.0
...,...,...,...
585913605,95.0,52.0,43.0
585913702,126.0,62.0,64.0
586256300,15.0,8.0,7.0
586273004,95.0,54.0,41.0


In [141]:
# Attach the per-station totals to the station table (matched on uid) and
# divide the total by the station's rack count.
locations_changes = (
    locations
    .join(total_changes, on = "uid")
    .assign(weighted_changes = lambda joined: joined["total_changes"] / joined["no_racks"])
)
locations_changes

Unnamed: 0,uid,name,lat,lng,no_racks,total_changes,incoming,outgoing,weighted_changes
0,556632908,AUTOBUSNI TERMINAL SESVETE,45.825299,16.109747,8,32.0,19.0,13.0,4.00
1,556633495,MIHALJEVAC OKRETIŠTE,45.842797,15.975421,12,36.0,18.0,18.0,3.00
2,556634009,ROTOR REMETINEC,45.776795,15.953339,10,214.0,100.0,114.0,21.40
3,556637365,ADMIRAL HOTEL,45.795083,15.919185,20,165.0,85.0,80.0,8.25
4,556689563,ZAGREBAČKI VELESAJAM,45.777560,15.969703,20,528.0,270.0,258.0,26.40
...,...,...,...,...,...,...,...,...,...
160,585913527,GUPČEVA ZVIJEZDA,45.827987,15.979160,10,78.0,39.0,39.0,7.80
161,585913605,PETROVA UL. - UL. IVANA ZAJCA,45.817526,15.997421,10,95.0,52.0,43.0,9.50
162,585913702,KORANSKA UL.,45.802365,15.968013,10,126.0,62.0,64.0,12.60
163,586256300,GAJNICE UL. - GRINTAVEČKA UL.,45.816342,15.872686,10,15.0,8.0,7.0,1.50


In [142]:
# Fresh figure and base map for the changes view, created with the same
# settings as the station map earlier in the notebook. Rebinding m gives
# this view an empty map without the earlier markers.
fig = folium.Figure(width = 1100, height = 500)

m = folium.Map(
    location=(45.8109, 16.0097),
    min_zoom=11,
    tiles = "Esri.WorldTopoMap"
)

# Attach the map to the figure; add_child is the last expression, so its
# return value is what the cell displays.
fig.add_child(m)

In [143]:
# Draw the stations with the summed changes as the marker metric.
# NOTE(review): metric_size is "total_changes"; the "weighted_changes" column
# computed above is not used here — confirm which metric is intended.
m = populate_map_with_stations(m, 
                               locations_changes, 
                               metric_size = "total_changes", 
                               metric_tooltip_name = "Bike changes: ",
                               no_bins = 12)

In [144]:
m

In [145]:
# Scratch check of the membership logic on two small lists.
a = [1, 2, 3, 4, 5]
b = [3,4, 6, 7]

# outgoing - sum 
# True for every element of a that is absent from b.
[bike not in b for bike in a]


[True, True, False, False, True]

In [146]:
sum([bike not in b for bike in a])

3

In [147]:
#incoming - sum 
[bike not in a for bike in b]

[False, False, True, True]

In [148]:
locations_changes.loc[locations_changes["name"] == "FILOZOFSKI FAKULTET"]

Unnamed: 0,uid,name,lat,lng,no_racks,total_changes,incoming,outgoing,weighted_changes
58,579364165,FILOZOFSKI FAKULTET,45.796235,15.970371,20,598.0,290.0,308.0,29.9


In [149]:
# All observations for one station; uid 579364165 is the FILOZOFSKI FAKULTET
# row shown by the previous cell.
ffzg = bike_data.loc[bike_data["uid"] == 579364165]

ffzg

Unnamed: 0,uid,time,bikes_at_station,lag1_time,bikes_at_station_lag1,changes,incoming,outgoing
58,579364165,2025-11-10 17:04:53,"[800289, 800216, 800179, 800077, 800067, 80187...",NaT,,,,
223,579364165,2025-11-10 18:24:00,"[800289, 800064, 801879, 801577, 801568, 80142...",2025-11-10 17:04:53,"[800289, 800216, 800179, 800077, 800067, 80187...",15.0,2.0,13.0
388,579364165,2025-11-10 19:21:13,[801373],2025-11-10 18:24:00,"[800289, 800064, 801879, 801577, 801568, 80142...",11.0,1.0,10.0
553,579364165,2025-11-10 20:15:53,[],2025-11-10 19:21:13,[801373],1.0,0.0,1.0
718,579364165,2025-11-10 21:02:43,[],2025-11-10 20:15:53,[],0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
18049,579364165,2025-11-15 05:03:15,"[800387, 801772]",2025-11-15 04:10:38,[801772],1.0,1.0,0.0
18217,579364165,2025-11-15 06:06:04,"[800387, 801772]",2025-11-15 05:03:15,"[800387, 801772]",0.0,0.0,0.0
18385,579364165,2025-11-15 07:10:27,"[800387, 801772]",2025-11-15 06:06:04,"[800387, 801772]",0.0,0.0,0.0
18553,579364165,2025-11-15 08:03:09,[801772],2025-11-15 07:10:27,"[800387, 801772]",1.0,0.0,1.0


We can see that the number of racks is NOT necessarily the maximum number of bikes that can be at a station at the same time. Maybe bikes are also counted when they're parked in the near vicinity?  

Still, `no_racks` is useful as an approximation of how large/frequently used the station is planned to be.

In [150]:
# Number of bikes listed in the row labelled 58 (explicit label lookup via .loc).
len(ffzg.loc[58, "bikes_at_station"])

21

In [151]:
# Number of bikes listed in the row labelled 223 (explicit label lookup via .loc).
len(ffzg.loc[223, "bikes_at_station"])

10