In [1]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail with an explicit message when the key is missing: assigning None into
# os.environ would otherwise raise an opaque "TypeError: str expected, not NoneType".
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
if not HOPSWORKS_API_KEY:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; define it in the environment or in a .env file."
    )

# Re-export the key so later cells (presumably hopsworks.login()) can pick it up
# from the environment.
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [3]:
# Location this pipeline run is scoped to.
CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# Log in to Hopsworks and grab a handle on the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Wall-clock time of this run (naive datetime in the machine's local timezone).
today = datetime.datetime.now()

2025-01-23 07:53:12,618 INFO: Initializing external client


2025-01-23 07:53:12,621 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-01-23 07:53:14,811 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [4]:
# Handles to the two feature groups this notebook appends to (both at version 1).
bike_fg = fs.get_feature_group(name="bike_data", version=1)
weather_fg = fs.get_feature_group(name="weather_data", version=1)


## Clone and pull the repository with the bike data

In [5]:
# --- Bike-history repository settings ---------------------------------------

# Sub-directory (under data/stations/) and file name of the snapshot to read.
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

# Remote repository that is cloned, and the local directory it is cloned into.
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"

In [6]:
# Reuse the local checkout when there is one; otherwise clone it first.
if os.path.exists(CLONE_DIR):
    repo = git.Repo(CLONE_DIR)
else:
    print("Cloning repository...")
    repo = git.Repo.clone_from(REPO_URL, CLONE_DIR)

# The commit loop further down checks out historical commits, so HEAD may not
# be on main any more: force our way back before pulling the newest snapshots.
repo.git.checkout("main", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x1ae237a8040>,
 <git.remote.FetchInfo at 0x1ae24bc51c0>]

## Get the latest datetime present in the bike data

In [7]:
# Resume point for the backfill, formatted like "2025-01-06 15:06:11 UTC".
bike_df = bike_fg.read()

# Normalise to UTC before formatting: the string is parsed again later with a
# "%Z" directive and then treated as UTC, so a tz-naive column (empty "%Z") or
# a non-UTC zone abbreviation would either fail to parse or shift the cut-off.
latest_bike_timestamp = pd.to_datetime(bike_df["datetime"], utc=True).max()

# An empty feature group (or one holding only missing timestamps) yields NaT,
# which cannot be formatted; raise a readable error instead.
if pd.isna(latest_bike_timestamp):
    raise ValueError(
        "bike_data contains no timestamps; cannot determine where to resume the backfill."
    )

last_bike_datetime = latest_bike_timestamp.strftime("%Y-%m-%d %H:%M:%S %Z")

last_bike_datetime

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Reading data from Hopsworks, using Hopsworks Feature Query Service   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.01s) 


'2025-01-22 00:00:00 UTC'

## Loop through the commits and convert the bike data into a dataframe

In [8]:
# Maps station name -> list of readings; the lists are filled by the commit loop.
results = {}

# Seed the dict with every station listed in the currently checked-out snapshot.
# When the snapshot file is absent, `results` simply stays empty.
data_file = os.path.join(CLONE_DIR, "data", "stations", TARGET_CITY, FILE_NAME)
if os.path.exists(data_file):
    # Decode explicitly as UTF-8: open() otherwise falls back to the platform's
    # locale encoding, which can garble non-ASCII station names.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    results = {feature["properties"]["name"]: [] for feature in data["features"]}

In [9]:
# Walk the repository history from newest to oldest and, for at most one commit
# per clock hour, record every station's `available_bikes` together with the
# commit timestamp in `results`.

start_time = datetime.datetime.now()  # not read in the visible cells; kept in case a later cell uses it
last_day_and_hour = None

# Parse the resume point once instead of on every iteration. The string is
# interpreted as UTC regardless of what "%Z" matched.
resume_after = datetime.datetime.strptime(
    last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z"
).replace(tzinfo=datetime.timezone.utc)

# Midnight (UTC) of the current day. `today` is a naive local timestamp, so it
# is converted with astimezone(); merely stamping it with tzinfo=UTC would move
# the boundary by the machine's UTC offset.
today_midnight_utc = today.astimezone(datetime.timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0
)

# The snapshot lives at the same path in every commit.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)

for commit in repo.iter_commits():
    committed_at = commit.committed_datetime

    # Stop once we reach history that is already covered by the feature group.
    if committed_at <= resume_after:
        print("breaking at: ", committed_at)
        break

    # Skip commits from today.
    if committed_at > today_midnight_utc:
        # Not read in the visible cells; kept (original spelling) for compatibility.
        first_commited_datetime = repo.commit().committed_datetime
        continue

    # Keep only the first commit encountered for each clock hour.
    day_and_hour = committed_at.replace(minute=0, second=0, microsecond=0)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Materialise the snapshot of this commit in the working tree.
    print("Processing commit: ", commit.hexsha, " - ", committed_at)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    if not os.path.exists(data_file):
        continue

    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for feature in data["features"]:
        try:
            properties = feature["properties"]
            results[properties["name"]].append([properties["available_bikes"], committed_at])
        except KeyError:
            # Station not seeded in `results`, or the snapshot lacks a field.
            continue

print(results)


Processing commit:  bc16b7ebe9bfa22013847c398fcfc8eef4560548  -  2025-01-22 23:49:00+00:00


Processing commit:  091ecf86bdaf90b1ac5647280dc608a24fb2606e  -  2025-01-22 22:49:04+00:00


Processing commit:  636bb91d4fc47b512eba10c407374eeece653c74  -  2025-01-22 21:49:00+00:00


Processing commit:  1e2630cc26e31a1910540101038959fa2a285324  -  2025-01-22 20:49:02+00:00


Processing commit:  a1f319a85e99ac712d5a4a2a419a537f712bfbd9  -  2025-01-22 19:48:59+00:00


Processing commit:  c3e7f659835074f094211421244590c910bbd0a1  -  2025-01-22 18:49:41+00:00


Processing commit:  24f6f82610f096bc51512d338adfeb207e74ef86  -  2025-01-22 17:49:02+00:00


Processing commit:  8de992f2a7c4aac09a80f3fbdf14d3c292e4200b  -  2025-01-22 16:49:50+00:00


Processing commit:  3ed6dcb5bd83d8f08ed0926630584c2e3d15f308  -  2025-01-22 15:49:17+00:00


Processing commit:  2b20d9b7f5cb2bc8701d0b7e0ec2c3f3ccf313dc  -  2025-01-22 14:49:11+00:00


Processing commit:  effec3f54e1f5c9568e6a66fb7e1f0ac72395b9f  -  2025-01-22 13:49:10+00:00


Processing commit:  0bf7f50bc95c66526e037e4a4e272755ae79df11  -  2025-01-22 12:46:17+00:00


Processing commit:  291923a44391ef80ba2fbc5008896406c4f53702  -  2025-01-22 11:49:04+00:00


Processing commit:  ae0ff2c8abe24313f3763dd9e41218acd5f863d9  -  2025-01-22 10:49:09+00:00


Processing commit:  aee7795d091e6d72ec5899a7004b5a9599efd155  -  2025-01-22 09:49:10+00:00


Processing commit:  ee40ce07fa9d7ba2968c4ef7cdbcda449610b6f2  -  2025-01-22 08:49:41+00:00


Processing commit:  abd855011f6a6a424b4e2a785aae2e9e28f98816  -  2025-01-22 07:49:13+00:00


Processing commit:  de7ccb94186df899c7ceafea6fea38c7989abf6a  -  2025-01-22 06:49:42+00:00


Processing commit:  0320995c2df0ddb1f1264dcb2a0b659ada679b8a  -  2025-01-22 05:49:07+00:00


Processing commit:  6adb298506ec650c2c0a46a208dd1be137b374ce  -  2025-01-22 04:49:12+00:00


Processing commit:  1ef2f837503f66d3c14ed0d9f7599e075b03a30d  -  2025-01-22 03:49:12+00:00


Processing commit:  885d805f71d3d1c36d9413e37418c7dc8b642c49  -  2025-01-22 02:57:54+00:00


Processing commit:  20d606ab1218ee47e798a5feab149739ea6f6863  -  2025-01-22 01:33:37+00:00


Processing commit:  90b9872cd7c7e8f6acf667234d554af2e64a0f7f  -  2025-01-22 00:49:46+00:00


breaking at:  2025-01-21 23:49:01+00:00
{'CLARENDON ROW': [[0, datetime.datetime(2025, 1, 22, 23, 49, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24C12CB0>)], [0, datetime.datetime(2025, 1, 22, 22, 49, 4, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24DD5720>)], [0, datetime.datetime(2025, 1, 22, 21, 49, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24DD56F0>)], [1, datetime.datetime(2025, 1, 22, 20, 49, 2, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24DD6050>)], [3, datetime.datetime(2025, 1, 22, 19, 48, 59, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24DE62C0>)], [5, datetime.datetime(2025, 1, 22, 18, 49, 41, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24DEF880>)], [7, datetime.datetime(2025, 1, 22, 17, 49, 2, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24F2CE80>)], [8, datetime.datetime(2025, 1, 22, 16, 49, 50, tzinfo=<git.objects.util.tzoffset object at 0x000001AE24F32440>)], [9, datetime.datetime(2025, 1, 22, 15, 4

In [10]:
# Turn `results` into a dataframe holding the hourly readings of STATION.

# One frame per station that has readings; concatenated once below instead of
# growing a dataframe inside the loop.
station_frames = [
    pd.DataFrame(values, columns=["num_bikes_available", "datetime"]).assign(
        datetime=lambda frame: pd.to_datetime(frame["datetime"], utc=True),
        station=station.replace(" ", "_"),
    )
    for station, values in results.items()
    if len(values) > 0
]

# pd.concat raises on an empty list, so fall back to an empty frame.
df_bike_today = pd.concat(station_frames) if station_frames else pd.DataFrame()

# If empty, do nothing.
if not df_bike_today.empty:
    df_bike_today = df_bike_today.dropna()
    df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")

    # Keep only the target station; .copy() so the assignment below writes to an
    # independent frame rather than a slice of the concatenated one.
    df_bike_today = df_bike_today[df_bike_today["station"] == STATION.replace(" ", "_")].copy()

    # Label each reading with the start of the following hour. The lowercase "h"
    # alias is used because the uppercase "H" alias is deprecated in recent pandas.
    df_bike_today["datetime"] = df_bike_today["datetime"].dt.floor("h") + datetime.timedelta(hours=1)
df_bike_today


Unnamed: 0,num_bikes_available,datetime,station
0,40.0,2025-01-23 00:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
1,40.0,2025-01-22 23:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
2,39.0,2025-01-22 22:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
3,39.0,2025-01-22 21:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
4,40.0,2025-01-22 20:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
5,40.0,2025-01-22 19:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
6,40.0,2025-01-22 18:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
7,24.0,2025-01-22 17:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
8,5.0,2025-01-22 16:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
9,5.0,2025-01-22 15:00:00+00:00,HEUSTON_BRIDGE_(NORTH)


## Fetch the weather data for the same time period

In [11]:
# Hourly forecast for CITY: keep a single `datetime` column, drop the helper
# date columns, and discard rows with missing values.
forecast_df = (
    utils.get_hourly_weather_forecast(CITY)
    .rename(columns={"date_x": "datetime"})
    .drop(columns=["date_y", "date_only"])
    .dropna()
)

print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}


Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-23 00:00:00+00:00,5.25,2.283766,0.0,0.0,11.720751,30442.521484,4.2,dublin
1,2025-01-23 01:00:00+00:00,4.95,1.879474,0.0,0.0,12.218805,30442.521484,4.2,dublin
2,2025-01-23 02:00:00+00:00,4.65,1.504234,0.0,0.0,12.313894,30442.521484,4.2,dublin
3,2025-01-23 03:00:00+00:00,4.55,1.268456,0.0,0.0,12.904882,30442.521484,4.2,dublin
4,2025-01-23 04:00:00+00:00,4.85,1.441718,0.0,0.0,13.570615,30442.521484,4.2,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-02-01 19:00:00+00:00,7.05,3.535300,0.0,0.0,14.843180,32284.416016,5.4,dublin
236,2025-02-01 20:00:00+00:00,6.35,2.731808,0.0,0.0,14.618837,32284.416016,5.4,dublin
237,2025-02-01 21:00:00+00:00,5.70,1.981850,0.0,0.0,14.471821,32284.416016,5.4,dublin
238,2025-02-01 22:00:00+00:00,5.00,1.166210,0.0,0.0,14.404499,32284.416016,5.4,dublin


## Insert the bike and weather data into Hopsworks

In [12]:
# Write the new bike readings to the feature group, unless there are none.
if not df_bike_today.empty:
    bike_fg.insert(df_bike_today)
else:
    print("No bike data available for today")

Uploading Dataframe: 0.00% |                                      | Rows 0/24 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 58.33% |██████████████████▋             | Rows 14/24 | Elapsed Time: 00:01 | Remaining Time: 00:00

Uploading Dataframe: 100.00% |███████████████████████████████| Rows 24/24 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: bike_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [13]:
# Write the forecast to the feature group and block until the materialization
# job has finished, unless there is nothing to write.
if not forecast_df.empty:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})
else:
    print("No weather forecast available for today")


Uploading Dataframe: 0.00% |                                     | Rows 0/240 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 26.25% |████████▏                      | Rows 63/240 | Elapsed Time: 00:01 | Remaining Time: 00:02

Uploading Dataframe: 100.00% |█████████████████████████████| Rows 240/240 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: weather_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions


2025-01-23 07:54:22,284 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED


2025-01-23 07:54:25,463 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


2025-01-23 07:56:23,386 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED


2025-01-23 07:56:26,568 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED


2025-01-23 07:56:26,727 INFO: Waiting for log aggregation to finish.


2025-01-23 07:56:43,048 INFO: Execution finished successfully.
