In [1]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
load_dotenv()
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [3]:
project = hopsworks.login()
fs = project.get_feature_store() 
# secrets = utils.secrets_api(project.name)

CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# latitude =
# longitude =

today = datetime.datetime.now()

2025-01-26 10:55:45,938 INFO: Initializing external client


2025-01-26 10:55:45,956 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-01-26 10:55:47,882 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [4]:
# Retrieve feature groups
bike_fg = fs.get_feature_group(
    name='bike_data',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather_data',
    version=1,
)


## Clone and pull the repository with the bike data

In [5]:
# Configuration
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

In [6]:
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# go to main and pull the latest changes
repo.git.checkout("main", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x253c2744720>,
 <git.remote.FetchInfo at 0x253c2745a80>]

## Get the latest datetime present in the bike data

In [7]:
# last_bike_datetime = "2025-01-06 15:06:11 UTC"

bike_df = bike_fg.read()

last_bike_datetime = bike_df["datetime"].max()
last_bike_datetime = last_bike_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

last_bike_datetime

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.23s) 


'2025-01-25 00:00:00 UTC'

## Loop through the commits and convert the bike data into a dataframe

In [8]:
results = {}

# Populate the results dict with the stations
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    with open(data_file, "r") as f:
        data = f.read()
        data = json.loads(data)
        for feature in data["features"]:
            name = feature["properties"]["name"]
            results[name] = []

In [9]:
start_time = datetime.datetime.now()
last_day_and_hour = None

for commit in repo.iter_commits():
    # Stop when we reach the earliest bike date
    if commit.committed_datetime <= datetime.datetime.strptime(last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z").replace(tzinfo=datetime.timezone.utc):
        print("breaking at: ", commit.committed_datetime)
        break

    # Skip commits from today
    if commit.committed_datetime > today.replace(tzinfo=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0):
        first_commited_datetime = repo.commit().committed_datetime
        continue

    day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
    # print("day_and_hour: ", day_and_hour, " - last_day_and_hour: ", last_day_and_hour)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Get the data for the commit
    print("Processing commit: ", commit.hexsha, " - ", commit.committed_datetime)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
    if os.path.exists(data_file):
        with open(data_file, "r") as f:
            data = f.read()
            data = json.loads(data)
            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    continue

print(results)


Processing commit: 

 c5803482f3f62cc0b0272be82c6c0aabe6e8756f  -  2025-01-25 23:48:56+00:00


Processing commit:  c4d05efd93a73e7deeb765949e1fb9ab9a675614  -  2025-01-25 22:49:00+00:00


Processing commit:  563c7725b0a8b929eab6cf55b43f68f2acff6492  -  2025-01-25 21:48:58+00:00


Processing commit:  ab288b0513691eaf607ef0c3ea468f82f4c0d6ef  -  2025-01-25 20:48:52+00:00


Processing commit:  19cb3e1fc774834c243640c4a50101f5b1b25649  -  2025-01-25 19:49:08+00:00


Processing commit:  e4b18c0bd20c6f353534bd0b251746d943017733  -  2025-01-25 18:49:05+00:00


Processing commit:  8d2862e23fdd296580193bd56b60d82d4f1fef61  -  2025-01-25 17:49:04+00:00


Processing commit:  60652b83b9c6a1ac0d7cc1fded7f63cb49e402f3  -  2025-01-25 16:48:58+00:00


Processing commit:  295f8d8681fa20993bbe06c2824e264d2ae827e0  -  2025-01-25 15:49:19+00:00


Processing commit:  5fced48ad8274a259db2e5616984384ac08eafee  -  2025-01-25 14:49:00+00:00


Processing commit:  27c32c710f64cee43429cc562014e80e9845563c  -  2025-01-25 13:49:25+00:00


Processing commit:  8832cbb50315c96ac9592c2396b278e8f6a812bf  -  2025-01-25 12:55:45+00:00


Processing commit:  28221fb98d08d55cec5db6b85156e9439125941e  -  2025-01-25 11:49:06+00:00


Processing commit:  3ad9fdbc2dbf86b74cea8b0275b4264456bc3388  -  2025-01-25 10:49:06+00:00


Processing commit:  ab524996a39bb0cd60abfc800b58e06464d399c5  -  2025-01-25 09:49:06+00:00


Processing commit:  afd624c7cdc4e9bbea598c6c1e796c64213ac709  -  2025-01-25 08:49:04+00:00


Processing commit:  577e26ee4827dc1ebe63c48e568dd71f55ac30a5  -  2025-01-25 07:49:14+00:00


Processing commit:  36de12acb1cf2d642e0855e1bcae02ca79de5898  -  2025-01-25 06:49:11+00:00


Processing commit:  f12c4799fbc9d7f8ebdde73cf7f356485b4edb2c  -  2025-01-25 05:49:13+00:00


Processing commit:  92654898fafcb85f5902616929da120917ea531b  -  2025-01-25 04:49:06+00:00


Processing commit:  800d4021bba4df58a4ccc63c795fd428ec858335  -  2025-01-25 03:49:02+00:00


Processing commit:  fd11bae3a09ef53d2ef42daf5121fad14e67b094  -  2025-01-25 02:54:23+00:00


Processing commit:  78f5ac8b3aa61ce79bf29d4384f296a877a540a5  -  2025-01-25 01:32:45+00:00


Processing commit:  182e03f3f230c48b3fa6387c11127e9cf3f39d2c  -  2025-01-25 00:50:02+00:00


breaking at:  2025-01-24 23:49:14+00:00
{'CLARENDON ROW': [[25, datetime.datetime(2025, 1, 25, 23, 48, 56, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28D6110>)], [26, datetime.datetime(2025, 1, 25, 22, 49, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28D61D0>)], [27, datetime.datetime(2025, 1, 25, 21, 48, 58, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28D6140>)], [25, datetime.datetime(2025, 1, 25, 20, 48, 52, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28D6200>)], [24, datetime.datetime(2025, 1, 25, 19, 49, 8, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28E6680>)], [26, datetime.datetime(2025, 1, 25, 18, 49, 5, tzinfo=<git.objects.util.tzoffset object at 0x00000253C28EFC40>)], [29, datetime.datetime(2025, 1, 25, 17, 49, 4, tzinfo=<git.objects.util.tzoffset object at 0x00000253C2A3BEE0>)], [26, datetime.datetime(2025, 1, 25, 16, 48, 58, tzinfo=<git.objects.util.tzoffset object at 0x00000253C2A32800>)], [24, datetime.datetime(2025,

In [10]:
# turn results into a dataframe
df_bike_today = pd.DataFrame()

for station, values in results.items():
    if len(values) > 0:
        df = pd.DataFrame(values, columns=["num_bikes_available", "datetime"])
        df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
        df["station"] = station.replace(" ", "_")
        df_bike_today = pd.concat([df_bike_today, df])

# if empty, do nothing
if not df_bike_today.empty:
    df_bike_today.dropna(inplace=True)
    df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")
    df_bike_today = df_bike_today[df_bike_today['station'].isin([STATION.replace(" ", "_")])]
    df_bike_today['datetime'] = pd.to_datetime(df_bike_today['datetime'], utc=True).dt.floor('H') + datetime.timedelta(hours=1)
df_bike_today


Unnamed: 0,num_bikes_available,datetime,station
0,37.0,2025-01-26 00:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
1,38.0,2025-01-25 23:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
2,38.0,2025-01-25 22:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
3,39.0,2025-01-25 21:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
4,40.0,2025-01-25 20:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
5,40.0,2025-01-25 19:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
6,40.0,2025-01-25 18:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
7,40.0,2025-01-25 17:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
8,40.0,2025-01-25 16:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
9,38.0,2025-01-25 15:00:00+00:00,HEUSTON_BRIDGE_(NORTH)


## Fetch the weather data for the same time period

In [11]:
forecast_df = utils.get_hourly_weather_forecast(CITY)
forecast_df = forecast_df.rename(columns={'date_x': 'datetime'})
forecast_df = forecast_df.drop(columns=['date_y', 'date_only'])
forecast_df.dropna(inplace=True)

print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-26 00:00:00+00:00,1.50,-3.151039,0.0,0.0,17.848160,31026.716797,8.999999,dublin
1,2025-01-26 01:00:00+00:00,0.80,-3.497799,0.0,0.0,14.529915,31026.716797,8.999999,dublin
2,2025-01-26 02:00:00+00:00,-0.00,-3.801593,0.0,0.0,10.182337,31026.716797,8.999999,dublin
3,2025-01-26 03:00:00+00:00,-0.35,-3.654368,0.0,0.0,6.297428,31026.716797,8.999999,dublin
4,2025-01-26 04:00:00+00:00,0.05,-3.135595,0.0,0.0,5.860375,31026.716797,8.999999,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-02-04 19:00:00+00:00,6.90,4.036785,0.2,0.0,12.069400,32959.515625,2.800000,dublin
236,2025-02-04 20:00:00+00:00,6.65,3.882977,0.2,0.0,11.275530,32959.515625,2.800000,dublin
237,2025-02-04 21:00:00+00:00,6.40,3.793025,0.2,0.0,10.041354,32959.515625,2.800000,dublin
238,2025-02-04 22:00:00+00:00,6.10,3.649094,0.2,0.0,8.759178,32959.515625,2.800000,dublin


## Insert the bike and weather data into Hopsworks

In [12]:
if df_bike_today.empty:
    print("No bike data available for today")
else:
    bike_fg.insert(df_bike_today)

Uploading Dataframe: 0.00% |                                      | Rows 0/24 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 20.83% |██████▉                          | Rows 5/24 | Elapsed Time: 00:01 | Remaining Time: 00:03

Uploading Dataframe: 100.00% |███████████████████████████████| Rows 24/24 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: bike_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [13]:
if forecast_df.empty:
    print("No weather forecast available for today")
else:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})


Uploading Dataframe: 0.00% |                                     | Rows 0/240 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 100.00% |█████████████████████████████| Rows 240/240 | Elapsed Time: 00:00 | Remaining Time: 00:00




Launching job: weather_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions


2025-01-26 10:59:12,141 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED


2025-01-26 10:59:15,347 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


2025-01-26 11:00:57,820 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED


2025-01-26 11:00:57,990 INFO: Waiting for log aggregation to finish.


2025-01-26 11:01:23,408 INFO: Execution finished successfully.
