In [1]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
load_dotenv()
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [3]:
project = hopsworks.login()
fs = project.get_feature_store() 
# secrets = utils.secrets_api(project.name)

CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# latitude =
# longitude =

today = datetime.datetime.now()

2025-01-27 08:38:24,011 INFO: Initializing external client


2025-01-27 08:38:24,014 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-01-27 08:38:26,916 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [4]:
# Retrieve feature groups
bike_fg = fs.get_feature_group(
    name='bike_data',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather_data',
    version=1,
)


## Clone and pull the repository with the bike data

In [5]:
# Configuration
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

In [6]:
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# go to main and pull the latest changes
repo.git.checkout("main", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x21824fa2160>,
 <git.remote.FetchInfo at 0x2182501ab60>]

## Get the latest datetime present in the bike data

In [7]:
# last_bike_datetime = "2025-01-06 15:06:11 UTC"

bike_df = bike_fg.read()

last_bike_datetime = bike_df["datetime"].max()
last_bike_datetime = last_bike_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

last_bike_datetime

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Reading data from Hopsworks, using Hopsworks Feature Query Service   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.59s) 


'2025-01-26 00:00:00 UTC'

## Loop through the commits and convert the bike data into a dataframe

In [8]:
results = {}

# Populate the results dict with the stations
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    with open(data_file, "r") as f:
        data = f.read()
        data = json.loads(data)
        for feature in data["features"]:
            name = feature["properties"]["name"]
            results[name] = []

In [9]:
start_time = datetime.datetime.now()
last_day_and_hour = None

for commit in repo.iter_commits():
    # Stop when we reach the earliest bike date
    if commit.committed_datetime <= datetime.datetime.strptime(last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z").replace(tzinfo=datetime.timezone.utc):
        print("breaking at: ", commit.committed_datetime)
        break

    # Skip commits from today
    if commit.committed_datetime > today.replace(tzinfo=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0):
        first_commited_datetime = repo.commit().committed_datetime
        continue

    day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
    # print("day_and_hour: ", day_and_hour, " - last_day_and_hour: ", last_day_and_hour)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Get the data for the commit
    print("Processing commit: ", commit.hexsha, " - ", commit.committed_datetime)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
    if os.path.exists(data_file):
        with open(data_file, "r") as f:
            data = f.read()
            data = json.loads(data)
            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    continue

print(results)


Processing commit:  426ac096ea5398fc8121ed1d3be29e6e30a69eb0  -  2025-01-26 23:49:04+00:00


Processing commit:  78f09d4b67c3d5683eb045cca7d1ce1643c6c3a8  -  2025-01-26 22:48:54+00:00


Processing commit:  64b39940741f23a78b02e7cc3574ccb63925b7e3  -  2025-01-26 21:49:04+00:00


Processing commit:  caee5ef30a38e14ef7230eae6bfb79f06bb93476  -  2025-01-26 20:48:56+00:00


Processing commit:  0a304717e6a9c670a7b0266094f7de366983c180  -  2025-01-26 19:48:56+00:00


Processing commit:  f31a74965222b125ede2ee0b0d35d4faa3c79835  -  2025-01-26 18:49:15+00:00


Processing commit:  8970fcf416ef41db2503803d02960e73abd4a5ac  -  2025-01-26 17:49:01+00:00


Processing commit:  2b22dcd95b757deeaf13bce3a03b49afe10d992a  -  2025-01-26 16:49:11+00:00


Processing commit:  70708e760cc1e8042910f92c50bebe0578f7c0d2  -  2025-01-26 15:48:59+00:00


Processing commit:  742527af7b710f9de9753f98df253f30553ba20b  -  2025-01-26 14:49:02+00:00


Processing commit:  a3cd66ba87434ebedbe0c0d64bb09646b319b663  -  2025-01-26 13:49:06+00:00


Processing commit:  253fac3e2943c62a183c7f96e0cd44e10eba1a18  -  2025-01-26 12:55:13+00:00


Processing commit:  76f05eb2cfd3595d9369be447a1e569cc7ae3218  -  2025-01-26 11:49:08+00:00


Processing commit:  36d990745dc0fb723b506837faec5190db7ec35e  -  2025-01-26 10:48:57+00:00


Processing commit:  550d29b9723c970d1eb3a1456f3ab3a2875811ce  -  2025-01-26 09:49:09+00:00


Processing commit:  6e18e5da46d4462fd15c431faf2ed8ce0511e478  -  2025-01-26 08:49:09+00:00


Processing commit:  d6541c941c029c2d8975371d05d271090413c9f2  -  2025-01-26 07:49:03+00:00


Processing commit:  c9ba542eba04b1d6b7b24b991b24c606df7d6aea  -  2025-01-26 06:49:04+00:00


Processing commit:  bf4a35fea8ba506740646d2c2d06dc7b5dc62da3  -  2025-01-26 05:49:09+00:00


Processing commit:  129fd70954e4abd5dc1f65a31afff71088b2fd3a  -  2025-01-26 04:49:10+00:00


Processing commit:  405124f7b617dbeeefe20ad80667df961a136ec0  -  2025-01-26 03:49:07+00:00


Processing commit:  88b05df2f27a1ffc7eb442851bccb973e0574394  -  2025-01-26 02:57:27+00:00


Processing commit:  62e187751466b0fa9b44c00e0e877b213d0411e5  -  2025-01-26 01:33:00+00:00


Processing commit:  69aa46a63157cd842284d832a97bb0472588d307  -  2025-01-26 00:49:50+00:00


breaking at:  2025-01-25 23:48:56+00:00
{'CLARENDON ROW': [[22, datetime.datetime(2025, 1, 26, 23, 49, 4, tzinfo=<git.objects.util.tzoffset object at 0x00000218250C8580>)], [22, datetime.datetime(2025, 1, 26, 22, 48, 54, tzinfo=<git.objects.util.tzoffset object at 0x00000218250CA2F0>)], [23, datetime.datetime(2025, 1, 26, 21, 49, 4, tzinfo=<git.objects.util.tzoffset object at 0x0000021824ECDE10>)], [24, datetime.datetime(2025, 1, 26, 20, 48, 56, tzinfo=<git.objects.util.tzoffset object at 0x00000218250C9F90>)], [26, datetime.datetime(2025, 1, 26, 19, 48, 56, tzinfo=<git.objects.util.tzoffset object at 0x00000218250E6830>)], [28, datetime.datetime(2025, 1, 26, 18, 49, 15, tzinfo=<git.objects.util.tzoffset object at 0x00000218250C85B0>)], [31, datetime.datetime(2025, 1, 26, 17, 49, 1, tzinfo=<git.objects.util.tzoffset object at 0x0000021825237F70>)], [30, datetime.datetime(2025, 1, 26, 16, 49, 11, tzinfo=<git.objects.util.tzoffset object at 0x0000021825237FA0>)], [31, datetime.datetime(2

In [10]:
# turn results into a dataframe
df_bike_today = pd.DataFrame()

for station, values in results.items():
    if len(values) > 0:
        df = pd.DataFrame(values, columns=["num_bikes_available", "datetime"])
        df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
        df["station"] = station.replace(" ", "_")
        df_bike_today = pd.concat([df_bike_today, df])

# if empty, do nothing
if not df_bike_today.empty:
    df_bike_today.dropna(inplace=True)
    df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")
    df_bike_today = df_bike_today[df_bike_today['station'].isin([STATION.replace(" ", "_")])]
    df_bike_today['datetime'] = pd.to_datetime(df_bike_today['datetime'], utc=True).dt.floor('H') + datetime.timedelta(hours=1)
df_bike_today


Unnamed: 0,num_bikes_available,datetime,station
0,39.0,2025-01-27 00:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
1,39.0,2025-01-26 23:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
2,39.0,2025-01-26 22:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
3,40.0,2025-01-26 21:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
4,40.0,2025-01-26 20:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
5,40.0,2025-01-26 19:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
6,40.0,2025-01-26 18:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
7,40.0,2025-01-26 17:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
8,40.0,2025-01-26 16:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
9,39.0,2025-01-26 15:00:00+00:00,HEUSTON_BRIDGE_(NORTH)


## Fetch the weather data for the same time period

In [11]:
forecast_df = utils.get_hourly_weather_forecast(CITY)
forecast_df = forecast_df.rename(columns={'date_x': 'datetime'})
forecast_df = forecast_df.drop(columns=['date_y', 'date_only'])
forecast_df.dropna(inplace=True)

print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-27 00:00:00+00:00,5.20,1.812418,0.0,0.0,12.641076,31225.949219,6.35,dublin
1,2025-01-27 01:00:00+00:00,5.00,1.577863,0.0,0.0,12.413476,31225.949219,6.35,dublin
2,2025-01-27 02:00:00+00:00,4.90,1.294604,0.0,0.0,12.924953,31225.949219,6.35,dublin
3,2025-01-27 03:00:00+00:00,4.75,0.904820,0.0,0.0,13.979871,31225.949219,6.35,dublin
4,2025-01-27 04:00:00+00:00,4.30,0.051855,0.4,0.0,15.827721,31225.949219,6.35,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-02-05 19:00:00+00:00,5.40,0.800687,0.0,0.0,18.430452,33189.449219,0.00,dublin
236,2025-02-05 20:00:00+00:00,5.05,0.532287,0.0,0.0,17.902534,33189.449219,0.00,dublin
237,2025-02-05 21:00:00+00:00,4.70,0.205828,0.0,0.0,17.760811,33189.449219,0.00,dublin
238,2025-02-05 22:00:00+00:00,4.35,-0.078312,0.0,0.0,17.317459,33189.449219,0.00,dublin


## Insert the bike and weather data into Hopsworks

In [12]:
if df_bike_today.empty:
    print("No bike data available for today")
else:
    bike_fg.insert(df_bike_today)

Uploading Dataframe: 0.00% |                                      | Rows 0/24 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 87.50% |████████████████████████████    | Rows 21/24 | Elapsed Time: 00:01 | Remaining Time: 00:00

Uploading Dataframe: 100.00% |███████████████████████████████| Rows 24/24 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: bike_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [13]:
if forecast_df.empty:
    print("No weather forecast available for today")
else:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})


Uploading Dataframe: 0.00% |                                     | Rows 0/240 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 0.42% |▏                                | Rows 1/240 | Elapsed Time: 00:01 | Remaining Time: 04:42

Uploading Dataframe: 100.00% |█████████████████████████████| Rows 240/240 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: weather_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions


2025-01-27 08:39:47,215 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED


2025-01-27 08:39:50,411 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED


2025-01-27 08:39:53,615 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


2025-01-27 08:41:49,957 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED


2025-01-27 08:41:50,121 INFO: Waiting for log aggregation to finish.


2025-01-27 08:42:12,266 INFO: Execution finished successfully.
