In [1]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
load_dotenv()
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [3]:
project = hopsworks.login()
fs = project.get_feature_store() 
# secrets = utils.secrets_api(project.name)

CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# latitude =
# longitude =

today = datetime.datetime.now()

2025-01-09 17:33:35,571 INFO: Initializing external client


2025-01-09 17:33:35,574 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-01-09 17:33:37,677 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [4]:
# Retrieve feature groups
bike_fg = fs.get_feature_group(
    name='bike_data',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather_data',
    version=1,
)


## Clone and pull the repository with the bike data

In [5]:
# Configuration
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

In [6]:
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# go to main and pull the latest changes
repo.git.checkout("main", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x1a2b16cad40>,
 <git.remote.FetchInfo at 0x1a2b16c9d50>]

## Get the latest datetime present in the bike data

In [7]:
# last_bike_datetime = "2025-01-06 15:06:11 UTC"

bike_df = bike_fg.read()

last_bike_datetime = bike_df["datetime"].max()
last_bike_datetime = last_bike_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

last_bike_datetime

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Reading data from Hopsworks, using Hopsworks Feature Query Service   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.85s) 


'2025-01-08 15:00:00 UTC'

## Loop through the commits and convert the bike data into a dataframe

In [8]:
results = {}

# Populate the results dict with the stations
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    with open(data_file, "r") as f:
        data = f.read()
        data = json.loads(data)
        for feature in data["features"]:
            name = feature["properties"]["name"]
            results[name] = []

In [9]:
start_time = datetime.datetime.now()
last_day_and_hour = None

for commit in repo.iter_commits():
    # Stop when we reach the earliest bike date
    if commit.committed_datetime <= datetime.datetime.strptime(last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z").replace(tzinfo=datetime.timezone.utc):
        print("breaking at: ", commit.committed_datetime)
        break

    # Skip commits from today
    if commit.committed_datetime > today.replace(tzinfo=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0):
        first_commited_datetime = repo.commit().committed_datetime
        continue

    day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
    # print("day_and_hour: ", day_and_hour, " - last_day_and_hour: ", last_day_and_hour)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Get the data for the commit
    print("Processing commit: ", commit.hexsha, " - ", commit.committed_datetime)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
    if os.path.exists(data_file):
        with open(data_file, "r") as f:
            data = f.read()
            data = json.loads(data)
            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    continue

print(results)


Processing commit:  21db2546ac018ac31d456d3a5fa48c52ff46e0d1  -  2025-01-08 23:49:15+00:00


Processing commit:  91b0284e997a45bd28b2c3b6e7966e1f866481ac  -  2025-01-08 22:48:55+00:00


Processing commit:  ffd2ce97c17e53c6b69f86548fb4b2592e3f5e1f  -  2025-01-08 21:49:15+00:00


Processing commit:  10f0bea3d5d9ff585e4c5786a2fcda5a7e4ede7d  -  2025-01-08 20:49:13+00:00


Processing commit:  1f0175f846ee69fa126ad23505d65797c1dc9cc5  -  2025-01-08 19:49:03+00:00


Processing commit:  68d019e4ae8e061f2fb118a0947baf13452dd65c  -  2025-01-08 18:49:37+00:00


Processing commit:  7e898cc4d406b4f640ec7afbe1508793ebd64180  -  2025-01-08 17:49:05+00:00


Processing commit:  4521a2e53821bbe732dd479aa6dfcaa71924a717  -  2025-01-08 16:50:13+00:00


Processing commit:  9a8bab64b38c97086d5c65b8b8ef70f26df7fb99  -  2025-01-08 15:49:13+00:00


breaking at:  2025-01-08 14:49:05+00:00
{'CLARENDON ROW': [[0, datetime.datetime(2025, 1, 8, 23, 49, 15, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B17841F0>)], [0, datetime.datetime(2025, 1, 8, 22, 48, 55, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B1784250>)], [0, datetime.datetime(2025, 1, 8, 21, 49, 15, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B1784070>)], [0, datetime.datetime(2025, 1, 8, 20, 49, 13, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B1785D50>)], [0, datetime.datetime(2025, 1, 8, 19, 49, 3, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B179A140>)], [0, datetime.datetime(2025, 1, 8, 18, 49, 37, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B179F700>)], [0, datetime.datetime(2025, 1, 8, 17, 49, 5, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B18DCD00>)], [0, datetime.datetime(2025, 1, 8, 16, 50, 13, tzinfo=<git.objects.util.tzoffset object at 0x000001A2B18E62C0>)], [5, datetime.datetime(2025, 1, 8, 15, 4

In [10]:
# turn results into a dataframe
df_bike_today = pd.DataFrame()

for station, values in results.items():
    if len(values) > 0:
        df = pd.DataFrame(values, columns=["num_bikes_available", "datetime"])
        df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
        df["station"] = station.replace(" ", "_")
        df_bike_today = pd.concat([df_bike_today, df])

# if empty, do nothing
if not df_bike_today.empty:
    df_bike_today.dropna(inplace=True)
    df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")
    df_bike_today = df_bike_today[df_bike_today['station'].isin([STATION.replace(" ", "_")])]
    df_bike_today


## Fetch the weather data for the same time period

In [11]:
forecast_df = utils.get_hourly_weather_forecast(CITY)
forecast_df = forecast_df.rename(columns={'date_x': 'datetime'})
forecast_df = forecast_df.drop(columns=['date_y', 'date_only'])
forecast_df.dropna(inplace=True)

print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-09 00:00:00+00:00,-2.30,-6.498223,0.0,0.0,10.966713,28145.916016,0.0,dublin
1,2025-01-09 01:00:00+00:00,-2.50,-6.804034,0.0,0.0,11.525623,28145.916016,0.0,dublin
2,2025-01-09 02:00:00+00:00,-2.75,-7.173875,0.0,0.0,12.261158,28145.916016,0.0,dublin
3,2025-01-09 03:00:00+00:00,-2.85,-7.443090,0.0,0.0,13.339445,28145.916016,0.0,dublin
4,2025-01-09 04:00:00+00:00,-2.75,-7.531152,0.0,0.0,14.830076,28145.916016,0.0,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-01-18 19:00:00+00:00,8.75,5.630491,0.0,0.0,15.379206,29524.685547,0.0,dublin
236,2025-01-18 20:00:00+00:00,8.30,5.193879,0.0,0.0,14.332340,29524.685547,0.0,dublin
237,2025-01-18 21:00:00+00:00,7.95,4.828638,0.0,0.0,13.661038,29524.685547,0.0,dublin
238,2025-01-18 22:00:00+00:00,7.70,4.545562,0.0,0.0,13.276144,29524.685547,0.0,dublin


## Insert the bike and weather data into Hopsworks

In [12]:
if df_bike_today.empty:
    print("No bike data available for today")
else:
    bike_fg.insert(df_bike_today)

Uploading Dataframe: 0.00% |                                       | Rows 0/9 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 44.44% |███████████████                   | Rows 4/9 | Elapsed Time: 00:01 | Remaining Time: 00:01

Uploading Dataframe: 100.00% |█████████████████████████████████| Rows 9/9 | Elapsed Time: 00:01 | Remaining Time: 00:00




Launching job: bike_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [13]:
if forecast_df.empty:
    print("No weather forecast available for today")
else:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})


Uploading Dataframe: 0.00% |                                     | Rows 0/240 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 0.42% |▏                                | Rows 1/240 | Elapsed Time: 00:01 | Remaining Time: 04:53

Uploading Dataframe: 100.00% |█████████████████████████████| Rows 240/240 | Elapsed Time: 00:02 | Remaining Time: 00:00




Launching job: weather_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions


2025-01-09 17:34:59,139 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED


2025-01-09 17:35:02,317 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


2025-01-09 17:36:35,224 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED


2025-01-09 17:36:38,410 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED


2025-01-09 17:36:38,615 INFO: Waiting for log aggregation to finish.


2025-01-09 17:37:00,868 INFO: Execution finished successfully.
