In [1]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
load_dotenv()
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [3]:
project = hopsworks.login()
fs = project.get_feature_store() 
# secrets = utils.secrets_api(project.name)

CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# latitude =
# longitude =

today = datetime.datetime.now()

2025-01-28 07:43:19,883 INFO: Initializing external client


2025-01-28 07:43:19,883 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-01-28 07:43:21,918 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [4]:
# Retrieve feature groups
bike_fg = fs.get_feature_group(
    name='bike_data',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather_data',
    version=1,
)


## Clone and pull the repository with the bike data

In [5]:
# Configuration
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

In [6]:
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# go to main and pull the latest changes
repo.git.checkout("main", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x16f2eb38400>,
 <git.remote.FetchInfo at 0x16f2eb39490>]

## Get the latest datetime present in the bike data

In [7]:
# last_bike_datetime = "2025-01-06 15:06:11 UTC"

bike_df = bike_fg.read()

last_bike_datetime = bike_df["datetime"].max()
last_bike_datetime = last_bike_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

last_bike_datetime

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Reading data from Hopsworks, using Hopsworks Feature Query Service   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.96s) 


'2025-01-27 00:00:00 UTC'

## Loop through the commits and convert the bike data into a dataframe

In [8]:
results = {}

# Populate the results dict with the stations
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    with open(data_file, "r") as f:
        data = f.read()
        data = json.loads(data)
        for feature in data["features"]:
            name = feature["properties"]["name"]
            results[name] = []

In [9]:
start_time = datetime.datetime.now()
last_day_and_hour = None

for commit in repo.iter_commits():
    # Stop when we reach the earliest bike date
    if commit.committed_datetime <= datetime.datetime.strptime(last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z").replace(tzinfo=datetime.timezone.utc):
        print("breaking at: ", commit.committed_datetime)
        break

    # Skip commits from today
    if commit.committed_datetime > today.replace(tzinfo=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0):
        first_commited_datetime = repo.commit().committed_datetime
        continue

    day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
    # print("day_and_hour: ", day_and_hour, " - last_day_and_hour: ", last_day_and_hour)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Get the data for the commit
    print("Processing commit: ", commit.hexsha, " - ", commit.committed_datetime)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
    if os.path.exists(data_file):
        with open(data_file, "r") as f:
            data = f.read()
            data = json.loads(data)
            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    continue

print(results)


Processing commit:  dbc421663b7bd93e7b3b366eb1ed691b9edb4105  -  2025-01-27 23:49:07+00:00


Processing commit:  2367a78b77be44f2b6826af9676114be6729dab4  -  2025-01-27 22:49:12+00:00


Processing commit:  74580eca28e79de235f241cf6f2dd3728d909740  -  2025-01-27 21:49:04+00:00


Processing commit:  c22d3f1647812da983a2c7879ce9da07862be70a  -  2025-01-27 20:49:05+00:00


Processing commit:  ad949745c4e1a1e259e78222999dd7b684fbbecc  -  2025-01-27 19:49:05+00:00


Processing commit:  5e3db92adb89dda5782cad423e68e184dc8670c2  -  2025-01-27 18:49:19+00:00


Processing commit:  26c10c752ca3cec0f4c2079739748e36f9040f32  -  2025-01-27 17:49:02+00:00


Processing commit:  2e22a5e7df14213a27371034d36e7b93f4373205  -  2025-01-27 16:49:09+00:00


Processing commit:  96e3029a413091600b889e730eeece5e482d3472  -  2025-01-27 15:49:01+00:00


Processing commit:  457fecf3afeedc72916d28b49e6058f17bfd3d79  -  2025-01-27 14:49:12+00:00


Processing commit:  efddc314c43f79763c192780af54eb63f249c86e  -  2025-01-27 13:49:06+00:00


Processing commit:  d9233584c7a4bbb5a8705eb8b2bbaa499106d267  -  2025-01-27 12:46:25+00:00


Processing commit:  685ba0be54a998c5e0f28d63a3be6b2682c30db9  -  2025-01-27 11:48:55+00:00


Processing commit:  e1f9f6210bd535082e629989ef9aabf0e5edf554  -  2025-01-27 10:49:05+00:00


Processing commit:  7a7d4bb5c95e2bca25e33b759ba474d9d5f70774  -  2025-01-27 09:49:21+00:00


Processing commit:  af72e84936d91668d4c16b3b0b021d330c588df8  -  2025-01-27 08:49:27+00:00


Processing commit:  e93f5e85cf9e30c7b78f1028e396e3c9b1ea042e  -  2025-01-27 07:49:04+00:00


Processing commit:  5c2d3fc3f02dd6ecde6a48a5b430286ed808be6b  -  2025-01-27 06:49:42+00:00


Processing commit:  2130060903aea09fd7d99d45f7ff0cd10e95fa10  -  2025-01-27 05:49:06+00:00


Processing commit:  0d7eb7305eab3d50bd84c4ad8cebda989d7ec800  -  2025-01-27 04:49:02+00:00


Processing commit:  6ffe76e30fc28dd6ff5b37f1b4c422add06e868a  -  2025-01-27 03:49:05+00:00


Processing commit:  b693746a1c800b008a6b310cb33c19fe059559ee  -  2025-01-27 02:57:34+00:00


Processing commit:  8a88a933e69347b3d1bb3a5afee2fc9b0cd26d3f  -  2025-01-27 01:33:40+00:00


Processing commit:  cf2a177b269aaafa93963f0cc4ba97c0d8fddb59  -  2025-01-27 00:49:45+00:00


breaking at:  2025-01-26 23:49:04+00:00
{'CLARENDON ROW': [[19, datetime.datetime(2025, 1, 27, 23, 49, 7, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FC966E0>)], [19, datetime.datetime(2025, 1, 27, 22, 49, 12, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FC95720>)], [20, datetime.datetime(2025, 1, 27, 21, 49, 4, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FCAFF70>)], [21, datetime.datetime(2025, 1, 27, 20, 49, 5, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FCAFD30>)], [21, datetime.datetime(2025, 1, 27, 19, 49, 5, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FCAFE50>)], [25, datetime.datetime(2025, 1, 27, 18, 49, 19, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FDE9450>)], [28, datetime.datetime(2025, 1, 27, 17, 49, 2, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FDFBEB0>)], [22, datetime.datetime(2025, 1, 27, 16, 49, 9, tzinfo=<git.objects.util.tzoffset object at 0x0000016F2FDFBF70>)], [21, datetime.datetime(2025

In [10]:
# turn results into a dataframe
df_bike_today = pd.DataFrame()

for station, values in results.items():
    if len(values) > 0:
        df = pd.DataFrame(values, columns=["num_bikes_available", "datetime"])
        df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
        df["station"] = station.replace(" ", "_")
        df_bike_today = pd.concat([df_bike_today, df])

# if empty, do nothing
if not df_bike_today.empty:
    df_bike_today.dropna(inplace=True)
    df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")
    df_bike_today = df_bike_today[df_bike_today['station'].isin([STATION.replace(" ", "_")])]
    df_bike_today['datetime'] = pd.to_datetime(df_bike_today['datetime'], utc=True).dt.floor('H') + datetime.timedelta(hours=1)
df_bike_today


Unnamed: 0,num_bikes_available,datetime,station
0,40.0,2025-01-28 00:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
1,40.0,2025-01-27 23:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
2,39.0,2025-01-27 22:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
3,39.0,2025-01-27 21:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
4,37.0,2025-01-27 20:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
5,39.0,2025-01-27 19:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
6,34.0,2025-01-27 18:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
7,21.0,2025-01-27 17:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
8,9.0,2025-01-27 16:00:00+00:00,HEUSTON_BRIDGE_(NORTH)
9,17.0,2025-01-27 15:00:00+00:00,HEUSTON_BRIDGE_(NORTH)


## Fetch the weather data for the same time period

In [11]:
forecast_df = utils.get_hourly_weather_forecast(CITY)
forecast_df = forecast_df.rename(columns={'date_x': 'datetime'})
forecast_df = forecast_df.drop(columns=['date_y', 'date_only'])
forecast_df.dropna(inplace=True)

print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}


Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-28 00:00:00+00:00,5.60,4.084646,0.0,0.0,2.276840,31429.511719,4.8,dublin
1,2025-01-28 01:00:00+00:00,5.40,3.280367,0.0,0.0,6.130579,31429.511719,4.8,dublin
2,2025-01-28 02:00:00+00:00,5.30,2.636153,0.0,0.0,9.931042,31429.511719,4.8,dublin
3,2025-01-28 03:00:00+00:00,5.35,2.213164,0.0,0.0,13.246826,31429.511719,4.8,dublin
4,2025-01-28 04:00:00+00:00,5.75,2.258863,1.1,0.0,16.263872,31429.511719,4.8,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-02-06 19:00:00+00:00,7.85,3.175895,0.0,0.0,20.873791,33421.378906,0.0,dublin
236,2025-02-06 20:00:00+00:00,7.50,2.722842,0.0,0.0,21.650938,33421.378906,0.0,dublin
237,2025-02-06 21:00:00+00:00,7.15,2.268211,0.0,0.0,22.183128,33421.378906,0.0,dublin
238,2025-02-06 22:00:00+00:00,6.75,1.752871,0.0,0.0,22.725668,33421.378906,0.0,dublin


## Insert the bike and weather data into Hopsworks

In [12]:
if df_bike_today.empty:
    print("No bike data available for today")
else:
    bike_fg.insert(df_bike_today)

Uploading Dataframe: 0.00% |                                      | Rows 0/24 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 100.00% |███████████████████████████████| Rows 24/24 | Elapsed Time: 00:00 | Remaining Time: 00:00




Launching job: bike_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [13]:
if forecast_df.empty:
    print("No weather forecast available for today")
else:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})


Uploading Dataframe: 0.00% |                                     | Rows 0/240 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 100.00% |█████████████████████████████| Rows 240/240 | Elapsed Time: 00:00 | Remaining Time: 00:00




Launching job: weather_data_1_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions


2025-01-28 07:44:33,614 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED


2025-01-28 07:44:36,812 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED


2025-01-28 07:44:39,985 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


2025-01-28 07:46:47,474 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED


2025-01-28 07:46:47,628 INFO: Waiting for log aggregation to finish.


2025-01-28 07:47:12,979 INFO: Execution finished successfully.
