In [14]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from functions import utils
import json
import os
import git
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [15]:
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment before reading the key.
load_dotenv()

HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
if not HOPSWORKS_API_KEY:
    # Assigning None to os.environ would fail with an unhelpful TypeError;
    # fail early with a message that says what is actually missing.
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["HOPSWORKS_API_KEY"] = HOPSWORKS_API_KEY

## Get the necessary data from Hopsworks

In [16]:
# Log in to Hopsworks and grab the project's feature store handle.
project = hopsworks.login()
fs = project.get_feature_store()
# secrets = utils.secrets_api(project.name)

# City used for the weather forecast and the station whose bike counts are kept.
CITY = "dublin"
STATION = "HEUSTON BRIDGE (NORTH)"

# latitude =
# longitude =

# Time bounds of the bike data, formatted as "%Y-%m-%d %H:%M:%S %Z".
earliest_bike_datetime = "2023-08-05 12:56:53 UTC"
last_bike_datetime = "2025-01-06 15:06:11 UTC"

# Timezone-aware "now" in UTC. Commit timestamps are timezone-aware, and a naive
# local timestamp later re-labelled as UTC would shift the "today" boundary on any
# machine whose local timezone is not UTC.
today = datetime.datetime.now(datetime.timezone.utc)

2025-01-08 11:08:41,576 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-08 11:08:41,586 INFO: Initializing external client
2025-01-08 11:08:41,590 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-08 11:08:42,927 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207494


In [17]:
# Retrieve feature groups
bike_fg = fs.get_feature_group(
    name='bike_data',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather_data',
    version=1,
)


## Clone and pull the repository with the bike data

In [18]:
# Configuration
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

In [19]:
# A clone is only needed the first time; afterwards the existing checkout is reused.
clone_exists = os.path.exists(CLONE_DIR)
if not clone_exists:
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)

repo = git.Repo(CLONE_DIR)

# Force-switch back to the master branch (discarding local changes to the
# working tree) and bring it up to date with origin.
repo.git.checkout("master", force=True)
repo.remotes.origin.pull()

[<git.remote.FetchInfo at 0x28c67d53650>,
 <git.remote.FetchInfo at 0x28c691bb330>]

## Loop through the commits and convert the bike data into a dataframe

In [20]:
# Seed `results` with one empty list per station found in the checked-out
# stations file, keyed by the station's "name" property. The commit loop in a
# later cell appends snapshots to these lists.
results = {}

data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    # GeoJSON is UTF-8; be explicit so the platform's default encoding is not used.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    results = {feature["properties"]["name"]: [] for feature in data["features"]}
else:
    # Without this file no station is registered and every later append is skipped,
    # so make the situation visible instead of silently continuing.
    print("Stations file not found: ", data_file)

In [21]:
# Walk the repository history and record, for every station registered in
# `results`, one [available_bikes, commit_datetime] entry per clock hour.
# Only commits newer than `last_bike_datetime` and not from the current UTC day
# are considered.
start_time = datetime.datetime.now()
last_day_and_hour = None

# Lower bound: loop-invariant, so parse it once instead of on every iteration.
cutoff_datetime = datetime.datetime.strptime(
    last_bike_datetime, "%Y-%m-%d %H:%M:%S %Z"
).replace(tzinfo=datetime.timezone.utc)

# Upper bound: midnight of the current day in UTC. `astimezone` converts a naive
# local `today` (or an already aware one) to UTC, rather than re-labelling local
# wall-clock time as UTC.
today_start_utc = today.astimezone(datetime.timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0
)

# The snapshot path is the same for every commit; only its content changes on checkout.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)

for commit in repo.iter_commits():
    # Stop when we reach data that has already been ingested
    if commit.committed_datetime <= cutoff_datetime:
        print("breaking at: ", commit.committed_datetime)
        break

    # Skip commits from today
    if commit.committed_datetime > today_start_utc:
        first_commited_datetime = repo.commit().committed_datetime
        continue

    # Keep only the first commit encountered for each (day, hour) bucket
    day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
    if day_and_hour == last_day_and_hour:
        continue
    last_day_and_hour = day_and_hour

    # Get the data for the commit
    print("Processing commit: ", commit.hexsha, " - ", commit.committed_datetime)
    try:
        repo.git.checkout(commit.hexsha, force=True)
    except Exception as e:
        print("Error checking out commit: ", e)
        break

    if not os.path.exists(data_file):
        continue

    # GeoJSON is UTF-8; be explicit so the platform's default encoding is not used.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for feature in data["features"]:
        try:
            results[feature["properties"]["name"]].append(
                [feature["properties"]["available_bikes"], commit.committed_datetime]
            )
        except KeyError:
            # Station not registered in `results`, or the feature lacks an expected key.
            continue

# Print a summary instead of dumping every collected snapshot into the cell output.
print(
    "Collected",
    sum(len(values) for values in results.values()),
    "snapshots across",
    len(results),
    "stations",
)


Processing commit:  89b276ca17529039cace86588d6e20564833d373  -  2025-01-07 23:49:02+00:00
Processing commit:  025c435e8c14f836df2fac69b0f3db0dc79650cd  -  2025-01-07 22:49:09+00:00
Processing commit:  a32ce41a032b4e4b6c17466b39f6bf6debf644bf  -  2025-01-07 21:49:03+00:00
Processing commit:  071ade0578359949fccf110eb29f26c8ce36a12e  -  2025-01-07 20:49:06+00:00
Processing commit:  8e3ca6582fbe92b31705d047bd4c970e08def936  -  2025-01-07 19:49:06+00:00
Processing commit:  45efef53d58decb7603aff6dfbed2809b99c765d  -  2025-01-07 18:49:35+00:00
Processing commit:  ee2cd20c0654ad505f6381413fc6129cedb2b56d  -  2025-01-07 17:49:03+00:00
Processing commit:  14f47fa993400d64943ba26771053b30112444f4  -  2025-01-07 16:56:13+00:00
Processing commit:  5a748e85fe82ef6bd73594b6016243a16a39bc0b  -  2025-01-07 15:38:37+00:00
Processing commit:  dc4bd316c0fcde09a6118d5a56795f88a31b5eb0  -  2025-01-07 14:15:43+00:00
Processing commit:  ff4c2edb75445720e19ec9926c6d20bac286f260  -  2025-01-07 13:25:27+00:00

In [22]:
# turn results into a dataframe
df_bike_today = pd.DataFrame()

for station, values in results.items():
    if len(values) > 0:
        df = pd.DataFrame(values, columns=["num_bikes_available", "datetime"])
        df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
        df["station"] = station.replace(" ", "_")
        df_bike_today = pd.concat([df_bike_today, df])
df_bike_today.dropna(inplace=True)
df_bike_today["num_bikes_available"] = df_bike_today["num_bikes_available"].astype("float32")
df_bike_today = df_bike_today[df_bike_today['station'].isin([STATION.replace(" ", "_")])]
df_bike_today


Unnamed: 0,num_bikes_available,datetime,station
0,39.0,2025-01-07 23:49:02+00:00,HEUSTON_BRIDGE_(NORTH)
1,39.0,2025-01-07 22:49:09+00:00,HEUSTON_BRIDGE_(NORTH)
2,40.0,2025-01-07 21:49:03+00:00,HEUSTON_BRIDGE_(NORTH)
3,40.0,2025-01-07 20:49:06+00:00,HEUSTON_BRIDGE_(NORTH)
4,40.0,2025-01-07 19:49:06+00:00,HEUSTON_BRIDGE_(NORTH)
5,40.0,2025-01-07 18:49:35+00:00,HEUSTON_BRIDGE_(NORTH)
6,35.0,2025-01-07 17:49:03+00:00,HEUSTON_BRIDGE_(NORTH)
7,20.0,2025-01-07 16:56:13+00:00,HEUSTON_BRIDGE_(NORTH)
8,18.0,2025-01-07 15:38:37+00:00,HEUSTON_BRIDGE_(NORTH)
9,22.0,2025-01-07 14:15:43+00:00,HEUSTON_BRIDGE_(NORTH)


## Fetch the hourly weather forecast for the city

In [23]:
# Fetch the hourly forecast for CITY, expose 'date_x' as 'datetime', discard the
# helper date columns and remove any row with a missing value.
forecast_df = (
    utils.get_hourly_weather_forecast(CITY)
    .rename(columns={'date_x': 'datetime'})
    .drop(columns=['date_y', 'date_only'])
    .dropna()
)

# Quick sanity check: True here means there is nothing to insert later.
print(forecast_df.empty)

forecast_df

features: {'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
params: {'latitude': 53.35, 'longitude': -6.26, 'hourly': ['temperature_2m', 'apparent_temperature', 'rain', 'snowfall', 'wind_speed_10m'], 'daily': ['daylight_duration', 'rain_sum']}
Coordinates 53.5°N -6.25°E
Elevation 11.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
False


Unnamed: 0,datetime,temperature_2m,apparent_temperature,rain,snowfall,wind_speed_10m,daylight_duration,rain_sum,city
0,2025-01-08 00:00:00+00:00,-1.25,-5.474496,0.0,0.0,12.324414,28021.865234,0.0,dublin
1,2025-01-08 01:00:00+00:00,-1.45,-5.587059,0.0,0.0,11.542478,28021.865234,0.0,dublin
2,2025-01-08 02:00:00+00:00,-1.60,-5.629602,0.0,0.0,10.799999,28021.865234,0.0,dublin
3,2025-01-08 03:00:00+00:00,-1.70,-5.586253,0.0,0.0,9.726665,28021.865234,0.0,dublin
4,2025-01-08 04:00:00+00:00,-1.85,-5.546029,0.0,0.0,8.287822,28021.865234,0.0,dublin
...,...,...,...,...,...,...,...,...,...
235,2025-01-17 19:00:00+00:00,5.05,1.534287,0.0,0.0,14.578890,29351.843750,0.0,dublin
236,2025-01-17 20:00:00+00:00,4.85,1.368568,0.0,0.0,14.277983,29351.843750,0.0,dublin
237,2025-01-17 21:00:00+00:00,4.60,1.185653,0.0,0.0,13.479583,29351.843750,0.0,dublin
238,2025-01-17 22:00:00+00:00,4.30,0.943740,0.0,0.0,12.682018,29351.843750,0.0,dublin


## Insert the bike and weather data into Hopsworks

In [24]:
# Insert the bike rows into the bike feature group only when there are any.
if not df_bike_today.empty:
    bike_fg.insert(df_bike_today)
else:
    print("No bike data available for today")

Uploading Dataframe: 100.00% |██████████| Rows 33/33 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: bike_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/bike_data_1_offline_fg_materialization/executions


In [25]:
# Insert the forecast rows into the weather feature group, passing
# wait_for_job=True in the write options; skipped when the frame is empty.
if not forecast_df.empty:
    weather_fg.insert(forecast_df, write_options={"wait_for_job": True})
else:
    print("No weather forecast available for today")


Uploading Dataframe: 100.00% |██████████| Rows 240/240 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1207494/jobs/named/weather_data_1_offline_fg_materialization/executions
2025-01-08 11:09:28,482 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-01-08 11:09:31,718 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-01-08 11:11:00,979 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-01-08 11:11:01,146 INFO: Waiting for log aggregation to finish.
2025-01-08 11:11:23,108 INFO: Execution finished successfully.


## Update the `last_bike_datetime` secret (currently disabled)

In [26]:
# NOTE(review): the code below is wrapped in a string literal, so nothing in it runs;
# the cell only evaluates (and displays) the string. It references `secrets`, whose
# assignment is commented out in the login cell, so it would need that restored
# before being re-enabled. The bare `except: pass` inside should also be narrowed
# if this is revived.
"""
if not df_bike_today.empty:
    try:
        secrets.get_secret("time_secrets").delete()
    except:
        pass

    new_last_bike_datetime_raw = pd.Series.max(df_bike_today['datetime'])
    new_last_bike_datetime = new_last_bike_datetime_raw.strftime('%Y-%m-%d %H:%M:%S %Z')

    time_secrets_dict = {
        "earliest_bike_datetime": earliest_bike_datetime,
        "last_bike_datetime": new_last_bike_datetime
    }

    secrets.create_secret("time_secrets", json.dumps(time_secrets_dict))
""" 

'\nif not df_bike_today.empty:\n    try:\n        secrets.get_secret("time_secrets").delete()\n    except:\n        pass\n\n    new_last_bike_datetime_raw = pd.Series.max(df_bike_today[\'datetime\'])\n    new_last_bike_datetime = new_last_bike_datetime_raw.strftime(\'%Y-%m-%d %H:%M:%S %Z\')\n\n    time_secrets_dict = {\n        "earliest_bike_datetime": earliest_bike_datetime,\n        "last_bike_datetime": new_last_bike_datetime\n    }\n\n    secrets.create_secret("time_secrets", json.dumps(time_secrets_dict))\n'