### 0. Imports and Variable Setup

In [18]:

import os
import warnings
import hopsworks
import matplotlib.pyplot as plt
import datetime
import pandas as pd
from joblib import dump, load
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  # For regression tasks
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score


In [19]:

# Read the Hopsworks API key from the environment (e.g. populated from GitHub Secrets).
# The value is None when the variable is not set.
HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')

# Local alternative: load the key from a file into the environment instead.
#with open('hopsworks-api-key.txt', 'r') as file:
#   os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()

In [20]:
# Connect to the Hopsworks project and its Feature Store.
# `project` is the handle used later to reach the model registry;
# `fs` is used later to read and write feature groups.
project = hopsworks.login(project="ScalableMLandDeepLcourse")
fs = project.get_feature_store()

2025-01-08 14:47:09,296 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-08 14:47:09,313 INFO: Initializing external client
2025-01-08 14:47:09,314 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-08 14:47:10,371 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1170582


In [21]:
# Create the local directories for the model artifacts and their images.
# os.makedirs(..., exist_ok=True) creates any missing parent directory and is a
# no-op when the directory already exists, so this cell can be re-run safely and
# has no gap between an existence check and the creation.
model_dir = "trafic_pred_model"
images_dir = model_dir + "/images"
os.makedirs(images_dir, exist_ok=True)

## 4. Batch Inference

In [22]:
# Look up version 4 of the registered traffic model in the project's model registry.
mr = project.get_model_registry()
retrieved_model = mr.get_model("trafic_pred_xgboost", version=4)

# download() fetches the model artifacts and returns the local directory holding them
saved_model_dir = retrieved_model.download()

Downloading model artifact (0 dirs, 2 files)... DONE

In [23]:
# The downloaded artifact directory is expected to contain 'xgboost_model.joblib'
model_file_path = os.path.join(saved_model_dir, "xgboost_model.joblib")

# Load the model.
# NOTE: joblib's load() unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted registry.
# NOTE(review): the recorded output shows an XGBoost warning about a configuration
# generated by an older XGBoost version — consider re-exporting the model with
# `Booster.save_model` as that warning recommends.
trafic_pred_xgboost = load(model_file_path)

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.




In [24]:
# Target timestamp for this batch: the current time truncated to the start of
# the hour, shifted 3 hours into the past.
# NOTE(review): datetime.now() is naive local time. The recorded outputs show
# `today` at 11:00 while the derived `heure` column is 12 — confirm how the
# feature store interprets the timezone of this value.
hour_start = datetime.datetime.now().replace(minute=0, second=0, microsecond=0)
today = hour_start - datetime.timedelta(hours=3)
today

datetime.datetime(2025, 1, 8, 11, 0)

In [25]:
# Handles to the traffic and the weather/holiday feature groups (both version 1).
trafic_fg = fs.get_feature_group('stockholm_traffic', version=1)
weather_fg = fs.get_feature_group("stockholm_weather_holiday", version=1)

# From each group, read only the rows whose `date` equals the target timestamp `today`.
trafic_df, weather_df = (
    fg.filter(fg.date == today).read() for fg in (trafic_fg, weather_fg)
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.10s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.85s) 


In [26]:

# Encode the `frc` column as integer labels stored in a new `FRC` column.
# NOTE(review): the encoder is fitted on this batch alone, so the integer codes
# depend on which values are present in the batch — confirm they match the
# encoding used when the model was trained.
label_encoder_FRC = LabelEncoder()
frc_codes = label_encoder_FRC.fit_transform(trafic_df["frc"])
trafic_df["FRC"] = frc_codes

# Drop the original frc column now that its encoded version exists
trafic_df = trafic_df.drop(columns="frc")

In [27]:
# Derive an integer segment id `SEG` from each row's `coordinates` value.
# NOTE(review): the encoder is fitted per batch, so ids are only stable across
# runs if every batch contains the same set of coordinates — confirm.
label_encoder_coor = LabelEncoder()
seg_codes = label_encoder_coor.fit_transform(trafic_df["coordinates"])
trafic_df["SEG"] = seg_codes

In [28]:
# Attach the weather/holiday columns to every traffic row sharing the same `date`
# (inner join: rows without a match on the other side are dropped).
merged_df = trafic_df.merge(weather_df, on="date", how="inner")

In [29]:
# Hour of day taken from the `date` column ('heure' is French for "hour").
date_as_datetime = pd.to_datetime(merged_df['date'])
merged_df['heure'] = date_as_datetime.dt.hour

In [30]:
# `date` has been used as the join key and for the hour extraction; remove it.
merged_df = merged_df.drop(columns="date")

In [31]:
# Display the merged frame for inspection
merged_df

Unnamed: 0,currentspeed,freeflowspeed,currenttraveltime,freeflowtraveltime,confidence,roadclosure,coordinates,relativespeed,FRC,SEG,temperature_2m_max,temperature_2m_min,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,holiday_status,heure
0,24,38,48,30,1.000000,False,LINESTRING (18.0666464190154 59.34681285368509...,0.631579,1,156,2.976,-0.274,2.2,21.599998,191.639847,0,12
1,22,22,17,17,1.000000,False,LINESTRING (18.057913146460862 59.344553073483...,1.000000,3,109,2.976,-0.274,2.2,21.599998,191.639847,0,12
2,72,72,31,31,1.000000,False,LINESTRING (18.03548585577761 59.3491852516705...,1.000000,0,9,2.976,-0.274,2.2,21.599998,191.639847,0,12
3,14,19,148,109,0.929851,False,LINESTRING (18.05886801287042 59.3382794059407...,0.736842,6,113,2.976,-0.274,2.2,21.599998,191.639847,0,12
4,6,11,173,94,0.770000,False,LINESTRING (18.05543076201704 59.3469348599300...,0.545455,6,89,2.976,-0.274,2.2,21.599998,191.639847,0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,16,32,55,27,0.980000,False,"LINESTRING (18.066489509788 59.34369878970523,...",0.500000,4,153,2.976,-0.274,2.2,21.599998,191.639847,0,12
161,20,20,37,37,1.000000,False,LINESTRING (18.049820921861055 59.347588002544...,1.000000,6,66,2.976,-0.274,2.2,21.599998,191.639847,0,12
162,73,73,34,34,1.000000,False,LINESTRING (18.034840784509413 59.348585768585...,1.000000,0,6,2.976,-0.274,2.2,21.599998,191.639847,0,12
163,19,23,122,100,1.000000,False,LINESTRING (18.067642859664687 59.335295413776...,0.826087,4,159,2.976,-0.274,2.2,21.599998,191.639847,0,12


In [32]:
# Score the batch with every column except the raw geometry string.
# 'predicted_rs' is excluded as well (ignored when absent) so that re-running
# this cell does not feed a previous run's predictions back into the model as
# an extra feature.
# NOTE(review): the feature frame still contains 'relativespeed', the value the
# prediction is later reported next to — confirm the model was trained with it
# as an input.
feature_df = merged_df.drop(columns=['coordinates', 'predicted_rs'], errors='ignore')
merged_df['predicted_rs'] = trafic_pred_xgboost.predict(feature_df)

In [33]:
# Keep only the columns stored for monitoring: segment id, geometry, the
# prediction, and the `relativespeed` value from the batch.
monitoring_columns = ['SEG','coordinates','predicted_rs','relativespeed']
batch_df = merged_df[monitoring_columns]

In [34]:
# Display the frame that will be written to the monitoring feature group
batch_df

Unnamed: 0,SEG,coordinates,predicted_rs,relativespeed
0,156,LINESTRING (18.0666464190154 59.34681285368509...,0.695825,0.631579
1,109,LINESTRING (18.057913146460862 59.344553073483...,0.805662,1.000000
2,9,LINESTRING (18.03548585577761 59.3491852516705...,0.938617,1.000000
3,113,LINESTRING (18.05886801287042 59.3382794059407...,0.780185,0.736842
4,89,LINESTRING (18.05543076201704 59.3469348599300...,0.818123,0.545455
...,...,...,...,...
160,153,"LINESTRING (18.066489509788 59.34369878970523,...",0.657032,0.500000
161,66,LINESTRING (18.049820921861055 59.347588002544...,0.880070,1.000000
162,6,LINESTRING (18.034840784509413 59.348585768585...,0.954082,1.000000
163,159,LINESTRING (18.067642859664687 59.335295413776...,0.795522,0.826087


In [100]:
# Feature group holding the predictions for monitoring, keyed by SEG; it is
# fetched if it already exists and created otherwise.
# NOTE(review): no event_time is configured and the written frame has no date
# column, so predictions from different hours are presumably not
# distinguishable — confirm this is intended.
monitor_fg = fs.get_or_create_feature_group(
    name='rs_predictions',
    version=1,
    primary_key=['SEG'],
    description='Trafic prediction monitoring',
)

In [101]:
# Write this batch of predictions to the monitoring feature group
monitor_fg.insert(batch_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1170582/fs/1161285/fg/1394717


Uploading Dataframe: 100.00% |██████████| Rows 165/165 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: rs_predictions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1170582/jobs/named/rs_predictions_1_offline_fg_materialization/executions


(Job('rs_predictions_1_offline_fg_materialization', 'SPARK'), None)