### 0. Imports and Variable Setup

In [20]:

import os
import warnings
import hopsworks
import matplotlib.pyplot as plt
import datetime
import pandas as pd
from joblib import dump, load
from xgboost import XGBRegressor , plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  # For regression tasks
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score


In [21]:

# The Hopsworks API key is expected in the environment (set from GitHub Secrets).
# For a local run, export HOPSWORKS_API_KEY before starting the kernel, e.g. with
# the contents of `hopsworks-api-key.txt`.
HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')

In [22]:
# Connect to the Hopsworks project and get a handle on its feature store.
# The API key read from the environment earlier in the notebook is passed
# explicitly so the credential source is visible at the call site; when it is
# None, `hopsworks.login` falls back to its default key lookup.
project = hopsworks.login(
    project="ScalableMLandDeepLcourse",
    api_key_value=HOPSWORKS_API_KEY,
)
fs = project.get_feature_store()

2024-12-29 15:28:41,406 INFO: Closing external client and cleaning up certificates.
Connection closed.
2024-12-29 15:28:41,416 INFO: Initializing external client
2024-12-29 15:28:41,417 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-29 15:28:42,546 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1170582


In [23]:
# Local directories for the model artifacts and their images.
model_dir = "trafic_pred_model"
images_dir = model_dir + "/images"

# `images_dir` lives inside `model_dir`, so one recursive call creates both.
# `exist_ok=True` replaces the separate exists-then-mkdir checks, which could
# fail if the directory appeared between the check and the creation.
os.makedirs(images_dir, exist_ok=True)

## 4. Batch Inference

In [25]:
# Look up version 1 of the "trafic_pred_tree" model in the project's model registry.
mr = project.get_model_registry()
retrieved_model = mr.get_model(name="trafic_pred_tree", version=1)

# Download the saved model artifacts to a local directory.
saved_model_dir = retrieved_model.download()

Downloading model artifact (1 dirs, 1 files)... DONE

In [26]:
# The downloaded artifact directory is expected to contain the model under this file name.
MODEL_FILE_NAME = "decision_tree_model.joblib"
model_file_path = os.path.join(saved_model_dir, MODEL_FILE_NAME)

# Deserialize the model with joblib.
trafic_pred_tree = load(model_file_path)

In [27]:
# Reference timestamp for the batch: the current local time truncated to the
# start of the hour, then shifted three hours into the past.
current_hour = datetime.datetime.now().replace(minute=0, second=0, microsecond=0)
today = current_hour - datetime.timedelta(hours=3)
today

datetime.datetime(2024, 12, 29, 12, 0)

In [28]:
# Handles on version 1 of the traffic and weather/holiday feature groups.
trafic_fg = fs.get_feature_group(name="stockholm_traffic", version=1)
weather_fg = fs.get_feature_group(name="stockholm_weather_holiday", version=1)

# Read only the rows whose `date` equals the reference timestamp `today`.
trafic_query = trafic_fg.filter(trafic_fg.date == today)
trafic_df = trafic_query.read()
weather_query = weather_fg.filter(weather_fg.date == today)
weather_df = weather_query.read()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.10s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.55s) 


In [29]:

# Label-encode the `frc` column into integer codes stored in a new `FRC` column.
# NOTE(review): the encoder is fitted on this batch only, so the integer codes
# depend on the values present here — confirm they match the encoding the model
# was trained with.
label_encoder_FRC = LabelEncoder()
frc_codes = label_encoder_FRC.fit_transform(trafic_df["frc"])

# Attach the encoded column and drop the original `frc` column.
trafic_df = trafic_df.assign(FRC=frc_codes).drop(columns="frc")


In [30]:
# Label-encode the `coordinates` column into integer codes stored in a new `SEG` column.
# NOTE(review): the encoder is fitted on this batch only, so a given coordinate
# value may receive a different code when the set of values changes — confirm
# the codes match the encoding the model was trained with.
label_encoder_coor = LabelEncoder()
seg_codes = label_encoder_coor.fit_transform(trafic_df["coordinates"])

# Attach the encoded column and drop the original `coordinates` column.
trafic_df = trafic_df.assign(SEG=seg_codes).drop(columns="coordinates")

In [31]:
# Inner-join traffic and weather rows on their shared `date` column.
merged_df = trafic_df.merge(weather_df, on="date", how="inner")

In [32]:
# Derive the hour of day from `date` into the `heure` column ("heure" is French for "hour").
date_values = pd.to_datetime(merged_df["date"])
merged_df["heure"] = date_values.dt.hour

In [33]:
# Remove the raw `date` column from the merged frame.
merged_df = merged_df.drop(columns="date")

In [34]:
# Display the merged frame that is passed to the model for prediction.
merged_df

Unnamed: 0,currentspeed,freeflowspeed,currenttraveltime,freeflowtraveltime,confidence,roadclosure,relativespeed,FRC,SEG,temperature_2m_max,temperature_2m_min,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,holiday_status,heure
0,14,20,6,4,0.93,False,0.700000,2,111,7.876,6.026,0.3,33.48,243.37944,0,13
1,15,21,133,95,0.99,False,0.714286,3,24,7.876,6.026,0.3,33.48,243.37944,0,13
2,27,27,69,69,1.00,False,1.000000,2,33,7.876,6.026,0.3,33.48,243.37944,0,13
3,23,23,21,21,1.00,False,1.000000,2,40,7.876,6.026,0.3,33.48,243.37944,0,13
4,11,13,26,22,1.00,False,0.846154,2,38,7.876,6.026,0.3,33.48,243.37944,0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,23,23,23,23,1.00,False,1.000000,3,106,7.876,6.026,0.3,33.48,243.37944,0,13
162,21,21,25,25,1.00,False,1.000000,5,45,7.876,6.026,0.3,33.48,243.37944,0,13
163,19,19,22,22,1.00,False,1.000000,6,4,7.876,6.026,0.3,33.48,243.37944,0,13
164,17,28,120,73,0.99,False,0.607143,4,51,7.876,6.026,0.3,33.48,243.37944,0,13


In [35]:
# Run the model on the feature columns and store the result in `predicted_rs`.
# Any existing `predicted_rs` column is excluded from the model input so the
# cell can be re-run: otherwise a second execution would pass the previous
# predictions to `predict` as an extra feature. `errors="ignore"` makes the
# drop a no-op on the first run, when the column does not exist yet.
feature_df = merged_df.drop(columns=["predicted_rs"], errors="ignore")
merged_df["predicted_rs"] = trafic_pred_tree.predict(feature_df)

In [36]:
# Keep only the `SEG` key and its prediction for the monitoring output.
PREDICTION_COLUMNS = ["SEG", "predicted_rs"]
batch_df = merged_df[PREDICTION_COLUMNS]

In [37]:
# Display the (SEG, predicted_rs) frame that is inserted into the monitoring feature group.
batch_df

Unnamed: 0,SEG,predicted_rs
0,111,1.000000
1,24,0.714286
2,33,0.629630
3,40,1.000000
4,38,0.571429
...,...,...
161,106,1.000000
162,45,1.000000
163,4,1.000000
164,51,0.633333


In [38]:
# Feature group that receives the predictions, keyed by `SEG`; it is fetched
# if it already exists and created otherwise.
monitor_fg = fs.get_or_create_feature_group(
    name="rs_predictions",
    version=1,
    description="Trafic prediction monitoring",
    primary_key=["SEG"],
)

In [39]:
# Insert the prediction frame into the monitoring feature group.
monitor_fg.insert(batch_df)




Uploading Dataframe: 100.00% |██████████| Rows 166/166 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: rs_predictions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1170582/jobs/named/rs_predictions_1_offline_fg_materialization/executions


(Job('rs_predictions_1_offline_fg_materialization', 'SPARK'), None)