# Real-time Inference

This notebook fetches the latest data from our real-time feature store and runs predictions on it.

We don't need Quix for this step; the plain `confluent-kafka` consumer is enough.

In [93]:
from confluent_kafka import Consumer
import json

def fetch_all_feature_records(topic='feature_store', poll_timeout=1.0, max_startup_polls=10):
    """Drain the messages currently available on a Kafka topic into a list.

    Args:
        topic: Name of the topic to read. Defaults to 'feature_store'.
        poll_timeout: Seconds each `poll` call waits for a message.
        max_startup_polls: Number of empty polls tolerated while the consumer
            has no partition assignment yet. Without this grace period an
            empty first poll would end the read before anything was fetched.

    Returns:
        list: One JSON-decoded object per message, in the order received.
        The list is empty when nothing was available, and partial when a
        consumer error stopped the read (a warning is emitted in that case).
    """
    import warnings

    conf = {
        'bootstrap.servers': "localhost:9092",
        'group.id': "feature_store_reader",
        # Only consulted when the group has no committed offset: start from
        # the beginning of the topic rather than skipping existing records.
        'auto.offset.reset': 'earliest'
    }

    consumer = Consumer(conf)
    consumer.subscribe([topic])

    feature_records = []
    startup_polls_left = max_startup_polls

    try:
        while True:
            msg = consumer.poll(poll_timeout)
            if msg is None:
                # An empty poll only means "caught up" once partitions are
                # assigned; before that, keep waiting (a bounded number of times).
                if startup_polls_left > 0 and not consumer.assignment():
                    startup_polls_left -= 1
                    continue
                break

            error = msg.error()
            if error is not None:
                # Keep the best-effort behavior (return what was read so far)
                # but make the failure visible instead of swallowing it.
                warnings.warn(
                    "Stopped reading '{}' after a consumer error: {}".format(topic, error)
                )
                break

            payload = msg.value()
            if payload is None:
                continue  # message without a body: nothing to decode
            feature_records.append(json.loads(payload.decode('utf-8')))
    finally:
        consumer.close()  # always release the consumer, even on failure

    return feature_records

In [97]:
import pandas as pd

# Build a frame from the list of decoded records returned by
# fetch_all_feature_records() and keep only the last four rows.
df = pd.DataFrame(fetch_all_feature_records()).iloc[-4:]
df

Unnamed: 0,id,lag_1,lag_2,lag_6,lag_12,lag_24,rolling_mean_7,rolling_std_7,temperature_forecast,hour,day_of_week,month
0,2024-01-01 09,4396.0,4538.0,5112.0,5405.0,4768.0,4677.428571,296.064102,,,,
1,2024-01-01 09,,,,,,,,-8.3,9.0,0.0,1.0


In [98]:
# Collapse rows that share an `id` into one row, taking the column-wise
# maximum so a value present in only one of the partial rows is kept.
# Grouping by "id" already leaves `id` as the index.
df_agg = df.groupby("id").max()
df_agg

Unnamed: 0_level_0,lag_1,lag_2,lag_6,lag_12,lag_24,rolling_mean_7,rolling_std_7,temperature_forecast,hour,day_of_week,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-01 09,4396.0,4538.0,5112.0,5405.0,4768.0,4677.428571,296.064102,-8.3,9.0,0.0,1.0


In [99]:
# Parse the index into timestamps, then discard rows that still contain
# missing values (incomplete streaming data).
df_agg = df_agg.set_index(pd.to_datetime(df_agg.index)).dropna()

# Newest timestamp first; keep only that row.
latest_feature = df_agg.sort_index(ascending=False).iloc[:1]

latest_feature

Unnamed: 0_level_0,lag_1,lag_2,lag_6,lag_12,lag_24,rolling_mean_7,rolling_std_7,temperature_forecast,hour,day_of_week,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-01 09:00:00,4396.0,4538.0,5112.0,5405.0,4768.0,4677.428571,296.064102,-8.3,9.0,0.0,1.0


In [100]:
# Look up which model artifact to load from the JSON model registry.
import json

file_path = 'models/model_registry.json'

# Read the registry text and parse it as JSON.
with open(file_path) as file:
    model_registry = json.loads(file.read())

# The first registry entry is taken as the latest model
# (assumes the registry lists the newest entry first - TODO confirm).
latest_model = model_registry[0]['model']
latest_model


'models/energy_demand_model_v1.pkl'

In [75]:
import joblib
def load_model(model_path):
    """Deserialize and return the object stored at `model_path` using joblib.

    Args:
        model_path: Location of the serialized model file.

    Returns:
        Whatever object `joblib.load` reconstructs from the file.

    Note:
        joblib files are pickle-based, so only load artifacts you trust.
    """
    return joblib.load(model_path)

In [101]:
# Load the model artifact whose path was read from the registry.
model = load_model(latest_model)

In [104]:
# Columns passed to the model, in this exact order
# (presumably the order used at training time - TODO confirm).
feature_names = [
    'lag_1', 'lag_2', 'lag_6', 'lag_12', 'lag_24',
    'rolling_mean_7', 'rolling_std_7',
    'hour', 'day_of_week', 'month',
    'temperature_forecast',
]
# Keep only those columns, arranged in that order.
latest_feature = latest_feature.loc[:, feature_names]

In [105]:
# Run the model on the single latest feature row; the result is displayed
# as the cell output.
model.predict(latest_feature)

array([4516.791], dtype=float32)

Complete pipeline: get a prediction from the latest available complete data (temperature forecast + energy demand features).

In [108]:
import pandas as pd

# Fetch the newest records; stop early with a clear message if there are none
# (grouping an empty frame by "id" would otherwise fail with a KeyError).
feature_rows = pd.DataFrame(fetch_all_feature_records())
if feature_rows.empty:
    raise ValueError("No feature records were fetched from the feature store")

# Merge the partial rows that share an `id` (the per-column max skips the
# missing values) and keep only ids whose feature set is complete.
# Grouping by "id" already leaves `id` as the index.
df = (feature_rows
      .tail(4)
      .groupby("id")
      .agg("max")
      .dropna())

if df.empty:
    raise ValueError("No complete feature rows available for prediction")

# Convert the index to datetime so the sort is chronological, then keep the
# newest row. (`.assign(index=...)` would only add a column named "index"
# and leave the string index untouched.)
latest_feature = (df
                 .set_index(pd.to_datetime(df.index))  # real datetime index
                 .sort_index(ascending=False)  # newest first
                 .head(1)[feature_names])  # latest row, model column order

# Format the single timestamp; concatenating a str with the Index object
# would print an Index repr instead of a readable message.
print("Prediction based on data from: {}".format(latest_feature.index[0]))

model.predict(latest_feature)

Index(['Prediction based on data from:2024-01-05 12'], dtype='object', name='id')


array([4620.6772], dtype=float32)

Voilà! We can always get the latest predictions from our streaming data!