# <span style="font-width:bold; font-size: 3rem; color:#1EB182;"><img src="images/icon102.png" width="38px"></img> **Hopsworks Feature Store** </span><span style="font-width:bold; font-size: 3rem; color:#333;">- Part 04: Batch Predictions</span>


## 🗒️ In this notebook we will see how to create a training dataset from the feature groups: 

1. Loading the training data.
2. Train the model.
3. Register model in Hopsworks model registry.

![part3](images/03_model.png) 

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [1]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store() 

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/167
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;"> 🪝 Feature View and Training Dataset Retrieval </span>

In [2]:
feature_view = fs.get_feature_view(
    name = 'air_quality_fv',
    version = 1
)

In [3]:
train_data = feature_view.get_training_data(1)[0]
train_data.head()

Unnamed: 0,city,aqi,date,iaqi_h,iaqi_p,iaqi_pm10,iaqi_t,o3_avg,o3_max,o3_min,...,windgust,windspeed,winddir,pressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions
0,2,16,1663027200000,0.296339,-0.769711,-0.228696,0.174822,-0.262875,-0.295896,-0.095551,...,1.292511,0.49434,-0.629129,-0.94677,1.251215,-1.073768,-1.719523,-1.726476,-1.878691,2
1,1,4,1662584400000,0.809041,1.236292,-0.41581,-2.566292,-0.047795,0.430394,0.217162,...,-0.875157,-0.986137,-0.688734,1.13537,-1.721077,0.602495,0.423267,0.419426,0.285561,0
2,1,19,1662940800000,-0.899964,0.344735,2.390909,0.723045,0.167284,-0.295896,0.68623,...,-0.610403,-0.512995,-0.757281,0.284408,-0.321398,-1.937297,-0.079925,-0.04877,0.285561,0
3,2,8,1662584400000,-0.764249,1.28087,-0.41581,-0.949035,-0.262875,-0.295896,0.373518,...,-0.941345,-1.154026,-0.092678,1.244003,0.402761,0.602495,0.134522,0.107295,0.285561,0
4,1,16,1662930000000,-1.101023,0.344735,1.829565,0.997157,0.167284,-0.295896,0.68623,...,-0.701412,-0.757197,-0.761751,0.293461,0.073106,-1.327747,0.011257,0.029262,0.285561,0


---

## <span style="color:#ff5f27;"> 🧬 SequenceDataset </span>

In [4]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [5]:
class SequenceDataset(Dataset):
    def __init__(self, data, target, sequence_length=2):
        self.features = [column for column in [*data.columns] if column != target]
        self.target = target
        self.sequence_length = sequence_length
        self.X = torch.tensor(data.drop(target, axis = 1).values.astype(np.float32)) 
        self.y = torch.tensor(data[self.target])
        
    def __repr__(self):
        return f'Features: {self.features}\nTarget: {self.target}'
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self,i):
        if i < self.sequence_length:
            dist = self.sequence_length - i+1
            X = [*[self.X[0] for i in range(dist-2)],*self.X[:i+1]]
            return torch.tensor([[*obs] for obs in X]).float(),self.y[i].float()
        return self.X[i+1 - self.sequence_length:i+1].float(),self.y[i].float()

In [7]:
df_torch = SequenceDataset(
    data = train_data,
    target = 'aqi',
    sequence_length = 5
)

print(df_torch.X[0])
print(df_torch.y[0])

tensor([ 2.0000e+00,  1.6630e+12,  2.9634e-01, -7.6971e-01, -2.2870e-01,
         1.7482e-01, -2.6288e-01, -2.9590e-01, -9.5551e-02, -1.1894e+00,
        -1.5127e+00, -1.3768e+00, -9.3865e-01, -1.1514e+00, -7.9609e-01,
        -3.5355e-01, -3.5355e-01, -3.5355e-01, -9.8886e-01,  5.5164e-01,
        -1.7699e-01, -9.8886e-01,  5.9449e-01, -8.3310e-02,  8.8902e-01,
         1.2962e+00,  3.0990e+00,  1.2638e+00,  2.6862e+00,  0.0000e+00,
         0.0000e+00,  1.2925e+00,  4.9434e-01, -6.2913e-01, -9.4677e-01,
         1.2512e+00, -1.0738e+00, -1.7195e+00, -1.7265e+00, -1.8787e+00,
         2.0000e+00])
tensor(16)


In [8]:
df_torch[2]

(tensor([[ 2.0000e+00,  1.6630e+12,  2.9634e-01, -7.6971e-01, -2.2870e-01,
           1.7482e-01, -2.6288e-01, -2.9590e-01, -9.5551e-02, -1.1894e+00,
          -1.5127e+00, -1.3768e+00, -9.3865e-01, -1.1514e+00, -7.9609e-01,
          -3.5355e-01, -3.5355e-01, -3.5355e-01, -9.8886e-01,  5.5164e-01,
          -1.7699e-01, -9.8886e-01,  5.9449e-01, -8.3310e-02,  8.8902e-01,
           1.2962e+00,  3.0990e+00,  1.2638e+00,  2.6862e+00,  0.0000e+00,
           0.0000e+00,  1.2925e+00,  4.9434e-01, -6.2913e-01, -9.4677e-01,
           1.2512e+00, -1.0738e+00, -1.7195e+00, -1.7265e+00, -1.8787e+00,
           2.0000e+00],
         [ 2.0000e+00,  1.6630e+12,  2.9634e-01, -7.6971e-01, -2.2870e-01,
           1.7482e-01, -2.6288e-01, -2.9590e-01, -9.5551e-02, -1.1894e+00,
          -1.5127e+00, -1.3768e+00, -9.3865e-01, -1.1514e+00, -7.9609e-01,
          -3.5355e-01, -3.5355e-01, -3.5355e-01, -9.8886e-01,  5.5164e-01,
          -1.7699e-01, -9.8886e-01,  5.9449e-01, -8.3310e-02,  8.8902e-01,
 

In [9]:
df_torch[5]

(tensor([[ 1.0000e+00,  1.6626e+12,  8.0904e-01,  1.2363e+00, -4.1581e-01,
          -2.5663e+00, -4.7795e-02,  4.3039e-01,  2.1716e-01, -3.5798e-01,
          -2.9577e-01, -8.4382e-01, -5.1091e-01, -4.5017e-01, -3.0197e-01,
           2.8284e+00,  2.8284e+00,  2.8284e+00, -2.8270e-01, -1.2183e+00,
          -9.4933e-01, -2.8270e-01, -1.1463e+00, -8.6570e-01, -9.4565e-01,
          -4.3875e-01, -5.2007e-01, -6.5182e-01, -6.4840e-01,  0.0000e+00,
           0.0000e+00, -8.7516e-01, -9.8614e-01, -6.8873e-01,  1.1354e+00,
          -1.7211e+00,  6.0250e-01,  4.2327e-01,  4.1943e-01,  2.8556e-01,
           0.0000e+00],
         [ 1.0000e+00,  1.6629e+12, -8.9996e-01,  3.4474e-01,  2.3909e+00,
           7.2305e-01,  1.6728e-01, -2.9590e-01,  6.8623e-01, -3.5798e-01,
          -6.0000e-01,  2.2206e-01, -5.1091e-01, -6.8391e-01, -3.0197e-01,
          -3.5355e-01, -3.5355e-01, -3.5355e-01,  1.0466e+00, -8.5828e-01,
          -1.2872e-01,  1.0466e+00, -6.0871e-01, -8.3310e-02,  3.9638e-02,
 

---

## <span style="color:#ff5f27;">🧑🏻‍🔬 DataLoader </span>

In [10]:
loader_train = DataLoader(df_torch,batch_size = 3)

X, y = next(iter(loader_train))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: torch.Size([3, 5, 41])
Target shape: torch.Size([3])


---

## <span style="color:#ff5f27;">🤖 Model Building </span>

In [11]:
class LSTMModel(nn.Module):
    def __init__(self,n_features,num_layers,hidden_size):
        super().__init__()
        self.n_features=n_features
        self.num_layers=num_layers
        self.hidden_size=hidden_size
        
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            batch_first=True,
            num_layers=self.num_layers
        )
        
        self.linear = nn.Linear(
            in_features=hidden_size,
            out_features=1
        )
        
    def forward(self, x):
        batch_size = x.shape[0]
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).requires_grad_()
        
        _, (hn, _) = self.lstm(x, (h0, c0))
        out = self.linear(hn[0]).flatten()  

        return out

In [12]:
model = LSTMModel(
    n_features=len(df_torch.X[0]),
    num_layers=32,
    hidden_size=16
)

learning_rate = 0.05
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.MSELoss()

---

## <span style="color:#ff5f27;">👨🏻‍⚖️ Model Evaluation </span>

In [13]:
def train_model(data_loader, model, loss_function, optimizer):
    num_batches = len(data_loader)
    total_loss = .0
    model.train()

    for X, y in data_loader:
        output = model(X)

        loss = loss_function(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Train loss: {avg_loss}")

def test_model(data_loader, model, loss_function):

    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in data_loader:
            output = model(X)
            total_loss += loss_function(output, y).item()

    avg_loss = total_loss / num_batches
    print(f"Test loss: {avg_loss}\n")

In [14]:
for epoch in range(10):
    print(f"Epoch {epoch}\n---------")
    train_model(loader_train, model, loss_function, optimizer=optimizer)

Epoch 0
---------
Train loss: 230.4072945912679
Epoch 1
---------
Train loss: 199.49027304848036
Epoch 2
---------
Train loss: 173.50038990626732
Epoch 3
---------
Train loss: 152.3059850037098
Epoch 4
---------
Train loss: 135.46838302413622
Epoch 5
---------
Train loss: 122.38609488805135
Epoch 6
---------
Train loss: 112.42339372634888
Epoch 7
---------
Train loss: 104.97926163673401
Epoch 8
---------
Train loss: 99.51896635691325
Epoch 9
---------
Train loss: 95.58588027954102


## <span style='color:#ff5f27'>👮🏼‍♀️ Model Registry</span>

In [15]:
mr = project.get_model_registry()

Connected. Call `.close()` to terminate connection gracefully.


In [17]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(train_data.drop('aqi',axis=1))
output_schema = Schema(train_data.aqi)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

model_schema.to_dict()

{'input_schema': {'columnar_schema': [{'name': 'city', 'type': 'object'},
   {'name': 'date', 'type': 'object'},
   {'name': 'iaqi_h', 'type': 'object'},
   {'name': 'iaqi_p', 'type': 'object'},
   {'name': 'iaqi_pm10', 'type': 'object'},
   {'name': 'iaqi_t', 'type': 'object'},
   {'name': 'o3_avg', 'type': 'object'},
   {'name': 'o3_max', 'type': 'object'},
   {'name': 'o3_min', 'type': 'object'},
   {'name': 'pm10_avg', 'type': 'object'},
   {'name': 'pm10_max', 'type': 'object'},
   {'name': 'pm10_min', 'type': 'object'},
   {'name': 'pm25_avg', 'type': 'object'},
   {'name': 'pm25_max', 'type': 'object'},
   {'name': 'pm25_min', 'type': 'object'},
   {'name': 'uvi_avg', 'type': 'object'},
   {'name': 'uvi_max', 'type': 'object'},
   {'name': 'uvi_min', 'type': 'object'},
   {'name': 'tempmax', 'type': 'object'},
   {'name': 'tempmin', 'type': 'object'},
   {'name': 'temp', 'type': 'object'},
   {'name': 'feelslikemax', 'type': 'object'},
   {'name': 'feelslikemin', 'type': 'object

In [20]:
model_dir = "./model"
torch.save(model, model_dir)

model_hops = mr.sklearn.create_model(
    name = "pytorch_model",
    #input_example = df_torch[5],
    model_schema = model_schema
)

model_hops.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/167/models/pytorch_model/1


Model(name: 'pytorch_model', version: 1)

## <span style="color:#ff5f27;">🚀 Fetch and test the model </span>

In [28]:
model = mr.get_model("pytorch_model", version = 1)
model_dir = model.download()

model = torch.load(model_dir + '/model')
model

Downloading file ... 

LSTMModel(
  (lstm): LSTM(41, 16, num_layers=32, batch_first=True)
  (linear): Linear(in_features=16, out_features=1, bias=True)
)

In [29]:
def predict(data_loader, model):

    output = torch.tensor([])
    model.eval()
    with torch.no_grad():
        for X, _ in data_loader:
            y_star = model(X).int()
            output = torch.cat((output, y_star), 0)

    return output

In [30]:
predict(loader_train,model)

tensor([10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10.])

---