# <span style="font-weight:bold; font-size: 3rem; color:#1EB182;"><img src="images/icon102.png" width="38px"></img> **Hopsworks Feature Store** </span><span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 04: Batch Predictions</span>


## 🗒️ In this notebook we will see how to train a model on data from the feature store and register it:

1. Load the training data.
2. Train the model.
3. Register the model in the Hopsworks model registry.

![part3](images/03_model.png) 

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is what the feature store
# and the model registry are obtained from later in this notebook.
project = hopsworks.login()

# Handle to the project's feature store, used to look up feature views.
fs = project.get_feature_store()

## <span style="color:#ff5f27;"> 🪝 Feature View and Training Dataset Retrieval </span>

In [None]:
# Look up version 1 of the 'air_quality_fv' feature view by name.
feature_view = fs.get_feature_view(name='air_quality_fv', version=1)

In [None]:
# Keep only the first element of what `get_training_data(1)` returns.
# NOTE(review): presumably that is the features DataFrame of training dataset
# version 1 (the call returning a (features, labels) pair) — confirm against
# the Hopsworks API.
train_data = feature_view.get_training_data(1)[0]

# Preview the first rows.
train_data.head()

_____
## PyTorch LSTM

---

## <span style="color:#ff5f27;"> 🧬 SequenceDataset </span>

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
class SequenceDataset(Dataset):
    """Sliding-window dataset over a tabular frame, for sequence models.

    Item ``i`` is the window of ``sequence_length`` consecutive rows ending at
    row ``i`` (inclusive), paired with the target value of row ``i``. Windows
    that would start before the first row are left-padded with copies of
    row 0, so every item has shape ``(sequence_length, n_features)``.

    Rows are used in the positional order of ``data``; the frame's index
    labels are ignored.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Every non-target column must be castable to ``float32``.
        target: Name of the column to predict.
        sequence_length: Number of rows in each window.
    """

    def __init__(self, data, target='aqi', sequence_length=5):
        self.features = [column for column in data.columns if column != target]
        self.target = target
        self.sequence_length = sequence_length
        self.X = torch.tensor(data.drop(target, axis=1).values.astype(np.float32))
        # Convert through NumPy, like X above. Handing the Series itself to
        # torch.tensor makes torch read it as a generic sequence, and integer
        # indexing on a Series is label-based, which breaks as soon as the
        # frame's index is not 0..n-1 (e.g. after sort_values()/head()).
        self.y = torch.tensor(data[self.target].to_numpy())

    def __repr__(self):
        return f'Features: {self.features}\nTarget: {self.target}'

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return ``(window, target)`` for row ``i``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``i`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self)
        if i < 0:
            # Without this, a negative index fell into the padding branch and
            # produced a window made only of copies of row 0.
            i += n_rows
        if not 0 <= i < n_rows:
            raise IndexError(f'index {i} is out of range for {n_rows} rows')

        start = i + 1 - self.sequence_length
        if start >= 0:
            window = self.X[start:i + 1]
        else:
            # Not enough history yet: repeat the first row -start times so the
            # window still has sequence_length rows.
            padding = self.X[:1].expand(-start, -1)
            window = torch.cat((padding, self.X[:i + 1]))
        return window.float(), self.y[i].float()

In [None]:
# Take the first four rows after ordering by date (descending) and then city
# (ascending). These rows are a copy; they are not removed from train_data.
test_data = (
    train_data
    .sort_values(by=["date", "city"], ascending=[False, True])
    .head(4)
)

In [None]:
test_data.shape

In [None]:
# Wrap the four-row sample in a dataset with windows of a single row.
df_torch_test = SequenceDataset(data=test_data, target='aqi', sequence_length=1)

# Inspect the raw feature rows and targets of the first two observations.
print(df_torch_test.X[1])
print(df_torch_test.y[1])

print(df_torch_test.X[0])
print(df_torch_test.y[0])

In [None]:
# Dataset over the full training frame, using windows of five rows.
df_torch = SequenceDataset(data=train_data, target='aqi', sequence_length=5)

# Raw feature row and target of the first observation.
print(df_torch.X[0])
print(df_torch.y[0])

In [None]:
df_torch[2]

In [None]:
df_torch[5]

---

## <span style="color:#ff5f27;">🧑🏻‍🔬 DataLoader </span>

In [None]:
# One window per batch for the small sample dataset.
loader_test = DataLoader(df_torch_test, batch_size=1)

In [None]:
# Batches of three windows, served in dataset order (no shuffling requested).
loader_train = DataLoader(df_torch, batch_size=3)

# Pull a single batch to check the tensor shapes the model will receive.
X, y = next(iter(loader_train))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

---

## <span style="color:#ff5f27;">🤖 Model Building </span>

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head that emits one value per sequence.

    Args:
        n_features: Size of the feature vector at each time step.
        num_layers: Number of stacked LSTM layers.
        hidden_size: Width of the hidden state of every LSTM layer.
    """

    def __init__(self, n_features, num_layers, hidden_size):
        super().__init__()
        self.n_features = n_features
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            batch_first=True,
            num_layers=self.num_layers,
        )

        self.linear = nn.Linear(
            in_features=hidden_size,
            out_features=1,
        )

    def forward(self, x):
        """Map a batch of sequences to one prediction each.

        Args:
            x: Tensor of shape ``(batch, seq_len, n_features)``; the batch
                dimension comes first because the LSTM is built with
                ``batch_first=True``.

        Returns:
            1-D tensor with one value per sequence in the batch.
        """
        # With no initial state given, nn.LSTM starts from zeros created on
        # the input's device and dtype. The hand-built CPU float32 zeros used
        # before broke as soon as the model or its input lived elsewhere.
        _, (hn, _) = self.lstm(x)
        # hn stacks the final hidden state of every layer along dim 0. The
        # head must read the top layer (hn[-1]); reading hn[0] used only the
        # first layer, so the layers above it never influenced the output.
        out = self.linear(hn[-1]).flatten()

        return out

In [None]:
# The LSTM input width is the length of one feature row of the dataset.
model = LSTMModel(
    n_features=df_torch.X[0].shape[0],
    num_layers=32,
    hidden_size=16,
)

# Mean squared error as the training objective, optimised with Adam.
learning_rate = 0.05
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

---

## <span style="color:#ff5f27;">👨🏻‍⚖️ Model Evaluation </span>

In [None]:
def train_model(data_loader, model, loss_function, optimizer):
    """Run one optimisation pass over ``data_loader`` and report the mean loss.

    Args:
        data_loader: Sized iterable yielding ``(X, y)`` batches.
        model: Module to train; it is switched to training mode.
        loss_function: Callable ``(output, y) -> scalar tensor``.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        float: Mean of the per-batch losses of this pass. The value is also
        printed, as before; returning it lets callers track or log it.
    """
    num_batches = len(data_loader)
    total_loss = 0.0
    model.train()

    for X, y in data_loader:
        output = model(X)
        loss = loss_function(output, y)

        # Clear gradients left over from the previous batch before the
        # backward pass accumulates new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Train loss: {avg_loss}")
    return avg_loss

def test_model(data_loader, model, loss_function):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        data_loader: Sized iterable yielding ``(X, y)`` batches.
        model: Module to evaluate; it is switched to evaluation mode.
        loss_function: Callable ``(output, y) -> scalar tensor``.

    Returns:
        float: Mean of the per-batch losses. The value is also printed, as
        before; returning it lets callers track or log it.
    """
    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in data_loader:
            output = model(X)
            total_loss += loss_function(output, y).item()

    avg_loss = total_loss / num_batches
    print(f"Test loss: {avg_loss}\n")
    return avg_loss


# The `test_` prefix makes pytest try to collect this helper as a test case
# whenever it is visible in a test module; opt out explicitly.
test_model.__test__ = False

In [None]:
# Ten full passes over the training loader; train_model prints the mean loss
# of each pass.
for epoch in range(10):
    print(f"Epoch {epoch}\n---------")
    train_model(
        data_loader=loader_train,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
    )

## <span style='color:#ff5f27'>👮🏼‍♀️ Model Registry</span>

In [None]:
mr = project.get_model_registry()

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model signature from the training frame: every column except
# 'aqi' is an input, and 'aqi' is the output.
input_schema = Schema(train_data.drop(columns='aqi'))
output_schema = Schema(train_data['aqi'])
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

model_schema.to_dict()

In [None]:
# Despite its name, `model_dir` is the path of a single file here: torch.save
# pickles the whole module object to it. The loading cell below expects a file
# called 'model' inside the downloaded directory, so keep this file name.
model_dir = "./model"
torch.save(model, model_dir)

# Create the registry entry with the schema built above, then upload the file.
model_hops = mr.torch.create_model(name="pytorch_model", model_schema=model_schema)

model_hops.save(model_dir)

## <span style="color:#ff5f27;">🚀 Fetch and test the model </span>

In [None]:
model

In [None]:
# Fetch version 1 of the registered model and download its artifacts. The
# registry object gets its own name so `model` only ever refers to the torch
# module.
registered_model = mr.get_model("pytorch_model", version = 1)
model_dir = registered_model.download()

# The artifact was written with torch.save(model, ...), i.e. it is a fully
# pickled nn.Module rather than a state_dict. PyTorch >= 2.6 defaults
# torch.load to weights_only=True, which refuses such files, so the full
# unpickler is requested explicitly (the keyword exists since PyTorch 1.13).
# NOTE: unpickling can execute arbitrary code — only load artifacts that come
# from a registry you trust. The LSTMModel class must be defined before loading.
model = torch.load(model_dir + '/model', weights_only=False)
model

In [None]:
def predict(data_loader, model):
    """Run ``model`` over every batch of ``data_loader`` and collect the outputs.

    Args:
        data_loader: Iterable yielding ``(X, y)`` batches; ``y`` is ignored.
        model: Module to run; it is switched to evaluation mode. It is
            expected to return a 1-D tensor per batch.

    Returns:
        1-D float32 tensor with one integer-valued prediction per observation,
        in loader order. An empty loader gives an empty tensor.
    """
    chunks = []
    model.eval()
    with torch.no_grad():
        for X, _ in data_loader:
            # .int() drops the fractional part (truncation toward zero), so
            # predictions come out as whole numbers.
            chunks.append(model(X).int())

    if not chunks:
        return torch.tensor([])

    # Concatenate once instead of growing a tensor inside the loop (which
    # re-copied all earlier predictions on every batch), then cast to float32,
    # the dtype the previous float/int concatenation produced.
    return torch.cat(chunks, 0).float()

In [None]:
predict(loader_test, model)

In [None]:
predict(loader_train,model)

---

## sklearn GradientBoosting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [None]:
import warnings

# Mute warnings
warnings.filterwarnings("ignore")

In [None]:
train_data.columns

In [None]:
train_data = train_data.sort_values(by=["date", 'city'], ascending=[False, True]).reset_index(drop=True)

In [None]:
# Within each city, shift(1) moves the 'aqi' values down by one row, so each
# row receives the aqi of the row that precedes it in this frame. With the
# date-descending sort applied in the previous cell, that preceding row is the
# next later date for the same city (the next day if dates are consecutive).
# The newest row of each city has no such row and gets NaN.
train_data["aqi_next_day"] = train_data.groupby('city')['aqi'].shift(1)

In [None]:
train_data.head(5)

In [None]:
train_data.shape

In [None]:
# Drop the 'date' column, then discard every row that contains a NaN in any
# remaining column — this includes the rows whose 'aqi_next_day' is missing.
# NOTE(review): all other columns (including 'city' and the same-day 'aqi')
# stay in X; this presumes they are numeric — confirm.
X = train_data.drop(columns=["date"]).dropna()

In [None]:
# pop() removes the label column from X in place and returns it, so X is left
# holding only the model inputs.
y = X.pop("aqi_next_day")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regressor with scikit-learn's default hyperparameters,
# fitted on every available row (no hold-out split is made in this notebook).
gb = GradientBoostingRegressor()
gb.fit(X, y)

In [None]:
X.iloc[0]

In [None]:
gb.predict(X.iloc[0].values.reshape(1, -1))

In [None]:
y.iloc[0]

In [None]:
gb.predict(X.iloc[5].values.reshape(1, -1))

In [None]:
y.iloc[5]

In [None]:
preds = gb.predict(X.head(4))

In [None]:
preds[1]

In [None]:
# NOTE(review): scratch cell. `d` is not used anywhere else in the visible
# notebook, and pandas is already imported above.
d = {1: 5, 2:10, 3:15}

import pandas as pd

# Two-row example frame of integer AQI values indexed by city name.
df = pd.DataFrame(data=[15, 18], index=["kyiv", "stockholm"], columns=["AQI"], dtype=int)
df

## <span style='color:#ff5f27'>👮🏼‍♀️ Model Registry</span>

In [None]:
mr = project.get_model_registry()

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Rebuild the schema for the sklearn model from the frames it was fitted on:
# X holds the inputs, y the 'aqi_next_day' label. This overwrites the
# `model_schema` created for the PyTorch model earlier.
input_schema = Schema(X)
output_schema = Schema(y)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

model_schema.to_dict()

In [None]:
import joblib

joblib.dump(gb, 'model.pkl')

In [None]:
# Coefficient of determination (R^2) of the fitted regressor on the data it
# was trained on. The notebook makes no hold-out split, so this is an
# optimistic, training-set figure, and the metric name says so.
r2_train = float(gb.score(X, y))

model = mr.sklearn.create_model(
    name="gradient_boost_model",
    # Report a measured regression metric. The previous entry was a hard-coded
    # string "0.5" labelled as F1, a classification metric that does not apply
    # to a regressor.
    metrics={"r2_train": r2_train},
    description="Gradient Boost Regressor.",
    input_example=X.sample(),
    model_schema=model_schema
)

# Upload the serialized estimator written by joblib.dump above.
model.save('model.pkl')