In [2]:
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

registry_uri="http://127.0.0.1:8090"
mlflow.set_tracking_uri(uri=registry_uri)

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from typing import List

import joblib
import os


def load_data(train_file: str, val_file: str):
    train_data = pd.read_csv(train_file)
    val_data = pd.read_csv(val_file)
    return train_data, val_data


def fit_encoders(train_data: pd.DataFrame, target_name: str, categorial_cols: list):
    target = train_data[target_name]
    data = train_data.drop(columns=target_name)

    one_hot_encoder = OneHotEncoder(
        sparse_output=False, drop="first", handle_unknown="ignore"
    )
    scaler = StandardScaler()
    label_encoder = LabelEncoder()

    one_hot_encoder = one_hot_encoder.fit(data[categorial_cols])
    scaler = scaler.fit(data.drop(columns=categorial_cols))
    label_encoder = label_encoder.fit(target)

    return one_hot_encoder, scaler, label_encoder


def preprocess(dataset, encoder, scaler, cat_columns: List[str]):
    dataset = dataset.copy()

    data_cat = dataset[cat_columns]
    data_nc = dataset.drop(columns=cat_columns)

    data_cat_encoded = encoder.transform(data_cat)
    data_nc_scaled = scaler.transform(data_nc)

    data_cat_encoded_df = pd.DataFrame(
        data_cat_encoded, columns=encoder.get_feature_names_out(cat_columns)
    )
    data_nc_scaled_df = pd.DataFrame(data_nc_scaled, columns=data_nc.columns)

    final_data = pd.concat([data_cat_encoded_df, data_nc_scaled_df], axis=1)

    return final_data


def save_encoders(one_hot_encoder, scaler, label_encoder, path="encoders"):
    os.makedirs(path, exist_ok=True)

    joblib.dump(one_hot_encoder, os.path.join(path, "one_hot_encoder.pkl"))
    joblib.dump(scaler, os.path.join(path, "scaler.pkl"))
    joblib.dump(label_encoder, os.path.join(path, "label_encoder.pkl"))


def load_encoders(path="encoders"):
    one_hot_encoder = joblib.load(os.path.join(path, "one_hot_encoder.pkl"))
    scaler = joblib.load(os.path.join(path, "scaler.pkl"))
    label_encoder = joblib.load(os.path.join(path, "label_encoder.pkl"))

    return one_hot_encoder, scaler, label_encoder

In [4]:
import torch
from torch import nn


class BasicNet(nn.Module):
    def __init__(self, input_size: int):
        super(BasicNet, self).__init__()

        layer = 512
        self.layers = nn.Sequential(
            nn.Linear(input_size, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, 3),
            nn.ReLU(inplace=True),
        )

        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(
                    m.weight, mode="fan_out", nonlinearity="relu"
                )
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.layers(x)
        return x

In [5]:
from typing import List

import mlflow
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import torch

from mlflow.pyfunc import PythonModel
from mlflow.models import set_model
import torch
from torch import nn


class ModelInferenceInterface(PythonModel):
    def __init__(
        self,
        input_size: int,
        one_hot_encoder: OneHotEncoder | None = None,
        scaler: StandardScaler | None = None,
        label_encoder: LabelEncoder | None = None,
        cat_columns: List[str] | None = None,
    ) -> None:
        super().__init__()
        self.one_hot_encoder = one_hot_encoder
        self.scaler = scaler
        self.label_encoder = label_encoder
        self.cat_columns = cat_columns
        self.model = BasicNet(input_size=input_size)

    def __process(self, data) -> pd.DataFrame:
        if (
            self.one_hot_encoder is not None
            and self.scaler is not None
            and self.cat_columns is not None
        ):
            return preprocess(
                dataset=data,
                encoder=self.one_hot_encoder,
                scaler=self.scaler,
                cat_columns=self.cat_columns,
            )
        else:
            return data

    def predict(self, X: pd.DataFrame):
        data = self.__process(X)
        with torch.no_grad():
            self.model.eval()
            outputs = self.model(torch.tensor(data.values, dtype=torch.float32))
            predictions = outputs.argmax(dim=1).numpy()

        return self.label_encoder.inverse_transform(predictions)

    def load_state_dict(self, state_dict):
        self.model.load_state_dict(state_dict)


set_model(ModelInferenceInterface(input_size=271))

In [6]:
import torch
from torch import nn
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

from typing import List
import os
# import mlflow
# import mlflow.pytorch
# from mlflow.models import infer_signature

# registry_uri = "127.0.0.1:8090"
# mlflow.set_tracking_uri(uri=registry_uri)


def train(
    model,
    optimizer,
    loss_fn,
    train_loader,
    val_loader,
    epochs=1,
    device="cpu",
    scheduler=None,
    tolerance=-1,
    tolerance_delta=1e-4,
):
    best = 0.0

    not_improving = 0
    last_loss = None

    for epoch in range(epochs):
        train_loop = tqdm(
            enumerate(train_loader, 0), total=len(train_loader), desc=f"Epoch {epoch}"
        )
        model.train()
        train_loss = 0.0

        for data in train_loop:
            data = data[1]
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()

            output = model(inputs)

            loss = loss_fn(output, labels)
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            train_loop.set_postfix({"loss": loss.item()})

            if scheduler:
                scheduler.step()

        mlflow.log_metric("loss", f"{train_loss / len(train_loader):6f}", step=epoch)

        correct = 0
        total = 0

        with torch.no_grad():
            model.eval()
            val_loop = tqdm(enumerate(val_loader, 0), total=len(val_loader), desc="Val")
            for data in val_loop:
                data = data[1]
                inputs, labels = data[0].to(device), data[1].to(device)

                output = model(inputs)
                loss = loss_fn(output, labels).item()

                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += len(labels)

                val_loop.set_postfix({"acc": correct / total})

            val_acc = correct / total
            mlflow.log_metric(
                "validation loss", f"{loss / len(val_loader):6f}", step=epoch
            )
            mlflow.log_metric("validation accuracy", f"{val_acc:2f}", step=epoch)
            if val_acc > best:
                torch.save(model.parameters, "model_best.pt")
                torch.save(optimizer.state_dict(), "opimizer.pt")
                best = correct / total
        if epoch != 0:
            if abs(train_loss - last_loss) < tolerance_delta:
                not_improving += 1
                if not_improving == tolerance:
                    print("Stop due to early reaching tolerance_delta")
                    break
            else:
                not_improving = 0
        last_loss = train_loss

    print(best)


def mlflow_training(
    train_file: str = "../data/processed/train_data.csv",
    val_file: str = "../data/processed/val_data.csv",
    test_file: str = "../data/processed/test_data.csv",
    target_name: str = "Target",
    model_name: str = "BasicModel",
    model_alias: str = "Champion",
    registry_uri: str = "127.0.0.1:8000",
    batch_size: int = 32,
    epochs: int = 5,
    categorical_cols: List | None = None,
):
    for file_path in [train_file, val_file]:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

    with mlflow.start_run() as run:
        train_data_, val_data_ = load_data(train_file, val_file)

        one_hot_encoder, scaler, label_encoder = fit_encoders(
            train_data_, target_name, categorical_cols
        )
        save_encoders(one_hot_encoder, scaler, label_encoder)

        # transofrms
        train_target = train_data_[target_name]
        val_target = val_data_[target_name]

        train_data = train_data_.drop(columns=[target_name])
        val_data = val_data_.drop(columns=[target_name])

        train_features = preprocess(
            train_data, one_hot_encoder, scaler, cat_columns=categorical_cols
        )
        train_target_encoded = label_encoder.transform(train_target)

        val_features = preprocess(
            val_data, one_hot_encoder, scaler, cat_columns=categorical_cols
        )

        val_target_encoded = label_encoder.transform(val_target)

        # dataloaders
        train_dataset = TensorDataset(
            torch.tensor(train_features.values, dtype=torch.float32),
            torch.tensor(train_target_encoded, dtype=torch.int64),
        )
        val_dataset = TensorDataset(
            torch.tensor(val_features.values, dtype=torch.float32),
            torch.tensor(val_target_encoded, dtype=torch.int64),
        )

        train_loader = DataLoader(train_dataset, batch_size=batch_size)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # model init + training
        input_size = train_features.values.shape[1]
        print(input_size)
        model = BasicNet(input_size=input_size)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss_function = nn.CrossEntropyLoss()

        train(
            model,
            optimizer,
            loss_function,
            train_loader,
            val_loader,
            device="cpu",
            epochs=epochs,
        )

        model_with_inference = ModelInferenceInterface(
            input_size,
            one_hot_encoder,
            scaler,
            label_encoder,
            cat_columns=categorical_cols,
        )
        model_with_inference.load_state_dict(model.state_dict())

        signature = infer_signature(
            train_data,
            model_with_inference.predict(train_data),
        )
        model_info = mlflow.pyfunc.log_model(
            artifact_path="models",
            python_model=model_with_inference,
            signature=signature,
            registered_model_name=model_name,
        )

        client = mlflow.MlflowClient(registry_uri=registry_uri)
        client.set_registered_model_alias(
            model_name, model_alias, model_info.registered_model_version
        )


mlflow_training(
    registry_uri=registry_uri,
    epochs=1,
    categorical_cols=[
        "Marital status",
        "Application mode",
        "Application order",
        "Course",
        "Daytime/evening attendance",
        "Previous qualification",
        "Nacionality",
        "Mother's qualification",
        "Father's qualification",
        "Mother's occupation",
        "Father's occupation",
        "Displaced",
        "Educational special needs",
        "Debtor",
        "Tuition fees up to date",
        "Gender",
        "Scholarship holder",
    ],
)



271


Epoch 0: 100%|██████████| 1913/1913 [00:10<00:00, 181.05it/s, loss=0.871]
Val: 100%|██████████| 479/479 [00:00<00:00, 561.86it/s, acc=0.443]


0.4427236489577207


Registered model 'BasicModel' already exists. Creating a new version of this model...
Created version '2' of model 'BasicModel'.


NameError: name 'train_data' is not defined

In [7]:
# def mlflow_training():
#     device = "mps"

#     epochs = 1

#     tolerance = 7
#     tolerance_delta = 1e-4

#     model_name = "basic_model"
#     model_alias = "latest"

#     client = mlflow.MlflowClient()
#     mlflow.set_tracking_uri(uri="http://127.0.0.1:8090")

#     overfit_single_batch(
#         model=BasicNet(),
#         loss_fn=nn.CrossEntropyLoss(),
#         train_loader=train_loader,
#         device="mps",
#         epochs=5,
#         batch_size=1,
#         lr=1e-2,
#     )
#     client = mlflow.MlflowClient()
#     with mlflow.start_run() as run:
#         train(
#             model=model,
#             optimizer=optimizer,
#             loss_fn=loss_function,
#             train_loader=train_loader,
#             val_loader=val_loader,
#             device=device,
#             epochs=epochs,
#             tolerance=tolerance,
#             tolerance_delta=tolerance_delta,
#         )
#         data_iter = iter(train_loader)
#         inputs, labels = next(data_iter)
#         signature = infer_signature(
#             inputs.numpy(), model(inputs.to(device)).detach().cpu().numpy()
#         )
#         model_info = mlflow.pytorch.log_model(
#             pytorch_model=model,
#             artifact_path="models",
#             signature=signature,
#             input_example=inputs.numpy(),
#             registered_model_name=model_name,
#         )
#         client.set_registered_model_alias(
#             model_name, model_alias, model_info.registered_model_version
#         )
#         model_ver = client.get_model_version_by_alias(model_name, model_alias)
#         torch_model = mlflow.pytorch.load_model(f"models:/{model_name}@{model_alias}")
#         torch_model.to(device)

#         val_iter = iter(val_loader)
#         X_val, y_val = next(val_iter)

#         X_val = X_val.to(device)
#         y_val = y_val.to(device)

#         raw_predictions = torch_model(X_val)
#         pred = raw_predictions.argmax(dim=1, keepdim=True)
#         predictions = pred.eq(y_val.view_as(pred)).sum().item()

#         print(predictions)

#         eval_data = pd.DataFrame(X_val.cpu())
#         eval_data["label"] = y_val.cpu()
#         eval_data["predictions"] = predictions
#         print(eval_data.shape)

#         results = mlflow.evaluate(
#             data=eval_data,
#             model_type="classifier",
#             targets="label",
#             predictions="predictions",
#             evaluators=["default"],
#         )