[Dataset](https://www.kaggle.com/competitions/playground-series-s4e6/overview), accuracy metric

[Dataset description](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success)

In [54]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch import nn
from tqdm.notebook import tqdm

In [55]:
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

mlflow.set_tracking_uri(uri="http://127.0.0.1:8000")

In [56]:
df = pd.read_csv("../code/datasets/train.csv", index_col=0)
target = df["Target"].values
data = df.drop(columns=["Target"])

In [57]:
categorial_cols = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
]

In [58]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def preprocess(dataset, encoder, scaler, cols):
    dataset = dataset.copy()

    data_cat = dataset[categorial_cols]
    data_nc = dataset.drop(columns=categorial_cols)

    data_cat_encoded = encoder.fit_transform(data_cat)
    data_nc_scaled = scaler.fit_transform(data_nc)

    data_cat_encoded_df = pd.DataFrame(
        data_cat_encoded, columns=encoder.get_feature_names_out(cols)
    )
    data_nc_scaled_df = pd.DataFrame(data_nc_scaled, columns=data_nc.columns)

    final_data = pd.concat([data_cat_encoded_df, data_nc_scaled_df], axis=1)

    return final_data

In [None]:
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

one_hot_encoder = OneHotEncoder(
    sparse_output=False, drop="first", handle_unknown="ignore"
)
scaler = StandardScaler()

data_prec = preprocess(
    dataset=data, encoder=one_hot_encoder, scaler=scaler, cols=categorial_cols
)
data_prec

In [60]:
def split_data(data, test_size=0.2, seed=42):
    np.random.seed(seed)
    indices = np.random.permutation(len(data))
    test_split = int(len(data) * test_size)

    train_indices, test_indices = indices[test_split:], indices[:test_split]
    return train_indices, test_indices

In [61]:
batch_size = 128
test_size = 0.2
seed = 42

In [62]:
from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset

X = torch.tensor(data_prec.values, dtype=torch.float32)
y = torch.tensor(target_encoded, dtype=torch.int64)

processed_dataset = TensorDataset(X, y)

train_idx, test_idx = split_data(data=X, test_size=test_size, seed=seed)

train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(test_idx)

train_loader = DataLoader(
    processed_dataset, batch_size=batch_size, sampler=train_sampler
)
val_loader = DataLoader(processed_dataset, batch_size=batch_size, sampler=val_sampler)

In [None]:
X.shape

In [64]:
class BasicNet(nn.Module):
    def __init__(self):
        super(BasicNet, self).__init__()

        layer = 512
        self.layers = nn.Sequential(
            nn.Linear(280, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, layer),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(layer),
            nn.Linear(layer, 3),
            nn.ReLU(inplace=True),
        )

        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(
                    m.weight, mode="fan_out", nonlinearity="relu"
                )
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.layers(x)
        return x

In [65]:
def train(
    model,
    optimizer,
    loss_fn,
    train_loader,
    val_loader,
    writer=None,
    epochs=1,
    device="cpu",
    model_path="best.pt",
    scheduler=None,
    tolerance=-1,
    tolerance_delta=1e-4,
):
    best = 0.0

    not_improving = 0
    last_loss = None

    # iterating over epochs
    for epoch in range(epochs):
        # training loop description
        train_loop = tqdm(
            enumerate(train_loader, 0), total=len(train_loader), desc=f"Epoch {epoch}"
        )
        model.train()
        train_loss = 0.0

        # iterate over dataset
        for data in train_loop:
            # Write your code here
            # Move data to a device, do forward pass and loss calculation, do backward pass and run optimizer
            data = data[1]
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()

            output = model(inputs)

            loss = loss_fn(output, labels)
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            train_loop.set_postfix({"loss": loss.item()})

            if scheduler:
                scheduler.step()

        mlflow.log_metric("loss", f"{train_loss / len(train_loader):6f}", step=epoch)

        # Validation
        correct = 0
        total = 0

        with torch.no_grad():
            model.eval()  # evaluation mode
            val_loop = tqdm(enumerate(val_loader, 0), total=len(val_loader), desc="Val")
            for data in val_loop:
                data = data[1]
                inputs, labels = data[0].to(device), data[1].to(device)
                # Write your code here
                output = model(inputs)
                loss = loss_fn(output, labels).item()

                # Get predictions and compare them with labels
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += len(labels)

                val_loop.set_postfix({"acc": correct / total})

            val_acc = correct / total
            mlflow.log_metric(
                "validation loss", f"{loss / len(val_loader):6f}", step=epoch
            )
            mlflow.log_metric("validation accuracy", f"{val_acc:2f}", step=epoch)
            if val_acc > best:
                torch.save(model.parameters, "model_best.pt")
                torch.save(optimizer.state_dict(), "opimizer.pt")
                best = correct / total
        if epoch != 0:
            if abs(train_loss - last_loss) < tolerance_delta:
                not_improving += 1
                if not_improving == tolerance:
                    print("Stop due to early reaching tolerance_delta")
                    break
            else:
                not_improving = 0
        last_loss = train_loss

    print(best)

In [None]:
def overfit_single_batch(
    train_loader,
    model,
    loss_fn,
    device="cpu",
    epochs=1,
    batch_size=1,
    lr=1e-2,
):
    data_iter = iter(train_loader)
    single_batch = next(data_iter)
    inputs, labels = single_batch[0].to(device), single_batch[1].to(device)

    single_batch_loader = DataLoader(
        list(zip(inputs, labels)), batch_size=batch_size, shuffle=True
    )

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch in range(epochs):
        train_loop = tqdm(
            enumerate(single_batch_loader, 0),
            total=len(single_batch_loader),
            desc=f"Epoch {epoch}",
        )
        train_loss = 0.0
        for i, (batch_inputs, batch_labels) in train_loop:
            optimizer.zero_grad()

            output = model(inputs)
            loss = loss_fn(output, labels)

            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            train_loop.set_postfix({"loss": loss.item()})

        print(f"Epoch {epoch} Training Loss: {train_loss / len(single_batch_loader)}")


overfit_single_batch(
    model=BasicNet(),
    loss_fn=nn.CrossEntropyLoss(),
    train_loader=train_loader,
    device="mps",
    epochs=5,
    batch_size=1,
    lr=1e-2,
)

In [67]:
device = "cpu"

epochs = 1

tolerance = 7
tolerance_delta = 1e-4

model_name = "basic_model"
model_alias = "production"

In [68]:
# import mlflow.models.container


# C = mlflow.models.container
# C._install_pyfunc_deps

In [69]:
# export MLFLOW_TRACKING_URI=http://localhost:8090
# mlflow models generate-dockerfile --model-uri "models:/basic_model@production" --env-manager virtualenv -d mlflow_api --install-mlflow

# docker build -t mlflow_service mlflow_api
# docker run --rm -p 5152:8010 mlflow_service

# mlflow server -h localhost -p 8090

In [70]:
model = BasicNet()
model.to(device=device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(), lr=1)

In [None]:
client = mlflow.MlflowClient(registry_uri="127.0.0.1:8000")

with mlflow.start_run() as run:
    train(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_function,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        epochs=epochs,
        tolerance=tolerance,
        tolerance_delta=tolerance_delta,
    )

    data_iter = iter(train_loader)
    inputs, labels = next(data_iter)

    signature = infer_signature(
        inputs.numpy(), model(inputs.to(device)).detach().cpu().numpy()
    )
    model_info = mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="models",
        signature=signature,
        input_example=inputs.numpy(),
        registered_model_name=model_name,
    )

    client.set_registered_model_alias(
        model_name, model_alias, model_info.registered_model_version
    )

    model_ver = client.get_model_version_by_alias(model_name, model_alias)
    torch_model = mlflow.pytorch.load_model(f"models:/{model_name}@{model_alias}")
    torch_model.to(device)

    # Change to calling validation function for whole validation dataset
    val_iter = iter(val_loader)
    X_val, y_val = next(val_iter)

    X_val = X_val.to(device)
    y_val = y_val.to(device)

    raw_predictions = torch_model(X_val)
    pred = raw_predictions.argmax(dim=1, keepdim=True)
    predictions = pred.eq(y_val.view_as(pred)).sum().item()

    print(predictions)

    eval_data = pd.DataFrame(X_val.cpu())
    eval_data["Target"] = y_val.cpu()
    eval_data["predictions"] = predictions
    print(eval_data.shape)

    results = mlflow.evaluate(
        data=eval_data,
        model_type="classifier",
        targets="Target",
        predictions="predictions",
        evaluators=["default"],
    )

print(f"metrics:\n{results.metrics}")
print(f"artifacts:\n{results.artifacts}")

In [79]:
val_iter = iter(val_loader)
X_val, y_val = next(val_iter)

In [95]:
X_val[0].shape

torch.Size([280])

In [104]:
import requests
import json
import torch

MLFLOW_URL = "http://127.0.0.1:8080"
val_iter = iter(val_loader)
X_val, y_val = next(val_iter)

In [113]:
{"inputs": [X_val[0].tolist()]}

{'inputs': [[0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0

In [112]:


input_payload = {"inputs": [X_val[0].tolist()]}

try:
    response = requests.post(
        f"{MLFLOW_URL}/invocations",
        json=input_payload  # Correctly pass JSON payload
    )

    # Check if the request was successful (HTTP 200)
    if response.status_code == 200:
        # Parse the JSON response from the server
        prediction = response.json()
        print("Prediction:", prediction)
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)

except Exception as e:
    print(f"An error occurred while making the request: {e}")

Prediction: {'predictions': [[0.0, 0.0, 0.0]]}


In [None]:
try:
    # Check if the request was successful
    if response.status_code == 200:
        prediction = response.json()
        print("Prediction:", prediction)
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)

except Exception as e:
    print(f"An error occurred while making the request: {e}")

In [None]:
v = client.get_model_version_by_alias(model_name, model_alias)
pytorch_pyfunc = mlflow.pyfunc.load_model(f"models:/{model_name}@{model_alias}")

In [None]:
v = client.get_model_version_by_alias(model_name, model_alias)
model = mlflow.pytorch.load_model(f"models:/{model_name}@{model_alias}")

In [None]:
model

In [49]:
# from mlflow import MlflowClient

# client = mlflow.MlflowClient()

# # create "champion" alias for version 1 of model "example-model"
# client.set_registered_model_alias("example-model", "champion", 1)

# # reassign the "Champion" alias to version 2
# client.set_registered_model_alias("example-model", "Champion", 2)

# # get a model version by alias
# client.get_model_version_by_alias("example-model", "Champion")

# # delete the alias
# client.delete_registered_model_alias("example-model", "Champion")


# import mlflow.pyfunc

# model_name = "sk-learn-random-forest-reg-model"
# model_version = 1

# model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

# model.predict(data)


# import mlflow.pyfunc

# model_name = "sk-learn-random-forest-reg-model"
# alias = "champion"

# champion_version = mlflow.pyfunc.load_model(f"models:/{model_name}@{alias}")

# champion_version.predict(data)

In [None]:
import mlflow
import mlflow.onnx
import numpy as np
import onnx
import pandas as pd
import torch
from mlflow.models import infer_signature

with mlflow.start_run() as run:
    model_path = "model.onnx"

    train(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_function,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        epochs=epochs,
        tolerance=tolerance,
        tolerance_delta=tolerance_delta,
        model_path=model_path,
    )

    data_iter = iter(train_loader)
    inputs, labels = next(data_iter)

    onnx_model = onnx.load(model_path)
    signature = infer_signature(
        inputs.numpy(), model(inputs.to(device)).detach().cpu().numpy()
    )
    model_info = mlflow.onnx.log_model(
        onnx_model=onnx_model,
        artifact_path="onnx_model",
        signature=signature,
        input_example=inputs.numpy(),
        registered_model_name="onnx_model",
    )

    onnx_pyfunc = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

    val_iter = iter(val_loader)
    X_val, y_val = next(val_iter)

    raw_predictions = onnx_pyfunc.predict(X_val.numpy())
    predictions = np.argmax(raw_predictions, axis=1)

    print(predictions)

    eval_data = pd.DataFrame(X_val.numpy())
    eval_data["label"] = y_val.numpy()
    eval_data["predictions"] = predictions
    print(eval_data.shape)

    results = mlflow.evaluate(
        data=eval_data,
        model_type="classifier",
        targets="label",
        predictions="predictions",
        evaluators=["default"],
    )

print(f"metrics:\n{results.metrics}")
print(f"artifacts:\n{results.artifacts}")