# **Experiment Track with using MLFlow**

In [110]:
from tkinter.scrolledtext import example

from sympy import print_tree

''' Import all important libraries '''
import os
import pandas as pd
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, random_split, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import pytorch_lightning as pl
import mlflow
from mlflow.models import infer_signature

In [111]:
Device = (torch.device('mps') if torch.mps.is_available() else torch.device('cpu'))
print(Device)

mps


In [112]:
''' Hyperparameters '''
torch.manual_seed(40)
Batch_size = 32
Epochs = 5
Learning_Rate = 0.001

In [113]:
''' Load data and define Source code path '''
Root_path = '/Users/mahadiur/Desktop/Experiment Track Using MLFlow/Data'
dataset_path = os.path.join(Root_path, 'DigitDataset.csv')

saved_model_dir = 'models'
source_code_path = os.path.join(
    os.getcwd(),
    'Experiment_Track_Using_MLFlow.ipynb'
) # current file path

source_code = 'trainer.ipynb'

# **Data Pipeline**

In [114]:
digit_data = pd.read_csv(dataset_path)
example = digit_data.iloc[0]
pixel = example.values[1:]
label = int(example.values[0:1])
print(pixel.shape)
print(label)

(784,)
1


  label = int(example.values[0:1])


In [115]:
class DataPipeline(Dataset):
    def __init__(self, data_path, transform=None):
        super().__init__()
        self.data = pd.read_csv(data_path)
        self.transformation = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data.iloc[idx]
        pixels = example.values[1:].astype('float32')
        pixels /= 255
        labels = int(example[0:1])

        pixels = torch.tensor(pixels).reshape(28, 28).unsqueeze(0)
        labels = torch.tensor(labels)


        if self.transformation:
            pixels = self.transformation(pixels)

        return  pixels, labels

In [116]:
Transformation = transforms.Compose([
    transforms.Normalize(
        mean=(torch.tensor([0.1307])),
        std=(torch.tensor([0.3081])),
    )
])

In [117]:
dataset = DataPipeline(
    dataset_path,
    Transformation
)
print(len(dataset))

42000


# **Split Train, Test & Validation**

In [118]:
Train_size = int(0.7 * len(dataset))
Validation_size = int(0.15 * len(dataset))
Test_size = len(dataset) - Train_size - Validation_size

Training_dataset, Validation_dataset, Test_dataset = random_split(
    dataset=dataset,
    lengths=[Train_size, Validation_size, Test_size],
)

print(len(Training_dataset))
print(len(Validation_dataset))
print(len(Test_dataset))

29399
6300
6301


# **Dataloader for Train, Test & Validation**

In [119]:
Train_Dataloader = DataLoader(
    dataset=Training_dataset,
    batch_size=Batch_size,
    shuffle=True,

)

Validation_Dataloader = DataLoader(
    dataset=Validation_dataset,
    batch_size=Batch_size,
    shuffle=False,
)

Test_Dataloader = DataLoader(
    dataset=Test_dataset,
    batch_size=Batch_size,
    shuffle=False,

)

In [120]:
for pixels, labels in Train_Dataloader:
    print(pixels.shape)
    print(labels.shape)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])


  labels = int(example[0:1])


# **DigitClassifiar class**

In [121]:
class DigitClass(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.layer1 = nn.Linear(28 * 28, 128)
        self.layer2 = nn.Linear(128, 32)
        self.layer3 = nn.Linear(32, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.layer3(x)
        return x

    # Gradient Decent
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=Learning_Rate)
        return optimizer

    # Training Step
    def training_step(self, batch, batch_idx):
        pixels, labels = batch
        pixels = pixels.to(Device)
        labels = labels.to(Device)
        outputs = self.forward(pixels)
        loss = self.criterion(outputs, labels)
        self.log('Train_loss', loss)
        return loss

    # Validation Step
    def validation_step(self, batch, batch_idx):
        pixels, labels = batch
        pixels = pixels.to(Device)
        labels = labels.to(Device)
        outputs = self.forward(pixels)
        loss = self.criterion(outputs, labels)
        accuracy = (torch.argmax(outputs, dim=1) == labels).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_accuracy', accuracy, prog_bar=True)

    # Test Step
    def test_step(self, batch, batch_idx):
        pixels, labels= batch
        pixels = pixels.to(Device)
        labels = labels.to(Device)
        outputs = self.forward(pixels)
        loss = self.criterion(outputs, labels)
        accuracy = (torch.argmax(outputs, dim=1) == labels).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_accuracy', accuracy, prog_bar=True)



In [122]:
Model = DigitClass().to(Device)

In [123]:
Early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=True,
)

checkpoints_callback = ModelCheckpoint(
    monitor='val_accuracy',
    save_top_k=1,
    mode='max',
)

checkpoints_path = os.path.join(
    os.getcwd(),'checkpoints','Best_Model.pth'
)

# **Train**

In [124]:
Training = pl.Trainer(
    max_epochs=Epochs,
    callbacks=[checkpoints_callback, Early_stopping],
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [125]:
Training.fit(
    model=Model,
    train_dataloaders=Train_Dataloader,
    val_dataloaders=Validation_Dataloader,
)


  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | criterion | CrossEntropyLoss | 0      | train
1 | layer1    | Linear           | 100 K  | train
2 | layer2    | Linear           | 4.1 K  | train
3 | layer3    | Linear           | 330    | train
4 | relu      | ReLU             | 0      | train
-------------------------------------------------------
104 K     Trainable params
0         Non-trainable params
104 K     Total params
0.420     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


                                                                            

/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
  labels = int(example[0:1])
/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 919/919 [00:04<00:00, 220.48it/s, v_num=4]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/197 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/197 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/197 [00:00<00:00, 274.05it/s][A
Validation DataLoader 0:   1%|          | 2/197 [00:00<00:00, 250.79it/s][A
Validation DataLoader 0:   2%|▏         | 3/197 [00:00<00:00, 240.41it/s][A
Validation DataLoader 0:   2%|▏         | 4/197 [00:00<00:00, 238.79it/s][A
Validation DataLoader 0:   3%|▎         | 5/197 [00:00<00:00, 239.80it/s][A
Validation DataLoader 0:   3%|▎         | 6/197 [00:00<00:00, 240.01it/s][A
Validation DataLoader 0:   4%|▎         | 7/197 [00:00<00:00, 241.49it/s][A
Validation DataLoader 0:   4%|▍         | 8/197 [00:00<00:00, 234.45it/s][A
Validation DataLoader 0:   5%|▍         | 9/197 [00:00<00:00, 233.58it/s][A
Validation DataLoader 0:   5%|▌         | 10/197 [00:00<00:00, 232

Metric val_loss improved. New best score: 0.178


Epoch 1: 100%|██████████| 919/919 [00:04<00:00, 215.53it/s, v_num=4, val_loss=0.178, val_accuracy=0.945]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/197 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/197 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/197 [00:00<00:00, 354.73it/s][A
Validation DataLoader 0:   1%|          | 2/197 [00:00<00:00, 277.93it/s][A
Validation DataLoader 0:   2%|▏         | 3/197 [00:00<00:00, 264.55it/s][A
Validation DataLoader 0:   2%|▏         | 4/197 [00:00<00:00, 256.74it/s][A
Validation DataLoader 0:   3%|▎         | 5/197 [00:00<00:00, 242.86it/s][A
Validation DataLoader 0:   3%|▎         | 6/197 [00:00<00:00, 240.68it/s][A
Validation DataLoader 0:   4%|▎         | 7/197 [00:00<00:00, 231.96it/s][A
Validation DataLoader 0:   4%|▍         | 8/197 [00:00<00:00, 231.96it/s][A
Validation DataLoader 0:   5%|▍         | 9/197 [00:00<00:00, 232.20it/s][A
Validation DataLoader 0:   5%|

Metric val_loss improved by 0.030 >= min_delta = 0.0. New best score: 0.148
`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 919/919 [00:05<00:00, 177.45it/s, v_num=4, val_loss=0.148, val_accuracy=0.956]


In [126]:
best_model_path = checkpoints_callback.best_model_path
print(best_model_path)
best_model = DigitClass.load_from_checkpoint(best_model_path)

/Users/mahadiur/Desktop/Experiment Track Using MLFlow/Notebook/lightning_logs/version_4/checkpoints/epoch=1-step=1838.ckpt


In [127]:
score = Training.test(
    model=best_model,
    dataloaders=Test_Dataloader,
)

/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing DataLoader 0:  23%|██▎       | 46/197 [00:00<00:00, 297.55it/s]

  labels = int(example[0:1])


Testing DataLoader 0: 100%|██████████| 197/197 [00:00<00:00, 299.59it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.9584193229675293
        test_loss           0.13163208961486816
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


# **Use MLFlow for track experiment**

In [128]:
''' Set Experiment name '''
mlflow.set_experiment('Deep_Learning')

<Experiment: artifact_location='file:///Users/mahadiur/Desktop/Experiment%20Track%20Using%20MLFlow/Notebook/mlruns/414246106038771448', creation_time=1756301445107, experiment_id='414246106038771448', last_update_time=1756301445107, lifecycle_stage='active', name='Deep_Learning', tags={}>

In [129]:
with mlflow.start_run():
    # Save Model Hyperparameters
    mlflow.log_param('Learning Rate', Learning_Rate)
    mlflow.log_param('Epochs', Epochs)
    mlflow.log_param('Batch Size', Batch_size)

    # Training Model
    Training.fit(
        model=Model,
        train_dataloaders=Train_Dataloader,
        val_dataloaders=Validation_Dataloader,
    )

    # Save Best Model
    best_model_path = checkpoints_callback.best_model_path
    print(best_model_path)
    best_model = DigitClass.load_from_checkpoint(best_model_path)

    # Evaluate Model
    score = Training.test(
        model=best_model,
        dataloaders=Test_Dataloader,
    )

    # Save Model Test loss & Test Accuracy
    mlflow.log_metric('test_accuracy', score[0]['test_accuracy'])
    mlflow.log_metric('test_loss', score[0]['test_loss'])

    # Save Model
    pixels_batch = next(iter(Test_Dataloader))[0]
    pixels_batch = pixels_batch.cpu().numpy()

    signature = infer_signature(Model, pixels_batch)

    mlflow.pytorch.log_model(
        pytorch_model=best_model,
        artifact_path=saved_model_dir,
        input_example=pixels_batch,
        signature=signature,
    )

    import shutil
    shutil.copyfile(source_code_path,source_code)
    mlflow.log_artifact(source_code)

/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:701: Checkpoint directory /Users/mahadiur/Desktop/Experiment Track Using MLFlow/Notebook/lightning_logs/version_4/checkpoints exists and is not empty.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | criterion | CrossEntropyLoss | 0      | train
1 | layer1    | Linear           | 100 K  | train
2 | layer2    | Linear           | 4.1 K  | train
3 | layer3    | Linear           | 330    | train
4 | relu      | ReLU             | 0      | train
-------------------------------------------------------
104 K     Trainable params
0         Non-trainable params
104 K     Total params
0.420     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


                                                                            

  labels = int(example[0:1])
`Trainer.fit` stopped: `max_epochs=2` reached.


/Users/mahadiur/Desktop/Experiment Track Using MLFlow/Notebook/lightning_logs/version_4/checkpoints/epoch=1-step=1838.ckpt
Testing DataLoader 0: 100%|██████████| 197/197 [00:00<00:00, 258.26it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.9584193229675293
        test_loss           0.13163208961486816
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  "inputs": [
    [
      [
        [
          .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: setting an array element with a sequence.


In [130]:
''' Launch the experiment '''
print(f'mlflow ui --backend-store-uri {mlflow.get_tracking_uri()}')

mlflow ui --backend-store-uri file:///Users/mahadiur/Desktop/Experiment%20Track%20Using%20MLFlow/Notebook/mlruns


### **Thank You**