In [139]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import pandas as pd
import os
import torch.nn as nn
import pytorch_lightning as pl#used as the replacement of nn.module
from mlflow.models import infer_signature
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [140]:
ROOT_DIR = r"C:\Users\User\Downloads\60 days of python\day-36(Experiment tracking using MLFlow"
DATA_DIR = os.path.join(ROOT_DIR, "data")
dataset_file = os.path.join(DATA_DIR, "digit_train.csv")

ARTIFACT_FOLDER_NAME = "model" # Directory to save models
SOURCE_CODE_PATH = os.path.join(
        os.getcwd(),#getcwd means get current woriking directory
        "day-36.ipynb",
) # Our current notebook file path means jei notebook e amra kaj korbo
print(SOURCE_CODE_PATH)
SOURCE_CODE_ARTIFACT = "trainer.ipynb" # Filename to save our source code means day-36.ipynb notebook e kar korar por file ta trainer.ipynb file hisebe save korbo

C:\Users\User\Downloads\60 days of python\day-36(Experiment tracking using MLFlow\day-36.ipynb


In [141]:
""" Hyperparameters: Parameters that are not for neural networks but use to train
models. """
EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 0.001

In [142]:
""" Seed """
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# **Data Pipeline**

In [143]:
class DigitDataset(Dataset):
    def __init__(self, file_path, transform):
        self.data = pd.read_csv(file_path)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """ Get a sample from the dataset. """
        example = self.data.iloc[idx]
        pixels = example.values[1:].astype('float32')
        pixels /= 255.0
        label = int(example.values[0])

        """
        pixels: B x C x H x W
        """
        pixels = torch.tensor(pixels).reshape(28, 28).unsqueeze(0)
        label = torch.tensor(label)

        if self.transform:
            pixels = self.transform(pixels)

        return pixels, label

In [144]:
pixel_transformation = transforms.Compose([
    transforms.Normalize(
        mean=torch.Tensor([0.1307]),
        std=torch.Tensor([0.3081])
    )
])

In [145]:
dataset = DigitDataset(
    file_path=dataset_file,
    transform=pixel_transformation
)

In [146]:
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset=dataset,
    lengths=[train_size, val_size, test_size]
)

In [147]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)#shuffle means The dataset will be randomly shuffled at the start of every epoch.
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [148]:
for pixels, labels in train_loader:
    print(pixels.shape)
    print(labels.shape)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])


# **Model**

In [149]:
# Change 1: Replaced nn.Module with pl.Lightning module
#day-36 e only model ta k pytorch lightning e convert kora hoise to reduce complexity of codes..baki sob kisu same ager moto
class DigitClassifier(pl.LightningModule):
    def __init__(self):
        super(DigitClassifier, self).__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        return x

    def configure_optimizers(self):#ekhane jusst functionta bole dewa hoise. baki kaj like back propagation weight update egulo pytorch lighning nije handle kore ney
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        return optimizer

    def training_step(self, batch, batch_idx):
        pixels, labels = batch
        pixels = pixels.to(device)
        labels = labels.to(device)

        output = self.forward(pixels)
        loss = self.loss_fn(output, labels)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        pixels, labels = batch
        pixels = pixels.to(device)
        labels = labels.to(device)
        output = self.forward(pixels)
        loss = self.loss_fn(output, labels)
        acc = (output.argmax(dim=1) == labels).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_accuracy', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        pixels, labels = batch
        pixels = pixels.to(device)
        labels = labels.to(device)
        output = self.forward(pixels)
        loss = self.loss_fn(output, labels)
        acc = (output.argmax(dim=1) == labels).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_accuracy', acc, prog_bar=True)

In [150]:
model = DigitClassifier()

In [151]:
early_stopping=EarlyStopping(
    monitor='val_loss',#validation loss ta monitor korbo j etar loss er kono improvement hocche kina .Should match with the validation step log key
    patience=2,# means ami koto epoch porjonto check korbo improvement hocche kina
    verbose=True,#2 ta epoch e jodi validation loss improve na kore tahole massage dewar jonno bolbe verbose
)
checkpoint_callback=ModelCheckpoint(
    monitor='val_accuracy',#Should match with the validation step log key
    save_top_k=1,#saves best 1 models
    mode='max',#max means maximum validation accuracy
)
checkpoint_path=os.path.join(
    os.getcwd(),"checkpoints","best_model.pth"
)

In [152]:
print(checkpoint_path)

C:\Users\User\Downloads\60 days of python\day-36(Experiment tracking using MLFlow\checkpoints\best_model.pth


# **Train**

In [153]:
trainer=pl.Trainer(
    max_epochs=EPOCHS,
    callbacks=[checkpoint_callback, early_stopping]#Runs these checkpoints after each epochs by default
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [154]:
# trainer.fit(
#     model=model,
#     train_dataloaders=train_loader,
#     val_dataloaders=val_loader
# )

In [155]:
# trainer.test(
#     model=model,
#     dataloaders=test_loader
#
# )

In [156]:
best_model_path=checkpoint_callback.best_model_path
print(best_model_path)




In [157]:
"""ans ta bujacche j epcoh 2 er step number(batch number) 1380 er jonno j model ta train hoise setai chilo sobcheye best model"""

'ans ta bujacche j epcoh 2 er step number(batch number) 1380 er jonno j model ta train hoise setai chilo sobcheye best model'

In [158]:
# """check point er path theke ekhn amra sohojei model ta k load krte pari """
# best_model=DigitClassifier.load_from_checkpoint(best_model_path)#ei load_from_checkpoint method ta kintu amra create korinai.eta mainly paisi LightningModule theke


In [159]:
score=trainer.test(
    model=best_model,
    dataloaders=test_loader

)
print(type(score))
print(score)


C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing: |                                                                                                    …

<class 'list'>
[{'test_loss': 0.1524478644132614, 'test_accuracy': 0.953499436378479}]


# **Experiment Tracking with MlFlow**

In [162]:
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()
client.restore_experiment("Saikats MLOPs experiment")


MlflowException: Could not find deleted experiment with ID Saikats MLOPs experiment

In [165]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

client = MlflowClient()

experiments = client.search_experiments(view_type=ViewType.ALL)
for e in experiments:
    print(e.experiment_id, e.name, e.lifecycle_stage)


347303341436223307 Saikats MLOPs experiment deleted
561419725598425326 Saikats MLOPs active
0 Default active


In [161]:
import mlflow
mlflow.set_experiment(experiment_name="Saikats MLOPs experiment")

MlflowException: Cannot set a deleted experiment 'Saikats MLOPs experiment' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.

In [135]:
"""What is artifact?
Think of artifacts as saved results of your training—things you want to keep.
Artifact = Any file created during ML experiments (models, plots, metrics, datasets, logs)."""

'What is artifact?\nThink of artifacts as saved results of your training—things you want to keep.\nArtifact = Any file created during ML experiments (models, plots, metrics, datasets, logs).'

In [136]:
with mlflow.start_run():#mlflow er prottekta experiment k bola hoy run.

    # Log Hyperparameters
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("epochs", EPOCHS)
    # Train the model
    trainer.fit(
        model=model,
        train_dataloaders=train_loader,
        val_dataloaders=val_loader
    )

    # Get the best model
    best_model_path = checkpoint_callback.best_model_path
    best_model = DigitClassifier.load_from_checkpoint(best_model_path)

    # Evaluate the model on the test set
    evaluation_score = trainer.test(
        best_model,
        dataloaders=test_loader,
    )
    # print(evaluation_score) output:[{'test_loss': 0.14488188922405243, 'test_accuracy': 0.9560387134552002}]
    mlflow.log_metric("test_accuracy", evaluation_score[0]["test_accuracy"])#evave evaluation gulo log akare save kore rakhte pari
    mlflow.log_metric("test_loss", evaluation_score[0]["test_loss"])
    pixels_batch = next(iter(test_loader))[0]
    pixels_batch = pixels_batch.cpu().numpy()
    signature = infer_signature(model, pixels_batch)
    mlflow.pytorch.log_model(
        pytorch_model=best_model,
        artifact_path=ARTIFACT_FOLDER_NAME,
        input_example=pixels_batch,
        signature=signature
    )

    # Log the source code
    import shutil
    shutil.copyfile(SOURCE_CODE_PATH, SOURCE_CODE_ARTIFACT)#lastly amader source code ta lage j kon code tar karone ei change ta hoise
    mlflow.log_artifact(SOURCE_CODE_ARTIFACT)


  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | loss_fn | CrossEntropyLoss | 0      | train
1 | fc1     | Linear           | 100 K  | train
2 | fc2     | Linear           | 8.3 K  | train
3 | fc3     | Linear           | 650    | train
-----------------------------------------------------
109 K     Trainable params
0         Non-trainable params
109 K     Total params
0.438     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                                                            …

C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Metric val_loss improved. New best score: 0.195


Validation: |                                                                                                 …

Metric val_loss improved by 0.076 >= min_delta = 0.0. New best score: 0.120


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |                                                                                                    …



Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  "inputs": [
    [
      [
        [
          .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: setting an array element with a sequence.


In [138]:
print(f"mlflow ui --backend-store-uri {mlflow.get_tracking_uri()}")

mlflow ui --backend-store-uri file:///C:/Users/User/Downloads/60%20days%20of%20python/day-36%28Experiment%20tracking%20using%20MLFlow/mlruns
