# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping


# Hyperparameters

In [2]:
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.003

# Set Random Seed and Find Device

In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x1f533d3ef30>

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Read Dataset

In [5]:
dataset_file = "../Datasets/digit_train.csv"
data = pd.read_csv(dataset_file)
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Dataset Pipeline

In [6]:
class DigitDataset(Dataset):
    def __init__(self, file_path, transform):
        self.data = pd.read_csv(file_path)
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data.iloc[idx]
        pixels = example.values[1:].astype('float32')
        pixels = pixels / 255.0
        label = int(example.values[0])
        
        pixels = torch.tensor(pixels).reshape(28, 28).unsqueeze(0)
        label = torch.tensor(label)
        
        if self.transform:
            pixels = self.transform(pixels)
            
        return pixels, label

# Data Transformation

In [7]:
pixel_transformation = transforms.Compose([
    transforms.Normalize(
        mean = torch.tensor([0.1307]),
        std = torch.tensor([0.3081])
    )
])

# Initiate The dataset pipeline

In [8]:
dataset = DigitDataset(
    file_path = dataset_file,
    transform = pixel_transformation
)

# Train, Test, Validation Set

In [9]:
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset=dataset,
    lengths=[train_size, val_size, test_size],
)

# Data Loader

In [10]:
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [11]:
for pixels, label in train_loader:
    print(pixels.shape)
    print(label.shape)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])


# Model Build

In [12]:
class DigitClassifier(pl.LightningModule):
    def __init__(self):
        super(DigitClassifier, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
        
    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        return x
    
    def training_step(self, batch, batch_idx):
        pixels, labels = batch
        output = self(pixels)
        loss = nn.CrossEntropyLoss()(output, labels)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        pixels, labels = batch
        output = self(pixels)
        loss = nn.CrossEntropyLoss()(output, labels)
        acc = (output.argmax(dim=1) == labels).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_accuracy', acc, prog_bar=True)
        
    def test_step(self, batch, batch_idx):
        pixels, labels = batch
        output = self(pixels)
        loss = nn.CrossEntropyLoss()(output, labels)
        acc = (output.argmax(dim=1) == labels).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_accuracy', acc, prog_bar=True)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        return optimizer

In [13]:
model = DigitClassifier()

# Callbacks

In [14]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_accuracy',
    save_top_k=1,
    mode='max'
)

early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=2,
    verbose=True,
)

In [15]:
import os
checkpoint_path = os.path.join(
    os.getcwd(), "../Model/", "best_model.pth"
)

# Train

In [16]:
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    callbacks=[checkpoint_callback, early_stopping],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


# Track Experiments

In [17]:
import mlflow
import shutil
from mlflow.models.signature import infer_signature

mlflow.set_experiment(experiment_name="Digit_Classifier") 

2025/01/29 14:02:33 INFO mlflow.tracking.fluent: Experiment with name 'Digit_Classifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///f:/bongoDev%20ML%20Course/000.Exercises/BongoDev%20ML%20Course%20Practise/MLflow/mlruns/900972381690289215', creation_time=1738137753098, experiment_id='900972381690289215', last_update_time=1738137753098, lifecycle_stage='active', name='Digit_Classifier', tags={}>

In [18]:
ARTIFACT_FOLDER_NAME = "model"
SOURCE_CODE_PATH = os.path.join(
        os.getcwd(),
        "malflow.ipynb",
)
SOURCE_CODE_ARTIFACT = "trainer.ipynb"

In [None]:
with mlflow.start_run():
    # log hyperparameters
    mlflow.log_param("epochs", EPOCHS)
    mlflow.log_param("lr", LEARNING_RATE)
    mlflow.log_param("batch_size", BATCH_SIZE)
    
    # train the model
    trainer.fit(
        model=model, 
        train_dataloaders=train_loader, 
        val_dataloaders=val_loader
    )
    
    # get the best model
    best_model_path = checkpoint_callback.best_model_path
    best_model = DigitClassifier.load_from_checkpoint(best_model_path)
    
    # evaluate the model on test set
    evaluation_score = trainer.test(
        best_model,
        dataloaders=test_loader
    )
    
    # log the model/results
    mlflow.log_metric("test_accuracy", evaluation_score[0]['test_accuracy'])
    mlflow.log_metric("test_loss", evaluation_score[0]['test_loss'])
    
    # save the model
    pixels_batch = next(iter(test_loader))[0]
    pixels_batch = pixels_batch.cpu().numpy()
    signature = infer_signature(pixels_batch)
    
    mlflow.pytorch.log_model(
        pytorch_model=best_model,
        artifact_path="model",# where the model is saved
        registered_model_name="digit_classifier",
        signature=signature
    )
    
    # log the source code
    shutil.copyfile(SOURCE_CODE_PATH, SOURCE_CODE_ARTIFACT)
    mlflow.log_artifact(SOURCE_CODE_ARTIFACT)
    


  | Name | Type   | Params | Mode 
----------------------------------------
0 | fc1  | Linear | 100 K  | train
1 | fc2  | Linear | 8.3 K  | train
2 | fc3  | Linear | 650    | train
----------------------------------------
109 K     Trainable params
0         Non-trainable params
109 K     Total params
0.438     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.200


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.044 >= min_delta = 0.0. New best score: 0.156


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.028 >= min_delta = 0.0. New best score: 0.127


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.125
`Trainer.fit` stopped: `max_epochs=5` reached.
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]