<a href="https://colab.research.google.com/github/Imrantipu/-serviceReviewServer/blob/main/cnn_fashion_mnist_gpu_pytorch_profiling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [141]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.profiler import profile, record_function, ProfilerActivity

In [142]:
# Fix PyTorch's global RNG seed so weight init and shuffling are repeatable
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7b61fab26550>

In [143]:
# Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [144]:
# Load dataset
# The CSV is read into `df`; every later cell depends on it, so a failed read
# must stop the notebook here instead of surfacing later as a NameError on `df`.
try:
    df = pd.read_csv("fmnist_small.csv")
except Exception as e:
    # Keep the original diagnostic message, then propagate the real error.
    print(f"Error reading CSV: {e}")
    raise
else:
    # Preview the first rows only when the read succeeded.
    print(df.head())

   label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0      9       0       0       0       0       0       0       0       0   
1      7       0       0       0       0       0       0       0       0   
2      0       0       0       0       0       0       1       0       0   
3      8       0       0       0       0       0       0       0       0   
4      8       0       0       0       0       0       0       0       0   

   pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0       0  ...         0         7         0        50       205       196   
1       0  ...         0         0         0         0         0         0   
2       0  ...       142       142       142        21         0         3   
3       0  ...         0         0         0         0         0         0   
4       0  ...       213       203       174       151       188        10   

   pixel781  pixel782  pixel783  pixel784  
0       213       165         

In [145]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print(dataset_shape)

(6000, 785)


In [146]:
# Separate the label column (first column) from the pixel columns (all the rest)
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale both splits by 255 (presumably 8-bit pixel intensities -- verify against the CSV)
X_train, X_test = X_train / 255.0, X_test / 255.0

In [147]:
# Custom Dataset
class CustomDataset(Dataset):
    """In-memory dataset pairing image tensors with integer class labels.

    Args:
        features: array-like of flattened images; converted to float32 and
            reshaped to (N, 1, 28, 28), so each row must hold 784 values.
        labels: array-like of class ids; converted to a long (int64) tensor.
    """

    def __init__(self, features, labels):
        image_tensor = torch.tensor(features, dtype=torch.float32)
        # One channel, 28x28 pixels; the leading dimension is inferred.
        self.features = image_tensor.reshape(-1, 1, 28, 28)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the first dimension of the features."""
        return self.features.shape[0]

    def __getitem__(self, index):
        """Return the (image, label) pair stored at ``index``."""
        image = self.features[index]
        label = self.labels[index]
        return image, label

In [148]:
# Wrap the train/test arrays in datasets
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Both loaders share the batch size and pinned-memory setting;
# only the training loader shuffles.
loader_kwargs = dict(batch_size=64, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [149]:
# Define the model
class MyNN(nn.Module):
    """Small CNN: two conv/ReLU/max-pool stages followed by a two-layer classifier.

    Args:
        input_features: number of channels of the input images (passed as the
            first convolution's ``in_channels``).

    The classifier's first Linear layer expects 64 * 7 * 7 inputs, which matches
    28x28 images after the two 2x2 poolings (28 -> 14 -> 7).
    """

    def __init__(self, input_features):
        super().__init__()

        # Layers are created in the same order as they are applied.
        conv_layers = [
            nn.Conv2d(input_features, 32, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_layers)

        head_layers = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the feature extractor, then the classifier; returns 10 scores per sample."""
        return self.classifier(self.features(x))

In [150]:
# Build the single-channel model and place its parameters on the chosen device
model = MyNN(1)
model.to(device)

# Cross-entropy loss on the model outputs, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [151]:
# Profile a short burst of training and export the traces for TensorBoard
profiler_options = dict(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CPU and GPU activity
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),  # 3 active steps per cycle, 2 cycles
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),  # traces land in ./logs
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
)

# The inner loop breaks once batch index 10 has been processed,
# i.e. 11 batches are run per epoch.
LAST_PROFILED_BATCH = 10

with profile(**profiler_options) as prof:
    for epoch in range(2):  # two short epochs are enough for profiling
        model.train()
        for i, (batch_features, batch_labels) in enumerate(train_loader):
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            # Each phase is labelled so it appears as its own row in the trace
            with record_function("forward"):
                outputs = model(batch_features)
                loss = criterion(outputs, batch_labels)

            with record_function("backward"):
                optimizer.zero_grad()
                loss.backward()

            with record_function("optimizer_step"):
                optimizer.step()

            # Advance the profiler schedule by one training step
            prof.step()

            if i == LAST_PROFILED_BATCH:
                break

# Summarise the ten most expensive entries by total CUDA time
summary_table = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(summary_table)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                forward         0.00%       0.000us         0.00%       0.000us       0.000us       3.521ms        46.09%       3.521ms       1.174ms           0 b           0 b           0 b           0 

In [152]:
# Define an improved neural network architecture
class MyNN(nn.Module):
    """Three-stage CNN with batch normalisation and a dropout-regularised classifier.

    Args:
        input_features: number of channels of the input images (passed as the
            first convolution's ``in_channels``).

    Each feature stage is Conv2d(3x3, 'same') -> ReLU -> BatchNorm2d -> MaxPool2d(2).
    The classifier's first Linear layer expects 128 * 3 * 3 inputs, which matches
    28x28 images after three 2x2 poolings (28 -> 14 -> 7 -> 3).
    """

    def __init__(self, input_features):
        super().__init__()

        # Feature extraction: one conv stage per output width, built in order
        conv_stack = []
        in_channels = input_features
        for out_channels in (32, 64, 128):
            conv_stack.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding='same'),
                nn.ReLU(),
                nn.BatchNorm2d(out_channels),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ])
            in_channels = out_channels
        self.features = nn.Sequential(*conv_stack)

        # Classifier: two hidden layers with dropout, then the 10-way output layer
        head_layers = [
            nn.Flatten(),
            nn.Linear(128 * 3 * 3, 256),
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(128, 10),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the feature extractor, then the classifier; returns 10 scores per sample."""
        return self.classifier(self.features(x))


In [153]:
# Hyperparameters for the full training run:
#   learning_rate -- initial step size handed to the optimizer
#   epochs        -- upper bound on the number of passes over the training data
learning_rate, epochs = 0.001, 100

In [154]:
# Initialize the model and move it to the device
model = MyNN(1).to(device)

# Define the loss function and optimizer (Adam with a small L2 weight decay)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Add a learning rate scheduler: multiply the LR by 0.1 every 5 scheduler steps.
# StepLR is reached through `optim.lr_scheduler` because the bare name `StepLR`
# is not imported anywhere in this notebook and raised NameError on a fresh kernel.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [155]:
# Early-stopping bookkeeping used by the training loop
patience = 3                    # epochs without improvement tolerated before stopping
best_accuracy = 0               # highest test accuracy observed so far
epochs_without_improvement = 0  # consecutive epochs that failed to beat best_accuracy

In [156]:
for epoch in range(epochs):
    # ---- Training pass over every batch of the training loader ----
    model.train()
    total_epoch_loss = 0

    for batch_features, batch_labels in train_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)

        # Clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_epoch_loss / len(train_loader)
    print(f'Epoch: {epoch + 1}, Loss: {avg_loss}')

    # ---- Evaluation pass over the test loader (no gradients) ----
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_features)
            # Predicted class = index of the largest score along dim 1
            predicted = outputs.argmax(dim=1)

            correct += (predicted == batch_labels).sum().item()
            total += batch_labels.size(0)

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy}')

    # ---- Early stopping: checkpoint on improvement, otherwise count the miss ----
    if not (test_accuracy > best_accuracy):
        epochs_without_improvement += 1
    else:
        best_accuracy = test_accuracy
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "best_model.pth")  # checkpoint of the best epoch so far

    if epochs_without_improvement >= patience:
        print("Early stopping triggered!")
        break

    # The scheduler advances once per completed epoch (skipped when stopping early)
    scheduler.step()

Epoch: 1, Loss: 0.9551528696219126
Test Accuracy: 0.8208333333333333
Epoch: 2, Loss: 0.5063882442315419
Test Accuracy: 0.8366666666666667
Epoch: 3, Loss: 0.38721385816733045
Test Accuracy: 0.8183333333333334
Epoch: 4, Loss: 0.325982511639595
Test Accuracy: 0.8391666666666666
Epoch: 5, Loss: 0.2668511484066645
Test Accuracy: 0.8541666666666666
Epoch: 6, Loss: 0.162055803835392
Test Accuracy: 0.865
Epoch: 7, Loss: 0.12905061319470407
Test Accuracy: 0.8541666666666666
Epoch: 8, Loss: 0.10347860485315323
Test Accuracy: 0.8608333333333333
Epoch: 9, Loss: 0.09289832554757595
Test Accuracy: 0.8591666666666666
Early stopping triggered!


In [157]:
# Evaluation on training data
# Put the model in evaluation mode explicitly. Without this the result depends on
# whichever cell ran last: in training mode Dropout is active and BatchNorm uses
# (and updates) batch statistics, so the reported accuracy would be wrong.
model.eval()

total = 0
correct = 0

with torch.no_grad():
    for batch_features, batch_labels in train_loader:
        # Move data to the same device as the model
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        outputs = model(batch_features)

        # Predicted class = index of the largest score along dim 1
        _, predicted = torch.max(outputs, 1)

        total = total + batch_labels.shape[0]
        correct = correct + (predicted == batch_labels).sum().item()

# Fraction of training samples classified correctly
print(correct/total)

0.9860416666666667


In [158]:
# Profile a short burst of training with the currently bound `model`/`optimizer`
# and export the traces for TensorBoard
profiler_options = dict(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CPU and GPU activity
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),  # 3 active steps per cycle, 2 cycles
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),  # traces land in ./logs
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
)

# The inner loop breaks once batch index 10 has been processed,
# i.e. 11 batches are run per epoch.
LAST_PROFILED_BATCH = 10

with profile(**profiler_options) as prof:
    for epoch in range(2):  # two short epochs are enough for profiling
        model.train()
        for i, (batch_features, batch_labels) in enumerate(train_loader):
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            # Each phase is labelled so it appears as its own row in the trace
            with record_function("forward"):
                outputs = model(batch_features)
                loss = criterion(outputs, batch_labels)

            with record_function("backward"):
                optimizer.zero_grad()
                loss.backward()

            with record_function("optimizer_step"):
                optimizer.step()

            # Advance the profiler schedule by one training step
            prof.step()

            if i == LAST_PROFILED_BATCH:
                break

# Summarise the ten most expensive entries by total CUDA time
summary_table = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(summary_table)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                forward         0.00%       0.000us         0.00%       0.000us       0.000us       8.840ms       134.43%       8.840ms       2.947ms           0 b           0 b           0 b           0 

In [163]:
# Show TensorBoard inline through the notebook extension. The shell form
# (`!tensorboard ...`) starts a foreground server that blocks the cell and
# is not embedded in the notebook.
%load_ext tensorboard
%tensorboard --logdir ./logs

2025-02-15 21:23:30.503297: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-15 21:23:30.522611: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-15 21:23:30.528962: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-15 21:23:30.543272: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1739654613.463652   39329 cuda_ex