# Tracking Experimetns With WandB

# Setting Up

In [1]:
!nvidia-smi

Tue Aug 22 11:14:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
! pip install -q wandb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m36.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.6/215.6 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [3]:
import torch
import glob2, numpy as np
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/Colab Notebooks/ML.Deep-Learning

/content/drive/MyDrive/Colab Notebooks/ML.Deep-Learning


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data

In [7]:
# Augmentations
augmentations = A.Compose([
    A.Resize(256, 256),
    A.RandomCrop(224, 224),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    ToTensorV2(),
])

In [8]:
# Create Image Dataset
class CapDataset(Dataset):
    def __init__(self, data_dir, augmentations):
        self.data_dir = data_dir
        self.augmentations = augmentations
        self.data = glob2.glob(f"{self.data_dir}/*/*")
        self.data = self.verify_images(self.data)
        self.labels, self.vocab = self.get_labels(self.data)

    def get_labels(self, data_paths):
      labels = [data_path.split("/")[-2] for data_path in data_paths]
      vocab = list(set(labels))
      return labels, vocab

    def verify_images(self, data_paths):

      # Checking Image extensions
      approved_ext = ["jpg", "jpeg", "png"]
      verified_paths = []
      for data_path in data_paths:
        lower_data_path = data_path.lower()
        if lower_data_path.endswith("jpg") or lower_data_path.endswith("jpeg") or lower_data_path.endswith("png"):
          verified_paths.append(data_path)

      # Checking Image channels
      image_paths = []
      for image_path in tqdm(verified_paths):
        image =  np.array(Image.open(image_path))
        if len(image.shape) == 3 and image.shape[-1] == 3:
          image_paths.append(image_path)

      return image_paths

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        image_path = self.data[index]
        image = np.array(Image.open(image_path))
        image = self.augmentations(image=image)['image']
        image = image.float()  # convert to float32 tensor
        label = self.labels[index]
        label_idx = self.vocab.index(label)
        return image, label_idx

data_dir = "data"
dataset = CapDataset(data_dir, augmentations)
len(dataset)

  0%|          | 0/3973 [00:00<?, ?it/s]

3843

In [9]:
# Train Validation Split
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(dataset, test_size=0.1)
len(train_data), len(val_data)

(3458, 385)

In [10]:
# Dataset to Dataloader
bs = 32
train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
valid_loader = DataLoader(val_data, batch_size=bs, shuffle=False)

In [15]:
torch.save(train_loader, "dataloaders/train_dls.pkl")
torch.save(valid_loader, "dataloaders/valid_dls.pkl")

In [16]:
train_loader = torch.load("dataloaders/train_dls.pkl")
valid_loader = torch.load("dataloaders/valid_dls.pkl")

# Model

In [17]:
class ResNetClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ResNetClassifier, self).__init__()
        self.model = models.resnet34(pretrained=True)
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.fc = nn.Linear(512, num_classes)

    def forward(self, x, freeze=True):
        x = self.model(x)
        return x

model = ResNetClassifier(num_classes=20)
model.to(device)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:01<00:00, 67.0MB/s]


ResNetClassifier(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

# Training

In [18]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [21]:
wandb.init(project='cap_recognizer', entity='machinelearning557', name="resnet50")

In [22]:
learning_rate = 1e-3
num_epochs = 10

In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

In [24]:
def training(model, images, labels):
    # Forward pass
    outputs = model(images)

    # Compute the loss
    loss = criterion(outputs, labels)

    return outputs, loss

In [25]:
def validation(model, images, labels):
    # Forward pass
    outputs = model(images)

    # Compute the loss
    loss = criterion(outputs, labels)

    # Compute the predictions
    _, preds = torch.max(outputs, 1)

    return loss, preds

In [28]:
# Train the model
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()

    # Initialize the running loss and accuracy
    running_loss = 0.0
    running_corrects = 0

    # Iterate over the training batches
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        # Move the data to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        outputs, loss = training(model, images, labels)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Update the running loss and accuracy
        running_loss += loss.item() * images.size(0)
        running_corrects += torch.sum(torch.argmax(outputs, dim=1) == labels.data)

    # Compute the epoch loss and accuracy
    epoch_loss = running_loss / 3458
    epoch_acc = running_corrects.double() / 3458

    # Set the model to evaluation mode
    model.eval()

    # Initialize the running loss and accuracy
    running_loss = 0.0
    running_corrects = 0

    # Iterate over the validation batches
    with torch.no_grad():
        for images, labels in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation", leave=False):
            # Move the data to the device
            images = images.to(device)
            labels = labels.to(device)

            loss, preds = validation(model, images, labels)

            # Update the running loss and accuracy
            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels.data)

    # Compute the epoch loss and accuracy
    epoch_loss_val = running_loss / 385
    epoch_acc_val = running_corrects.double() / 385

    scheduler.step()

    # Print the epoch statistics
    tqdm.write(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f} - Train Acc: {epoch_acc:.4f} - Val Loss: {epoch_loss_val:.4f} - Val Acc: {epoch_acc_val:.4f}')

    # Save the model
    torch.save(model.state_dict(), f"models/epoch-{epoch+1}:acc-{epoch_acc_val:.4f}.pth")

    # Track Experiment
    wandb.log({'train_loss': epoch_loss, 'train_acc': epoch_acc, 'val_loss': epoch_loss_val, 'val_acc': epoch_acc_val})

Epoch 1/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 1/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1/10 - Train Loss: 1.1995 - Train Acc: 0.6617 - Val Loss: 1.1991 - Val Acc: 0.6805


Epoch 2/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 2/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2/10 - Train Loss: 1.0636 - Train Acc: 0.6868 - Val Loss: 1.1449 - Val Acc: 0.6831


Epoch 3/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 3/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3/10 - Train Loss: 0.9715 - Train Acc: 0.7189 - Val Loss: 1.1080 - Val Acc: 0.6857


Epoch 4/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 4/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4/10 - Train Loss: 0.8784 - Train Acc: 0.7643 - Val Loss: 1.0762 - Val Acc: 0.7065


Epoch 5/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 5/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5/10 - Train Loss: 0.8486 - Train Acc: 0.7655 - Val Loss: 1.0651 - Val Acc: 0.7013


Epoch 6/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 6/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6/10 - Train Loss: 0.8468 - Train Acc: 0.7727 - Val Loss: 1.0747 - Val Acc: 0.6987


Epoch 7/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 7/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7/10 - Train Loss: 0.8449 - Train Acc: 0.7701 - Val Loss: 1.0558 - Val Acc: 0.7013


Epoch 8/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 8/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8/10 - Train Loss: 0.8412 - Train Acc: 0.7741 - Val Loss: 1.0485 - Val Acc: 0.6987


Epoch 9/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 9/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9/10 - Train Loss: 0.8208 - Train Acc: 0.7753 - Val Loss: 1.0683 - Val Acc: 0.7065


Epoch 10/10 - Training:   0%|          | 0/109 [00:00<?, ?it/s]

Epoch 10/10 - Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10/10 - Train Loss: 0.8276 - Train Acc: 0.7753 - Val Loss: 1.0626 - Val Acc: 0.6961
