In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [1]:
#after mounting drive
%cd /content/drive/MyDrive/DA6401/DA6401_A2

/content/drive/MyDrive/DA6401/DA6401_A2


In [2]:
# !echo '/inaturalist_12K/' >> .gitignore

In [3]:
# !unzip /content/drive/MyDrive/DA6401/DA6401_A2/nature_12K.zip -d /content/drive/MyDrive/DA6401/DA6401_A2

In [4]:
import os
from PIL import Image
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import wandb
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
from torch.utils.data import SubsetRandomSampler
from torchvision.transforms import ToTensor

In [None]:
# !wandb login

In [None]:
# !pip install virtualenv

In [None]:
# !virtualenv /content/drive/MyDrive/.dla2_env

In [None]:
# !bash -c "source /content/drive/MyDrive/.dla2_env/bin/activate && pip install pytorch-lightning"

## Utility code

In [5]:
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, dir, limit=None, transform=None):
        self.data_dir = dir
        self.limit = int(1e10) if limit is None else limit
        self.images = []
        self.labels = []
        self.label_names = []
        i = 0
        for c in os.listdir(dir):
          ims = os.listdir(os.path.join(dir,c))[:self.limit]
          ims = [os.path.join(dir,c,im) for im in ims]
          self.images.extend(ims)
          self.labels.extend([i]*len(ims))
          self.label_names.append(c)
          i += 1
        self.transform = transform

    # Defining the length of the dataset
    def __len__(self):
        return len(self.images)

    # Defining the method to get an item from the dataset
    def __getitem__(self, index):
        image = Image.open(self.images[index])
        image = image.convert("RGB")
        label = self.labels[index]
        # Applying the transform
        if self.transform:
            image = self.transform(image)

        return image, label

In [6]:
class CNN(nn.Module):
    def __init__(self, input_size, depth, filter_size, pool_size, fc_size, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, depth, filter_size)
        self.conv2 = nn.Conv2d(depth, depth, filter_size)
        self.conv3 = nn.Conv2d(depth, depth, filter_size)
        self.conv4 = nn.Conv2d(depth, depth, filter_size)
        self.conv5 = nn.Conv2d(depth, depth, filter_size)
        self.pool = nn.MaxPool2d(kernel_size=pool_size)
        with torch.no_grad():
          dummy_input = torch.zeros(1, 3, *input_size)
          out = self._forward_convs(dummy_input)
          flatten_size = out.view(1, -1).shape[1]
        self.fc1 = nn.Linear(flatten_size, fc_size)
        self.fc2 = nn.Linear(fc_size, num_classes)
        # self.loss_fn = nn.CrossEntropyLoss()
        # self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)

    def _forward_convs(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        x = self.pool(F.relu(self.conv5(x)))
        return x

    def forward(self, x):
        x = self._forward_convs(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x



In [7]:
def split_validation(label_array, valid_size = 0.1, seed = 42):
  '''
  This functions splits the data into train and validation stratifiedly
  '''
  train_idx, test_idx = [],[]
  for i,ci in enumerate(np.unique(label_array)):
      indices = np.where(label_array==ci)[0]
      train_len = int(indices.shape[0]*(1-valid_size))
      shuffled = np.random.RandomState(seed+i).permutation(indices)
      train_idx.extend(shuffled[:train_len])
      test_idx.extend(shuffled[train_len:])
  return train_idx,test_idx

In [8]:
class Trainer:
  def __init__(self):
    self.input_size = (224,224)
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = None
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = None
    self.train_loader = None
    self.val_loader = None
    self.test_loader = None

  def create_model(self, depth,
                   filter_size, pool_size, fc_size):
    num_classes = 10
    model = CNN(input_size=self.input_size, depth=depth,
                filter_size=filter_size, pool_size=pool_size,
                fc_size=fc_size, num_classes=num_classes).to(self.device)
    self.model = model
    self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

  def create_data(self, batch_size, limit):

      TRAIN_DATA_PATH = "inaturalist_12K/train/"
      TEST_DATA_PATH = "inaturalist_12K/val/"

      transform = transforms.Compose([
                transforms.Resize(self.input_size),
                transforms.ToTensor()
              ])

      train_dataset = CustomImageDataset(
          TRAIN_DATA_PATH, limit=limit, transform=transform)
      test_dataset = CustomImageDataset(
          TEST_DATA_PATH, limit=limit, transform=transform)
      train_idx, val_idx = split_validation(
          train_dataset.labels, valid_size=0.2, seed=42)
      self.train_loader = DataLoader(
          dataset=train_dataset, batch_size=batch_size,
          sampler=SubsetRandomSampler(train_idx))
      self.val_loader = DataLoader(
          dataset=train_dataset, batch_size=batch_size,
          sampler=SubsetRandomSampler(val_idx))
      self.test_loader = DataLoader(
          dataset=test_dataset, batch_size=batch_size)

  def run_step(self, dataloader, is_train=False):
      correct = 0
      total = 0
      cum_loss = 0
      if is_train:
        self.model.train()
      else:
        self.model.eval()

      for batch_index, (data, targets) in enumerate(tqdm(dataloader)):
          data = data.to(self.device)
          targets = targets.to(self.device)

          with torch.set_grad_enabled(is_train):
            outputs = self.model(data)
            loss = self.criterion(outputs, targets)
            cum_loss += loss.item()
            _, preds = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (preds == targets).sum().item()
            if is_train:
              self.optimizer.zero_grad()
              loss.backward()
              self.optimizer.step()
      acc = correct/total
      return cum_loss, acc

  def run_train_pipeline(self, n_epochs, use_wandb=False, config=None):
      if use_wandb:
        run = wandb.init(config=config)
      else:
        run = None

      batch_size = config.batch_size
      depth = config.depth
      filter_size = config.filter_size
      pool_size = config.pool_size
      fc_size = config.fc_size

      if run:
        run.name = f"bs:{batch_size}, d:{depth}, fs:{filter_size},\
        ps:{pool_size}, fc:{fc_size}"

      self.create_model(
          depth, filter_size, pool_size, fc_size
      )
      self.create_data(batch_size, 250)

      for epoch in range(n_epochs):
        train_loss, train_acc = self.run_step(self.train_loader, is_train=True)
        val_loss, val_acc = self.run_step(self.val_loader)
        if run:
          wandb.log({
              'epoch':epoch,
              'train_loss':train_loss, 'train_acc':train_acc,
              'val_loss':val_loss, 'val_acc':val_acc
          })
      if run:
        run.finish()

  def test(self):
      self.model.eval()
      test_loss,test_acc = self.run_step(self.test_loader)
      # self.model.train()
      print(f"Loss on test set : {test_loss}")
      print(f"Accuracy on test set : {test_acc}")


## Sample train run

In [21]:
# Set device cuda for GPU if it's available otherwise run on the CPU
# Hyperparameters
from types import SimpleNamespace
batch_size = 256
depth = 16
filter_size = 3
pool_size = 2
fc_size = 120
config = SimpleNamespace(
   batch_size=batch_size,
   depth=depth,
   filter_size=filter_size,
   pool_size=pool_size,
   fc_size=fc_size
)
n_epochs = 3

In [22]:
tr = Trainer()
tr.run_train_pipeline(n_epochs=n_epochs, config=config)

100%|██████████| 8/8 [08:23<00:00, 62.88s/it]
100%|██████████| 2/2 [04:30<00:00, 135.11s/it]
100%|██████████| 8/8 [00:27<00:00,  3.49s/it]
100%|██████████| 2/2 [00:06<00:00,  3.24s/it]
100%|██████████| 8/8 [00:28<00:00,  3.57s/it]
100%|██████████| 2/2 [00:06<00:00,  3.27s/it]


In [24]:
tr.test()

100%|██████████| 8/8 [24:18<00:00, 182.29s/it]

Loss on test set : 18.426812887191772
Accuracy on test set : 0.0955





## Hyperparameter sweep

In [None]:
sweep_config = {
    'method': 'bayes'
    }


In [None]:
sweep_config['project'] = 'DA6401_A2'
sweep_config['entity'] = 'jayagowtham-indian-institute-of-technology-madras'

In [None]:
metric = {
    'name': 'val_acc',
    'goal': 'maximize'
    }

sweep_config['metric'] = metric

In [None]:
parameters_dict = {
    'optimizer': {
        'values': ['adam', 'sgd']
        },
    'fc_layer_size': {
        'values': [128, 256, 512]
        },
    'dropout': {
          'values': [0.3, 0.4, 0.5]
        },
    }

sweep_config['parameters'] = parameters_dict

In [None]:
sweep_id = wandb.sweep(sweep_config)

In [None]:
tr = Trainer(criterion, optimizer, device)
wandb.agent(sweep_id, tr.train(model, train_loader, val_loader, n_epochs=n_epochs), count=5)

In [None]:
# username="JG-0212"
# passkey="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"
# repository="DA6401_A2"

In [None]:
# !git config user.email "jpsai6594@gmail.com"
# !git config user.name "Jayagowtham"

In [None]:
# !git clone https://{passkey}@github.com/{username}/{repository}.git

In [None]:
# !git reset HEAD~1

In [None]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   Assignment_2.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mlightning_logs/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!git add -u .

error: read error while indexing Assignment_2.ipynb: No such file or directory
error: Assignment_2.ipynb: failed to insert into database
error: unable to index file 'Assignment_2.ipynb'
fatal: updating files failed


In [None]:
# !git log --oneline

In [None]:
# !git commit -m "Full training network setup. Working fine"

In [None]:
# !git push origin