In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as ttf

import os
import os.path as osp

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np
import math
from dropblock import DropBlock2D # NA-1

In [2]:
!pip install dropblock

Collecting dropblock
  Downloading dropblock-0.3.0-py3-none-any.whl (5.4 kB)
Installing collected packages: dropblock
Successfully installed dropblock-0.3.0


# TODOs
As you go, please read the code and keep an eye out for TODOs!

# Download Data

In [4]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"mangalamsahai","key":"521f66540469b3a12f7b11566d8b1c14"}') # Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[?25l[K     |█████▌                          | 10 kB 29.8 MB/s eta 0:00:01[K     |███████████                     | 20 kB 9.2 MB/s eta 0:00:01[K     |████████████████▋               | 30 kB 7.6 MB/s eta 0:00:01[K     |██████████████████████▏         | 40 kB 7.2 MB/s eta 0:00:01[K     |███████████████████████████▊    | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 2.8 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73275 sha256=929116db8c20887fd1ed7f1cb1a027c4dd6de371b454f5b27db5ef0fd5262c03
  Stored in directory: /root/.cache/pip/wheels/de/f7/d8/c3902cacb7e62cb611b1ad343d7cc07f42f7eb76ae3a52f3d1
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    

In [5]:
!kaggle competitions download -c 11-785-s22-hw2p2-classification
!kaggle competitions download -c 11-785-s22-hw2p2-verification

!unzip -q 11-785-s22-hw2p2-classification.zip
!unzip -q 11-785-s22-hw2p2-verification.zip

!ls

Downloading 11-785-s22-hw2p2-classification.zip to /content
100% 2.35G/2.35G [00:09<00:00, 212MB/s]
100% 2.35G/2.35G [00:09<00:00, 271MB/s]
Downloading 11-785-s22-hw2p2-verification.zip to /content
 98% 258M/263M [00:00<00:00, 252MB/s]
100% 263M/263M [00:00<00:00, 289MB/s]
11-785-s22-hw2p2-classification.zip   sample_data
11-785-s22-hw2p2-verification.zip     train_subset
classification			      verification
classification_sample_submission.csv  verification_sample_submission.csv


# Hyperparameters

In [6]:
"""
The well-accepted SGD batch_size & lr combination for CNN classification is 256 batch size for 0.1 learning rate.
When changing batch size for SGD, follow the linear scaling rule - halving batch size -> halve learning rate, etc.
This is less theoretically supported for Adam, but in my experience, it's a decent ballpark estimate.
"""
batch_size = 512
lr = 0.2
epochs = 70 # Just for the early submission. We'd want you to train like 50 epochs for your main submissions.

# Very Simple Network

In [7]:
from torch.nn.modules.batchnorm import BatchNorm1d
class Network(nn.Module):
    """
    The Very Low early deadline architecture is a 4-layer CNN.
    The first Conv layer has 64 channels, kernel size 7, and stride 4.
    The next three have 128, 256, and 512 channels. Each have kernel size 3 and stride 2.
    Think about what the padding should be for each layer to not change spatial resolution.
    Each Conv layer is accompanied by a Batchnorm and ReLU layer.
    Finally, you want to average pool over the spatial dimensions to reduce them to 1 x 1.
    Then, remove (Flatten?) these trivial 1x1 dimensions away.
    Look through https://pytorch.org/docs/stable/nn.html 
    TODO: Fill out the model definition below! 

    Why does a very simple network have 4 convolutions?
    Input images are 224x224. Note that each of these convolutions downsample.
    Downsampling 2x effectively doubles the receptive field, increasing the spatial
    region each pixel extracts features from. Downsampling 32x is standard
    for most image models.

    Why does a very simple network have high channel sizes?
    Every time you downsample 2x, you do 4x less computation (at same channel size).
    To maintain the same level of computation, you 2x increase # of channels, which 
    increases computation by 4x. So, balances out to same computation.
    Another intuition is - as you downsample, you lose spatial information. Want
    to preserve some of it in the channel dimension.
    """
    def __init__(self, num_classes=7000):
        super().__init__()

        self.backbone = nn.Sequential(
            # Note that first conv is stride 4. It is (was?) standard to downsample.
            # 4x early on, as with 224x224 images, 4x4 patches are just low-level details.
            # Food for thought: Why is the first conv kernel size 7, not kernel size 3?

            # TODO: Conv group 1
            nn.Conv2d(in_channels=3,out_channels=64,kernel_size=7,stride=4,padding=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            # TODO: Conv group 2
            nn.Conv2d(in_channels=64,out_channels=128,kernel_size=3,stride=2,padding=1),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            # TODO: Conv group 3
            nn.Conv2d(in_channels=128,out_channels=256,kernel_size=3,stride=2,padding=2),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            # TODO: Conv group 4
            nn.Conv2d(in_channels=256,out_channels=512,kernel_size=3,stride=2,padding=0),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(),
            # TODO: Average pool over & reduce the spatial dimensions to (1, 1)
            nn.AvgPool3d(kernel_size=(1,7,7)),
            # TODO: Collapse (Flatten) the trivial (1, 1) dimensions
            nn.Flatten()
            ) 
        
        self.cls_layer = nn.Linear(512, num_classes)
    
    def forward(self, x, return_feats=False):
        """
        What is return_feats? It essentially returns the second-to-last-layer
        features of a given image. It's a "feature encoding" of the input image,
        and you can use it for the verification task. You would use the outputs
        of the final classification layer for the classification task.

        You might also find that the classification outputs are sometimes better
        for verification too - try both.
        """
        feats = self.backbone(x)
        out = self.cls_layer(feats)

        if return_feats:
            return feats
        else:
            return out

# Dataset & DataLoader

In [8]:
"""
Transforms (data augmentation) is quite important for this task.
Go explore https://pytorch.org/vision/stable/transforms.html for more details
"""
DATA_DIR = "/content"
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train") # This is a smaller subset of the data. Should change this to classification/classification/train
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")

train_transforms = [ttf.RandAugment(),
                    ttf.ToTensor(),
                    ttf.ColorJitter(brightness=0.5,hue=0.3),
                    ttf.RandomHorizontalFlip(p=0.5),
                    #ttf.RandomAffine((-15,15)),
                    ] # ttf.RandAugment()- to be tried.
                      #  ttf.RandomAffine((-15,15))
val_transforms = [ttf.ToTensor()]

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_transforms))
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose(val_transforms))


train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=2) # drop_last was True actually

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=1)



MobileNetV2 Model

In [None]:
class InvertedResidualBlock(nn.Module):
    """
    Intuitively, layers in MobileNet can be split into "feature mixing" 
    and "spatial mixing" layers. You can think of feature mixing as each pixel
    "thinking on its own" about its own featuers, and you can think of spatial
    mixing as pixels "talking with each other". Alternating these two builds
    up a CNN.

    In a bit more detail:

    - The purpose of the "feature mixing" layers is what you've already seen in 
    hw1p2. Remember, in hw1p2, we went from some low-level audio input to
    semantically rich representations of phonemes. Featuring mixing is simply a 
    linear layer (a weight matrix) that transforms simpler features into 
    something more advanced.

    - The purpose of the "spatial mixing" layers is to mix features from different
    spatial locations. You can't figure out a face by looking at each pixel on
    its own, right? So we need 3x3 convolutions to mix features from neighboring
    pixels to build up spatially larger features.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__() # Just have to do this for all nn.Module classes
        self.stride=stride
        self.in_channels=in_channels
        self.out_channels=out_channels
        # Can only do identity residual connection if input & output are the
        # same channel & spatial shape.
        #if (stride == 1 and in_channels == out_channels):
        #    self.do_identity = True
        #else:
        #    self.do_identity = False
        
        # Expand Ratio is like 6, so hidden_dim >> in_channels
        hidden_dim = in_channels * expand_ratio

        """
        What is this doing? It's a 1x1 convolutional layer that drastically
        increases the # of channels (feature dimension). 1x1 means each pixel
        is thinking on its own, and increasing # of channels means the network
        is seeing if it can "see" more clearly in a higher dimensional space.

        Some patterns are just more obvious/separable in higher dimensions.

        Also, note that bias = False since BatchNorm2d has a bias term built-in.

        As you go, note the relationship between kernel_size and padding. As you
        covered in class, padding = kernel_size // 2 (kernel_size being odd) to
        make sure input & output spatial resolution is the same.
        """
        self.feature_mixing = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(in_channels=in_channels,out_channels=hidden_dim,kernel_size=1,stride=1,padding=0),
            nn.BatchNorm2d(num_features=hidden_dim),
            nn.GELU(),
        )

        """
        What is this doing? Let's break it down.
        - kernel_size = 3 means neighboring pixels are talking with each other.
          This is different from feature mixing, where kernel_size = 1.

        - stride. Remember that we sometimes want to downsample spatially. 
          Downsampling is done to reduce # of pixels (less computation to do), 
          and also to increase receptive field (if a face was 32x32, and now
          it's 16x16, a 3x3 convolution covers more of the face, right?). It
          makes sense to put the downsampling in the spatial mixing portion
          since this layer is "in charge" of messing around spatially anyway.

          Note that most of the time, stride is 1. It's just the first block of
          every "stage" (layer \subsetof block \subsetof stage) that we have
          stride = 2.

        - groups = hidden_dim. Remember depthwise separable convolutions in 
          class? If not, it's fine. Usually, when we go from hidden_dim channels
          to hidden_dim channels, they're densely connected (like a linear 
          layer). So you can think of every pixel/grid in an input
          3 x 3 x hidden_dim block being connected to every single pixel/grid 
          in the output 3 x 3 x hidden_dim block.
          What groups = hidden_dim does is remove a lot of these connections.

          Now, each input 3 x 3 block/region is densely connected to the
          corresponding output 3 x 3 block/region. This happens for each of the
          hidden_dim input/output channel pairs independently.
          So we're not even mixing different channels together - we're only 
          mixing spatial neighborhoods. 
          
          Try to draw this out, or come to my (Jinhyung Park)'s OH if you want 
          a more in-depth explanation.
          https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
        """
        self.spatial_mixing = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(hidden_dim,hidden_dim,kernel_size=3,stride=self.stride,padding=1,groups=hidden_dim),
            nn.BatchNorm2d(num_features=hidden_dim),
            nn.ReLU6(),
        )

        """
        What's this? Remember that hidden_dim is quite large - six times the 
        in_channels. So it was nice to do the above operations in this high-dim
        space, where some patterns might be more clear. But we still want to 
        bring it back down-to-earth.

        Intuitively, you can takeaway two reasons for doing this:
        - Reduces computational cost by a lot. 6x in & out channels means 36x
          larger weights, which is crazy. We're okay with just one of input or 
          output of a convolutional layer being large when mixing channels, but 
          not both.
        
        - We also want a residual connection from the input to the output. To 
          do that without introducing another convolutional layer, we want to
          condense the # of channels back to be the same as the in_channels.
          (out_channels and in_channels are usually the same).
        """
        self.bottleneck_channels = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(in_channels=hidden_dim,out_channels=out_channels,kernel_size=1,stride=1,padding=0),
            nn.BatchNorm2d(num_features=out_channels),
            nn.ReLU6(),
        )

    def forward(self, x):
        #print("X",x.shape)
        out = self.feature_mixing(x)
        #print("Feature Mixing",out.shape)
        out = self.spatial_mixing(out)
        #print("Spatial Mixing",out.shape)
        out = self.bottleneck_channels(out)
        #print("BottleNeck Mixing",out.shape)

        if self.stride == 1 and self.in_channels == self.out_channels:
                return x + out
        else:
             return out

class MobileNetV2(nn.Module):
    """
    The heavy lifting is already done in InvertedBottleneck.

    Why MobileNetV2 and not V3? V2 is the foundation for V3, which uses "neural
    architecture search" to find better configurations of V2. If you understand
    V2 well, you can totally implement V3!
    """
    def __init__(self, num_classes= 7000):
        super().__init__() # Already features indented from previous class. 

        self.num_classes = num_classes

        """
        First couple of layers are special, just do them here.
        This is called the "stem". Usually, methods use it to downsample or twice.
        """
        
        self.stem = nn.Sequential(
          nn.Conv2d(in_channels=3,out_channels=32,kernel_size=3,stride=2,padding=1),
          nn.BatchNorm2d(num_features=32),
          nn.ReLU6(),
        )

        """
        Since we're just repeating InvertedResidualBlocks again and again, we
        want to specify their parameters like this.
        The four numbers in each row (a stage) are shown below.
        - Expand ratio: We talked about this in InvertedResidualBlock
        - Channels: This specifies the channel size before expansion
        - # blocks: Each stage has many blocks, how many?
        - Stride of first block: For some stages, we want to downsample. In a
          downsampling stage, we set the first block in that stage to have
          stride = 2, and the rest just have stride = 1.

        Again, note that almost every stage here is downsampling! By the time
        we get to the last stage, what is the image resolution? Can it still
        be called an image for our dataset? Think about this, and make changes
        as you want.
        """
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6,  16, 1, 1],
            [6,  24, 2, 2], #24  stride: 22
            [6,  32, 3, 2], #32  stride: 21
            # 1[6,  64, 4, 2], #64  stride: 22
            [6,  96, 3, 1], #96  stride: 11
            [6, 160, 3, 2], #160 stride: 22
            [6, 320, 1, 1], #320 stride: 11
           # 512
           # [6, 480, 1, 1],
           # [6, 640, 1, 1],
        ]

        # Remember that our stem left us off at 16 channels. We're going to 
        # keep updating this in_channels variable as we go
        in_channels = 32

        # Let's make the layers
        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage
            
            for block_idx in range(num_blocks):
                out_channels = num_channels
                #print("in_channel",in_channels)
                #print("out_channel",out_channels)
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only have non-trivial stride if first block
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels 
            
        self.layers = nn.Sequential(*layers) # Done, save them to the class

        # Some final feature mixing
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,out_channels=1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6(),
        )

        # Now, we need to build the final classification layer.
         
        self.cls_layer = nn.Sequential(
            # TODO: Fill this in!
            # Pool over & collapse the spatial dimensions to (1, 1)
            # Collapse the trivial (1, 1) dimensions
            # Project to our # of classes
            nn.AvgPool2d(kernel_size=(7,7),stride=1),
            nn.Flatten(),
            nn.Linear(1280,num_classes),
        )
         
        
        self._initialize_weights()

    def _initialize_weights(self):
        """
        Usually, I like to use default pytorch initialization for stuff, but
        MobileNetV2 made a point of putting in some custom ones, so let's just
        use them.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x):
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_block(out)
        out = self.cls_layer(out)

        return out

# My RESNET-34 Model

In [9]:
class StageLayers(nn.Module):
  
  def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                ):
        super().__init__() # Just have to do this for all nn.Module classes

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        #self.drop_block = DropBlock2D(block_size=3, drop_prob=0.3)        
        self.Layer = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,out_channels=in_channels,kernel_size=1,stride=self.stride,padding=0),
            nn.BatchNorm2d(num_features=in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels,out_channels=in_channels,kernel_size=3,stride=1,padding=1),
            nn.BatchNorm2d(num_features=in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels,out_channels=out_channels,kernel_size=1,stride=1,padding=0),
            nn.BatchNorm2d(num_features=out_channels),
            #nn.Conv2d(in_channels=in_channel,out_channels=in_channel*4,kernel_size=1,stride=1,padding=0),
         )
        
  def forward(self, x):
        
        out = self.Layer(x)

        

        if self.in_channels==self.out_channels and x.shape[2]==out.shape[2]:
             #print(out.shape)
             #print(x.shape)
             return out+x
        else:
             return out   
        
        
class ResNet(nn.Module):
     def __init__(self, num_classes= 7000):
         super().__init__() # Already features indented from previous class. 
         #self.drop_block = DropBlock2D(block_size=3, drop_prob=0.3)  
         self.num_classes = num_classes

         self.stem = nn.Sequential(
            nn.Conv2d(in_channels=3,out_channels=64,kernel_size=7,stride=2,padding=3),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            )
        

         self.stage1_maxpool = nn.Sequential(
           nn.MaxPool2d(kernel_size=3,stride=2,padding=1), 
            )
        # 56*56
         self.stage_cfgs = [
              # in_channel, #blocks
            [64, 128, 3],
            [128, 256, 4], 
            [256, 512, 6], 
            [512, 512, 3], 
            ]
                           
         layers = []
         #pdb.set_trace()
         i=0
         for curr_stage in self.stage_cfgs:
             in_channels, out_channels, num_blocks = curr_stage
             for block_idx in range(num_blocks):
                 print(i)
                 print(block_idx)
                 if block_idx==0: 
                       if i==0:
                          stride=1
                          
                       else:
                          stride=2
                                  
                 else: 
                       stride=1
                 layers.append(StageLayers(
                 in_channels = in_channels,
                 out_channels = out_channels if block_idx==num_blocks-1 else in_channels, 
                 stride= stride, 
                 ))
            
             i=i+1  
                 
                # in_channels=num_channels
                # In channels of the next block is the out_channels of the current one
                # in_channels = in_channels*4 
            
         self.layers = nn.Sequential(*layers) # Done, save them to the class

       
         self.cls_layer = nn.Sequential(
             #nn.Dropout(p=0.1),
             nn.AdaptiveAvgPool2d((1,1)),
             nn.Flatten(),
             nn.Linear(512,num_classes),
           )

      #   self._initialize_weights()

     #def _initialize_weights(self):
     #   """
     #   Usually, I like to use default pytorch initialization for stuff, but
     #   MobileNetV2 made a point of putting in some custom ones, so let's just
     #   use them.
     #   """
     #   for m in self.modules():
     #       if isinstance(m, nn.Conv2d):
     #          n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
     #           m.weight.data.normal_(0, math.sqrt(2. / n))
     #           if m.bias is not None:
     #              m.bias.data.zero_()
     #       elif isinstance(m, nn.BatchNorm2d):
     #           m.weight.data.fill_(1)
     #          m.bias.data.zero_()
     #       elif isinstance(m, nn.Linear):
     #           m.weight.data.normal_(0, 0.01)
     #          m.bias.data.zero_()

     def forward(self, x):
        out = self.stem(x)
        #out = self.drop_block(out)
        out = self.stage1_maxpool(out)
        out = self.layers(out)
        #print(out.shape)
        out = self.cls_layer(out)
        
        

        return out

MobileNet

In [None]:
class InvertedResidualBlock(nn.Module):
    """
    Intuitively, layers in MobileNet can be split into "feature mixing" 
    and "spatial mixing" layers. You can think of feature mixing as each pixel
    "thinking on its own" about its own featuers, and you can think of spatial
    mixing as pixels "talking with each other". Alternating these two builds
    up a CNN.

    In a bit more detail:

    - The purpose of the "feature mixing" layers is what you've already seen in 
    hw1p2. Remember, in hw1p2, we went from some low-level audio input to
    semantically rich representations of phonemes. Featuring mixing is simply a 
    linear layer (a weight matrix) that transforms simpler features into 
    something more advanced.

    - The purpose of the "spatial mixing" layers is to mix features from different
    spatial locations. You can't figure out a face by looking at each pixel on
    its own, right? So we need 3x3 convolutions to mix features from neighboring
    pixels to build up spatially larger features.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__() # Just have to do this for all nn.Module classes
        self.stride=stride
        self.in_channels=in_channels
        self.out_channels=out_channels
        # Can only do identity residual connection if input & output are the
        # same channel & spatial shape.
        #if (stride == 1 and in_channels == out_channels):
        #    self.do_identity = True
        #else:
        #    self.do_identity = False
        
        # Expand Ratio is like 6, so hidden_dim >> in_channels
        hidden_dim = in_channels * expand_ratio

        """
        What is this doing? It's a 1x1 convolutional layer that drastically
        increases the # of channels (feature dimension). 1x1 means each pixel
        is thinking on its own, and increasing # of channels means the network
        is seeing if it can "see" more clearly in a higher dimensional space.

        Some patterns are just more obvious/separable in higher dimensions.

        Also, note that bias = False since BatchNorm2d has a bias term built-in.

        As you go, note the relationship between kernel_size and padding. As you
        covered in class, padding = kernel_size // 2 (kernel_size being odd) to
        make sure input & output spatial resolution is the same.
        """
        self.feature_mixing = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(in_channels=in_channels,out_channels=hidden_dim,kernel_size=1,stride=1,padding=0),
            nn.BatchNorm2d(num_features=hidden_dim),
            nn.ReLU6(),
        )

        """
        What is this doing? Let's break it down.
        - kernel_size = 3 means neighboring pixels are talking with each other.
          This is different from feature mixing, where kernel_size = 1.

        - stride. Remember that we sometimes want to downsample spatially. 
          Downsampling is done to reduce # of pixels (less computation to do), 
          and also to increase receptive field (if a face was 32x32, and now
          it's 16x16, a 3x3 convolution covers more of the face, right?). It
          makes sense to put the downsampling in the spatial mixing portion
          since this layer is "in charge" of messing around spatially anyway.

          Note that most of the time, stride is 1. It's just the first block of
          every "stage" (layer \subsetof block \subsetof stage) that we have
          stride = 2.

        - groups = hidden_dim. Remember depthwise separable convolutions in 
          class? If not, it's fine. Usually, when we go from hidden_dim channels
          to hidden_dim channels, they're densely connected (like a linear 
          layer). So you can think of every pixel/grid in an input
          3 x 3 x hidden_dim block being connected to every single pixel/grid 
          in the output 3 x 3 x hidden_dim block.
          What groups = hidden_dim does is remove a lot of these connections.

          Now, each input 3 x 3 block/region is densely connected to the
          corresponding output 3 x 3 block/region. This happens for each of the
          hidden_dim input/output channel pairs independently.
          So we're not even mixing different channels together - we're only 
          mixing spatial neighborhoods. 
          
          Try to draw this out, or come to my (Jinhyung Park)'s OH if you want 
          a more in-depth explanation.
          https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
        """
        self.spatial_mixing = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(hidden_dim,hidden_dim,kernel_size=3,stride=self.stride,padding=1,groups=hidden_dim,bias=False),
            nn.BatchNorm2d(num_features=hidden_dim),
            nn.ReLU6(),
        )

        """
        What's this? Remember that hidden_dim is quite large - six times the 
        in_channels. So it was nice to do the above operations in this high-dim
        space, where some patterns might be more clear. But we still want to 
        bring it back down-to-earth.

        Intuitively, you can takeaway two reasons for doing this:
        - Reduces computational cost by a lot. 6x in & out channels means 36x
          larger weights, which is crazy. We're okay with just one of input or 
          output of a convolutional layer being large when mixing channels, but 
          not both.
        
        - We also want a residual connection from the input to the output. To 
          do that without introducing another convolutional layer, we want to
          condense the # of channels back to be the same as the in_channels.
          (out_channels and in_channels are usually the same).
        """
        self.bottleneck_channels = nn.Sequential(
            # TODO: Fill this in!
            nn.Conv2d(in_channels=hidden_dim,out_channels=out_channels,kernel_size=1,stride=1,padding=0,bias=False),
            nn.BatchNorm2d(num_features=out_channels),
        )

    def forward(self, x):
        #print("X",x.shape)
        out = self.feature_mixing(x)
        #print("Feature Mixing",out.shape)
        out = self.spatial_mixing(out)
        #print("Spatial Mixing",out.shape)
        out = self.bottleneck_channels(out)
        #print("BottleNeck Mixing",out.shape)

        if self.stride == 1 and self.in_channels == self.out_channels:
                return x + out
        else:
             return out

class MobileNet(nn.Module):
    """
    The heavy lifting is already done in InvertedBottleneck.

    Why MobileNetV2 and not V3? V2 is the foundation for V3, which uses "neural
    architecture search" to find better configurations of V2. If you understand
    V2 well, you can totally implement V3!
    """
    def __init__(self, num_classes= 7000):
        super().__init__() # Already features indented from previous class. 

        self.num_classes = num_classes

        """
        First couple of layers are special, just do them here.
        This is called the "stem". Usually, methods use it to downsample or twice.
        """
        
        self.stem = nn.Sequential(
          nn.Conv2d(in_channels=3,out_channels=32,kernel_size=3,stride=2,padding=1,bias=False),
          nn.BatchNorm2d(num_features=32),
          nn.ReLU6(),
          nn.Conv2d(in_channels=32,out_channels=32,kernel_size=3,stride=1,padding=1,bias=False,groups=32),
          nn.BatchNorm2d(num_features=32),
          nn.ReLU6(),
          nn.Conv2d(in_channels=32,out_channels=16,kernel_size=1,stride=1,padding=0,bias=False),
          nn.BatchNorm2d(num_features=16),
        )

        """
        Since we're just repeating InvertedResidualBlocks again and again, we
        want to specify their parameters like this.
        The four numbers in each row (a stage) are shown below.
        - Expand ratio: We talked about this in InvertedResidualBlock
        - Channels: This specifies the channel size before expansion
        - # blocks: Each stage has many blocks, how many?
        - Stride of first block: For some stages, we want to downsample. In a
          downsampling stage, we set the first block in that stage to have
          stride = 2, and the rest just have stride = 1.

        Again, note that almost every stage here is downsampling! By the time
        we get to the last stage, what is the image resolution? Can it still
        be called an image for our dataset? Think about this, and make changes
        as you want.
        """
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6,  24, 2, 2], #24  stride: 2
            [6,  32, 3, 2], #32  stride: 2
            [6,  64, 4, 2], #64  stride: 2
            [6,  96, 3, 1], #96  stride: 1
            [6, 160, 3, 2], #160 stride: 2
            [6, 320, 1, 1], #320 stride: 1
           ]

        # Remember that our stem left us off at 16 channels. We're going to 
        # keep updating this in_channels variable as we go
        in_channels = 16

        # Let's make the layers
        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage
            out_channels = num_channels
            for block_idx in range(num_blocks):
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only have non-trivial stride if first block
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels 
            
        self.layers = nn.Sequential(*layers) # Done, save them to the class

        # Some final feature mixing
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,out_channels=1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6(),
        )

        # Now, we need to build the final classification layer.
         
        self.cls_layer = nn.Sequential(
            # TODO: Fill this in!
            # Pool over & collapse the spatial dimensions to (1, 1)
            # Collapse the trivial (1, 1) dimensions
            # Project to our # of classes
            nn.AvgPool2d(kernel_size=(7,7)),
            nn.Flatten(),
            nn.Linear(1280,num_classes),
        )
         
        
        self._initialize_weights()

    def _initialize_weights(self):
        """
        Usually, I like to use default pytorch initialization for stuff, but
        MobileNetV2 made a point of putting in some custom ones, so let's just
        use them.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x):
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_block(out)
        out = self.cls_layer(out)

        return out

In [None]:
model.load_state_dict(model.state_dict(),"/content/drive/MyDrive/model_epoch_18.pth")
optimizer.load_state_dict(model.state_dict(),'/content/drive/MyDrive/model_epoch_18.pth')

TypeError: ignored

## Model Loading

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
save_checkpoint = torch.load("/content/drive/MyDrive/model_epoch_15.pth")

In [None]:
model.load_state_dict(save_checkpoint['model_state_dict'])

In [None]:
optimizer.load_state_dict(save_checkpoint['optimizer_state_dict'])

In [None]:
epoch = save_checkpoint['epoch']

# Setup everything for training

In [None]:
#import pdb
model = ResNet()
#model.load_state_dict(model.state_dict(),"/content/drive/MyDrive/model_epoch_18.pth")
model.cuda()


# For this homework, we're limiting you to 35 million trainable parameters, as
# outputted by this. This is to help constrain your search space and maintain
# reasonable training times & expectations
num_trainable_parameters = 0
for p in model.parameters():
    num_trainable_parameters += p.numel()
print("Number of Params: {}".format(num_trainable_parameters))

# TODO: What criterion do we use for this task?
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.2)   #+ torch.nn.Triplet Loss                     #nn.PairwiseDistance()                      #nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)  # weight decay original value: 1e-4
#optimizer.load_state_dict(model.state_dict(),"/content/drive/MyDrive/model_epoch_18.pth")
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))
# T_max is "how many times will i call scheduler.step() until it reaches 0 lr?"

# For this homework, we strongly strongly recommend using FP16 to speed up training.
# It helps more for larger models.
# Go to https://effectivemachinelearning.com/PyTorch/8._Faster_training_with_mixed_precision
# and compare "Single precision training" section with "Mixed precision training" section
scaler = torch.cuda.amp.GradScaler()

0
0
0
1
0
2
1
0
1
1
1
2
1
3
2
0
2
1
2
2
2
3
2
4
2
5
3
0
3
1
3
2
Number of Params: 17554136


# Let's train!

In [None]:
for epoch in range(epochs):
    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    num_correct = 0
    total_loss = 0

    for i, (x, y) in enumerate(train_loader):
        #print("y",y.shape)
        #print("x",x.shape)
        optimizer.zero_grad()

        x = x.cuda()
        y = y.cuda()
        
        # Don't be surprised - we just wrap these two lines to make it work for FP16
        with torch.cuda.amp.autocast():     
            outputs = model(x)
            loss = criterion(outputs, y)

        # Update # correct & loss as we go
        num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
        total_loss += float(loss)

        # tqdm lets you add some details so you can monitor training as you train.
        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            num_correct=num_correct,
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        
        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        scheduler.step() # We told scheduler T_max that we'd call step() (len(train_loader) * epochs) many times.

        batch_bar.update() # Update tqdm bar
    batch_bar.close() # You need this to close the tqdm bar

    # You can add validation per-epoch here if you would like
    

    print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
        epoch + 1,
        epochs,
        100 * num_correct / (len(train_loader) * batch_size),
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr'])))
    #torch.save(model.state_dict(),"/content/drive/MyDrive/model_epoch_"+str(epoch)+".pth")
    checkpoint= {
               'epoch': epoch,
               'model_state_dict': model.state_dict(),
               'optimizer_state_dict': optimizer.state_dict(),
               }
    torch.save(checkpoint,"/content/drive/MyDrive/model_epoch_"+str(epoch)+".pth")           
    
            



Epoch 1/70: Train Acc 99.2681%, Train Loss 2.6456, Learning Rate 0.0817




Epoch 2/70: Train Acc 99.2645%, Train Loss 2.6452, Learning Rate 0.0816




Epoch 3/70: Train Acc 99.3497%, Train Loss 2.6403, Learning Rate 0.0814




Epoch 4/70: Train Acc 99.3418%, Train Loss 2.6392, Learning Rate 0.0811




Epoch 5/70: Train Acc 99.2881%, Train Loss 2.6402, Learning Rate 0.0808




Epoch 6/70: Train Acc 99.3318%, Train Loss 2.6359, Learning Rate 0.0803




Epoch 7/70: Train Acc 99.3404%, Train Loss 2.6355, Learning Rate 0.0798




Epoch 8/70: Train Acc 99.3733%, Train Loss 2.6318, Learning Rate 0.0792




Epoch 9/70: Train Acc 99.3854%, Train Loss 2.6265, Learning Rate 0.0785




Epoch 10/70: Train Acc 99.4005%, Train Loss 2.6215, Learning Rate 0.0777




Epoch 11/70: Train Acc 99.4155%, Train Loss 2.6197, Learning Rate 0.0769




Epoch 12/70: Train Acc 99.4727%, Train Loss 2.6127, Learning Rate 0.0760




Epoch 13/70: Train Acc 99.4878%, Train Loss 2.6082, Learning Rate 0.0750




Epoch 14/70: Train Acc 99.4892%, Train Loss 2.6073, Learning Rate 0.0740




Epoch 15/70: Train Acc 99.5207%, Train Loss 2.6015, Learning Rate 0.0729




Epoch 16/70: Train Acc 99.5457%, Train Loss 2.5954, Learning Rate 0.0717




Epoch 17/70: Train Acc 99.5979%, Train Loss 2.5891, Learning Rate 0.0705


Train:  82%|████████▏ | 224/273 [13:49<02:38,  3.24s/it, acc=99.6425%, loss=2.5810, lr=0.0694, num_correct=114278]

In [None]:
! ls /content/drive

MyDrive


# Classification Task: Validation

## Load the Required Model/.pth file

In [None]:
device = torch.device("cuda")
model = ResNet().cuda() # NA 18 cuda()
model.load_state_dict(torch.load("/content/model_epoch_36.pth"))

0
0
0
1
0
2
1
0
1
1
1
2
1
3
2
0
2
1
2
2
2
3
2
4
2
5
3
0
3
1
3
2


<All keys matched successfully>

In [None]:
model.eval()
batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
num_correct = 0
for i, (x, y) in enumerate(val_loader):

    x = x.cuda()
    y = y.cuda()

    with torch.no_grad():
        outputs = model(x)

    num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
    batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)))

    batch_bar.update()
    
batch_bar.close()
print("Validation: {:.04f}%".format(100 * num_correct / len(val_dataset)))

                                                                  

Validation: 80.1200%




# Classification Task: Submit to Kaggle

In [None]:
class ClassificationTestSet(Dataset):
    # It's possible to load test set data using ImageFolder without making a custom class.
    # See if you can think it through!

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # This one-liner basically generates a sorted list of full paths to each image in data_dir
        self.img_paths = list(map(lambda fname: osp.join(self.data_dir, fname), sorted(os.listdir(self.data_dir))))

    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        return self.transforms(Image.open(self.img_paths[idx]))

In [None]:
test_dataset = ClassificationTestSet(TEST_DIR, ttf.Compose(val_transforms))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, 
                         drop_last=False, num_workers=1)

In [None]:
model.eval()
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
res = []
for i, (x) in enumerate(test_loader):
    x= x.cuda()
    
    # TODO: Finish predicting on the test set.
    with torch.no_grad():
         output=model(x)
    
    pred = torch.argmax(output,axis=1)
    res.extend(pred)

    batch_bar.update()
    
batch_bar.close()



In [None]:
with open("classification_early_submission.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(test_dataset)):
        f.write("{},{}\n".format(str(i).zfill(6) + ".jpg", res[i]))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-classification -f classification_early_submission.csv -m "New Submission"

100% 541k/541k [00:04<00:00, 127kB/s]
Successfully submitted to Face Recognition

# Verification Task: Validation

There are 6K verification dev images, but 166K "pairs" for you to compare. So, it's much more efficient to compute the features for the 6K verification images, and just compare afterwards.

This will be done by creating a dictionary mapping the image file names to the features. Then, you'll use this dictionary to compute the similarities for each pair.

In [None]:
!ls verification/verification/dev | wc -l
!cat verification/verification/verification_dev.csv | wc -l

6000
166801


In [None]:
class VerificationDataset(Dataset):
    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # This one-liner basically generates a sorted list of full paths to each image in data_dir
        self.img_paths = list(map(lambda fname: osp.join(self.data_dir, fname), sorted(os.listdir(self.data_dir))))

    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        # We return the image, as well as the path to that image (relative path)
        return self.transforms(Image.open(self.img_paths[idx])), osp.relpath(self.img_paths[idx], self.data_dir)

In [None]:
val_veri_dataset = VerificationDataset(osp.join(DATA_DIR, "verification/verification/dev"),
                                       ttf.Compose(val_transforms))
val_ver_loader = torch.utils.data.DataLoader(val_veri_dataset, batch_size=batch_size, 
                                             shuffle=False, num_workers=1)

In [None]:
#checkpoint = torch.load('model_epoch_19.pth')
model.load_state_dict(torch.load('model_epoch_42.pth')['model_state_dict'])
optimizer.load_state_dict(torch.load('model_epoch_42.pth')['optimizer_state_dict'])

<All keys matched successfully>

In [None]:
model.eval()

feats_dict = dict()
for batch_idx, (imgs, path_names) in tqdm(enumerate(val_ver_loader), total=len(val_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # Note that we return the feats here, not the final outputs
        # Feel free to try the final outputs too!
        feats = model(imgs)                  #return_feats=True
        print(feats.size())
    # TODO: Now we have features and the image path names. What to do with them?
        #Convert tuple(path_names) into list(path_names)
        i=0
        
        for pathnames in list(path_names):
          #print(i)
          X = osp.join("dev/",pathnames)
          #feats_dict1 = {X:feats[i]}
          #print(i)
          #feats_dict = feats_dict.update({X:feats[i]})
          #feats_dict[(batch_idx,X)] = {feats[i]}  #("dev/"+str(pathnames)
          #feats_dict[(batch_idx,X)] = {feats[i]}
          feats_dict[X] = feats[i]
          i=i+1
    # Hint: use the feats_dict somehow.

  8%|▊         | 1/12 [00:01<00:13,  1.26s/it]

torch.Size([512, 7000])


 17%|█▋        | 2/12 [00:02<00:11,  1.13s/it]

torch.Size([512, 7000])


 25%|██▌       | 3/12 [00:03<00:10,  1.11s/it]

torch.Size([512, 7000])


 33%|███▎      | 4/12 [00:04<00:08,  1.06s/it]

torch.Size([512, 7000])


 42%|████▏     | 5/12 [00:05<00:07,  1.07s/it]

torch.Size([512, 7000])


 50%|█████     | 6/12 [00:06<00:06,  1.04s/it]

torch.Size([512, 7000])


 58%|█████▊    | 7/12 [00:07<00:05,  1.06s/it]

torch.Size([512, 7000])


 67%|██████▋   | 8/12 [00:08<00:04,  1.03s/it]

torch.Size([512, 7000])


 75%|███████▌  | 9/12 [00:09<00:03,  1.05s/it]

torch.Size([512, 7000])


 83%|████████▎ | 10/12 [00:10<00:02,  1.02s/it]

torch.Size([512, 7000])


 92%|█████████▏| 11/12 [00:11<00:01,  1.04s/it]

torch.Size([512, 7000])


                                               

torch.Size([368, 7000])




In [None]:
# What does this dict look like?
print(list(feats_dict.items())[1])

('dev/000f15b775.jpg', tensor([1.6615e-02, 0.0000e+00, 5.0548e-03, 0.0000e+00, 1.6475e-02, 2.0388e-04,
        0.0000e+00, 3.2699e-04, 1.3654e-02, 3.7086e-03, 1.2699e-02, 7.9263e-04,
        0.0000e+00, 5.1147e-04, 0.0000e+00, 1.3044e-02, 2.3715e-02, 1.9935e-03,
        1.2184e-02, 5.6251e-03, 2.3293e-03, 1.5844e-02, 1.9118e-04, 2.3009e-04,
        0.0000e+00, 0.0000e+00, 6.1835e-05, 1.0891e-02, 2.1861e-02, 5.2396e-04,
        5.3622e-02, 2.2622e-02, 4.5036e-03, 2.2157e-04, 0.0000e+00, 0.0000e+00,
        1.6733e-03, 4.6967e-02, 2.4195e-02, 1.6687e-02, 9.7350e-03, 2.2226e-02,
        1.6410e-02, 1.7660e-02, 2.7316e-03, 0.0000e+00, 1.1201e-02, 1.4850e-04,
        3.1912e-02, 3.7956e-02, 2.5858e-03, 1.5431e-02, 2.8290e-02, 3.5897e-05,
        8.9828e-04, 1.2853e-02, 1.4399e-02, 4.7612e-03, 2.7833e-02, 1.5064e-03,
        1.1436e-03, 3.3553e-02, 3.7235e-02, 8.4303e-03, 1.9387e-03, 2.5172e-02,
        2.9869e-03, 2.7442e-04, 1.2953e-02, 2.3115e-04, 0.0000e+00, 2.6940e-02,
        1.6744e-0

In [None]:
len(feats_dict.items())

6000

In [None]:
for keys in list(feats_dict.keys()):
    print(keys[0])
    print(keys[1])
    keys1=keys 
    


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e
d
e

In [None]:
list(X)[0]

tensor([1.7886e-02, 2.2069e-04, 1.3272e-02, 6.9363e-04, 9.5688e-03, 0.0000e+00,
        6.5521e-05, 2.5615e-03, 1.8756e-02, 9.7871e-03, 7.4027e-03, 1.0770e-03,
        1.6101e-03, 2.2303e-03, 0.0000e+00, 2.2621e-02, 3.0591e-02, 3.5864e-04,
        2.6656e-02, 7.9551e-03, 0.0000e+00, 3.0935e-02, 3.9138e-05, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.1283e-03, 1.9843e-02, 2.4212e-02, 3.8281e-04,
        7.8426e-02, 1.4956e-02, 4.9265e-03, 0.0000e+00, 0.0000e+00, 1.6317e-03,
        2.1364e-03, 6.9883e-02, 1.8867e-02, 2.5655e-02, 1.8613e-02, 3.5373e-02,
        2.6826e-02, 3.1326e-02, 6.1110e-03, 4.5165e-04, 1.6743e-02, 3.0892e-04,
        3.8503e-02, 4.1356e-02, 9.8160e-03, 1.7986e-02, 3.6644e-02, 0.0000e+00,
        8.0677e-04, 2.3378e-02, 1.0091e-02, 1.0237e-02, 3.5172e-02, 6.8937e-03,
        4.5925e-03, 4.3300e-02, 4.8760e-02, 8.6678e-03, 3.7915e-03, 3.5341e-02,
        6.1794e-03, 7.5579e-04, 1.2173e-02, 1.7853e-03, 0.0000e+00, 3.2194e-02,
        7.7248e-03, 9.7105e-03, 0.0000e+

In [None]:
# We use cosine similarity between feature embeddings.
# TODO: Find the relevant function in pytorch and read its documentation.
# similarity_metric = 

similarity_metric = nn.CosineSimilarity(dim=0)

val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")

#feats_dict = feats_dict.cpu()

# Now, loop through the csv and compare each pair, getting the similarity between them
pred_similarities = []
gt_similarities = []
for line in tqdm(open(val_veri_csv).read().splitlines()[1:], position=0, leave=False): # skip header
    img_path1, img_path2, gt = line.split(",")

    # TODO: Use the similarity metric
    # How to use these img_paths? What to do with the features?
    # similarity = similarity_metric(...)
    #res1=-3
    #res2=-3
    
    #for batch_idx in range(0,len(feats_dict.values())):
       # Feature img_path1
     #  for keys in list(feats_dict[batch_idx].keys()):
      #   if keys == img_path1:
       #    X = feats_dict[batch_idx].get(keys)
        #   torch.as_tensor(X)
         #  print('1')
         #if keys == img_path2:
         #  Y = feats_dict[batch_idx].get(keys)
         # torch.as_tensor(Y)
         #  print('2')  
    #similarity = similarity_metric(X.cpu(),Y.cpu())
    #similarity = torch.cdist(feats_dict[img_path1].cpu(),feats_dict[img_path2].cpu()) 

      #res1 = [idx for idx, key in enumerate(list(feats_dict[batch_idx].items())) if key[0] == img_path1 ]
      #res2 = [idx for idx, key in enumerate(list(feats_dict[batch_idx].items())) if key[0] == img_path2 ]

      #if res1!=-3 or res2!=-3:
    
    #for batch_idx in range(0,len(feats_dict.values())): 
    #  for keys in list(feats_dict.keys()):
    #     if (keys[1] == img_path1):   
    #        Z1 = feats_dict[keys]

    #for batch_idx in range(0,len(feats_dict.values())): 
    #  for keys in list(feats_dict.keys()):
    #     if keys[1] == img_path2:   
    #        Z2 = feats_dict[keys]       
    #print(Z1)
    #print(Z2)     
    #Z1 = list(Z1)   
    #Z2 = list(Z2)          
    
    similarity = similarity_metric(feats_dict[img_path1].cpu(),feats_dict[img_path2].cpu())

    #similarity = similarity_metric(feats_dict[batch_idx][i][j])
    pred_similarities.append(similarity)
    gt_similarities.append(int(gt))

t_cpu = pred_similarities
#horizontal_translation_numpy = pred_similarities.cpu()
#print(pred_similarities[1:3])
pred_similarities = np.array(t_cpu)
gt_similarities = np.array(gt_similarities)

print("AUC:", roc_auc_score(gt_similarities, pred_similarities))



AUC: 0.9347770498132866


# Verification Task: Submit to Kaggle

In [None]:
test_veri_dataset = VerificationDataset(osp.join(DATA_DIR, "verification/verification/test"),
                                        ttf.Compose(val_transforms))
test_ver_loader = torch.utils.data.DataLoader(test_veri_dataset, batch_size=batch_size, 
                                              shuffle=False, num_workers=1)

In [None]:
model.load_state_dict(torch.load('model_epoch_42.pth'))

<All keys matched successfully>

In [None]:
model.eval()

feats_dict = dict()
for batch_idx, (imgs, path_names) in tqdm(enumerate(test_ver_loader), total=len(test_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # Note that we return the feats here, not the final outputs
        # Feel free to try to final outputs too!
        feats = model(imgs) 
       
    i=0
    # TODO: Now we have features and the image path names. What to do with them?
    for pathnames in path_names:      
          X = osp.join("test/",pathnames)   
          feats_dict[X]= feats[i]
          i=i+1
    # Hint: use the feats_dict somehow.



In [None]:
# We use cosine similarity between feature embeddings.
# TODO: Find the relevant function in pytorch and read its documentation.
# similarity_metric = 
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")

similarity_metric = nn.CosineSimilarity(dim=0)

# Now, loop through the csv and compare each pair, getting the similarity between them
pred_similarities = []
for line in tqdm(open(val_veri_csv).read().splitlines()[1:], position=0, leave=False): # skip header
    img_path1, img_path2 = line.split(",")

    # TODO: Finish up verification testing.
    similarity = similarity_metric(feats_dict[img_path1].cpu(),feats_dict[img_path2].cpu())

    pred_similarities.append(similarity)
    
    # How to use these img_paths? What to do with the features?
t_cpu = pred_similarities
#horizontal_translation_numpy = pred_similarities.cpu()
#print(pred_similarities[1:3])
pred_similarities = np.array(t_cpu)



In [None]:
len(pred_similarities)

667600

In [None]:
with open("verification_submission.csv", "w+") as f:
    f.write("id,match\n")
    for i in range(len(pred_similarities)):
        f.write("{},{}\n".format(i, pred_similarities[i]))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-verification -f verification_submission.csv -m New_Submission

100% 17.2M/17.2M [00:04<00:00, 4.28MB/s]
Successfully submitted to Face Verification

# Extras

In [None]:
# If you keep re-initializing your model in Colab, can run out of GPU memory, need to restart.
# These three lines can help that - run this before you re-initialize your model

del model
torch.cuda.empty_cache()
!nvidia-smi

Fri Feb 18 07:47:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    32W / 250W |   2951MiB / 16280MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
model.load_state_dict(torch.load('model_epoch_42.pth')['model_state_dict'])
optimizer.load_state_dict(torch.load('model_epoch_42.pth')['optimizer_state_dict'])

In [None]:
model.load_state_dict(torch.load('model_epoch_42.pth')['model_state_dict'])
optimizer.load_state_dict(torch.load('model_epoch_42.pth')['optimizer_state_dict'])

In [None]:
9

In [None]:
save_checkpoint = torch.load("/content/drive/MyDrive/model_epoch_9.pth")