In [1]:
# !pip install torchdynamo

In [2]:
import cv2
import glob
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torch import nn, optim
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights, vgg11, VGG11_Weights
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url
from torchsummary import summary
from PIL import Image
from google.colab import drive

In [3]:
!pip install timm

Collecting timm
  Downloading timm-0.9.12-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: timm
Successfully installed timm-0.9.12


In [4]:
import timm

### Create a Label Map

In [5]:
labels = [
    {'name': 'hello', 'id': 1},
    {'name': 'yes', 'id': 2},
    {'name': 'no', 'id': 3},
    {'name': 'thanks', 'id': 4},
    {'name': 'i love you', 'id': 5},
]

In [6]:
!mkdir "/content/annotations/"

In [7]:
with open('/content/annotations/label_map.pbtxt', 'w') as f:
    for label in labels:
        f.write('item { \n')
        f.write(f'\tname:\'{label["name"]}\'\n')
        f.write(f'\tid:{label["id"]}\n')
        f.write('}\n')

### Connecting to Colab to get Images

In [8]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
!unzip "/content/gdrive/MyDrive/IA/signLanguageImages.zip" -d "/content/images"

Archive:  /content/gdrive/MyDrive/IA/signLanguageImages.zip
   creating: /content/images/test/
  inflating: /content/images/test/hello.1636369e-b0f2-11ee-bc1a-5b91ca12c195.jpg  
  inflating: /content/images/test/hello.1636369e-b0f2-11ee-bc1a-5b91ca12c195.xml  
  inflating: /content/images/test/hello.17697030-b0f2-11ee-bc1a-5b91ca12c195.jpg  
  inflating: /content/images/test/hello.17697030-b0f2-11ee-bc1a-5b91ca12c195.xml  
  inflating: /content/images/test/iloveyou.643300ca-b0f2-11ee-bc1a-5b91ca12c195.jpg  
  inflating: /content/images/test/iloveyou.643300ca-b0f2-11ee-bc1a-5b91ca12c195.xml  
  inflating: /content/images/test/iloveyou.65662e54-b0f2-11ee-bc1a-5b91ca12c195.jpg  
  inflating: /content/images/test/iloveyou.65662e54-b0f2-11ee-bc1a-5b91ca12c195.xml  
  inflating: /content/images/test/no.54990236-b0f2-11ee-bc1a-5b91ca12c195.jpg  
  inflating: /content/images/test/no.54990236-b0f2-11ee-bc1a-5b91ca12c195.xml  
  inflating: /content/images/test/no.58320d48-b0f2-11ee-bc1a-5b91ca12

In [10]:
class SignDataset(Dataset):
    def __init__(self, root_dir, split='train', transform=None,
                 device='cpu'):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.device = device

        self.image_label_pairs = []
        split_dir = os.path.join(root_dir, split)

        for filename in os.listdir(split_dir):
            if filename.endswith('.jpg'):
                img_path = os.path.join(split_dir, filename)
                label_filename = os.path.splitext(filename)[0] + '.xml'
                label_path = os.path.join(split_dir, label_filename)
                if os.path.exists(label_path):
                    self.image_label_pairs.append((img_path, label_path))

    def __len__(self):
        return len(self.image_label_pairs)

    def __getitem__(self, idx):
        img_path, label_path = self.image_label_pairs[idx]

        # Load Image
        img = Image.open(img_path).convert('RGB')

        # Load XML file and extract label information
        tree = ET.parse(label_path)
        root = tree.getroot()

        # Extract label information
        object_elem = root.find('object')
        class_name = object_elem.find('name').text
        bbox_elem = object_elem.find('bndbox')
        xmin = int(bbox_elem.find('xmin').text)
        ymin = int(bbox_elem.find('ymin').text)
        xmax = int(bbox_elem.find('xmax').text)
        ymax = int(bbox_elem.find('ymax').text)

        # Create label information
        label_info = {
            'class_name': class_name,
            'bbox': [xmin, ymin, xmax, ymax]
        }

        # Apply transformations
        if self.transform:
            img = self.transform(img)

        return {'image': img, 'label': label_info}

### Data Preprocessing

In [11]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Custom Dataset

In [13]:
train_dataset = SignDataset(root_dir='/content/images', split='train',
                            transform=transform, device=device)
test_dataset = SignDataset(root_dir='/content/images', split='test',
                           transform=transform, device=device)

In [44]:
test_dataset[0]

{'image': tensor([[[0.2863, 0.2549, 0.2510,  ..., 0.2392, 0.2431, 0.2471],
          [0.2863, 0.2549, 0.2510,  ..., 0.2314, 0.2353, 0.2353],
          [0.2667, 0.2431, 0.2549,  ..., 0.2196, 0.2235, 0.2196],
          ...,
          [0.1059, 0.1137, 0.1059,  ..., 0.1098, 0.1176, 0.1137],
          [0.1176, 0.1176, 0.1098,  ..., 0.1098, 0.1098, 0.1098],
          [0.1020, 0.1137, 0.1216,  ..., 0.0980, 0.1176, 0.1176]],
 
         [[0.3333, 0.3176, 0.3137,  ..., 0.2118, 0.2118, 0.2078],
          [0.3412, 0.3176, 0.3137,  ..., 0.2196, 0.2275, 0.2196],
          [0.3451, 0.3255, 0.3255,  ..., 0.2196, 0.2353, 0.2275],
          ...,
          [0.1059, 0.1137, 0.1059,  ..., 0.0941, 0.0941, 0.0980],
          [0.1255, 0.1255, 0.1176,  ..., 0.0941, 0.0863, 0.0863],
          [0.0980, 0.1098, 0.1137,  ..., 0.0902, 0.0902, 0.0941]],
 
         [[0.3451, 0.3490, 0.3647,  ..., 0.2157, 0.2235, 0.2275],
          [0.3490, 0.3490, 0.3608,  ..., 0.2235, 0.2275, 0.2235],
          [0.3412, 0.3333, 0.35

### Getting the Pretrained Model

In [28]:
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url

def get_state_dict(self, *args, **kwargs):
    kwargs.pop("check_hash")
    return load_state_dict_from_url(self.url, *args, **kwargs)
WeightsEnum.get_state_dict = get_state_dict

efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
model = efficientnet_b0(weights="DEFAULT")

### Freezing Model's Weights

In [None]:
# for param in model.parameters():
#     param.requires_grad = False

### Modifying the Final Fully Connected Layer

In [29]:
model.classifier

Sequential(
  (0): Dropout(p=0.2, inplace=True)
  (1): Linear(in_features=1280, out_features=1000, bias=True)
)

In [30]:
NUM_CLASSES = 6
model.classifier = nn.Linear(model.classifier[-1].in_features, NUM_CLASSES)
model.classifier

Linear(in_features=1280, out_features=6, bias=True)

In [None]:
# input_data = torch.randn(3, 224, 224).to(device)
# model = model.to(device)
# summary(model, input_data.shape[1:])

### Setting Up Loss Function and Optimizer

In [31]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

### Training Loop

In [33]:
EPOCHS = 10

In [34]:
class_index_mapping = {label['name']: label['id'] for label in labels}
class_index_mapping

{'hello': 1, 'yes': 2, 'no': 3, 'thanks': 4, 'i love you': 5}

In [35]:
train_dataset[0]

{'image': tensor([[[0.3020, 0.2824, 0.2902,  ..., 0.2353, 0.2471, 0.2510],
          [0.3098, 0.2941, 0.2980,  ..., 0.2235, 0.2353, 0.2431],
          [0.3137, 0.3059, 0.3098,  ..., 0.2275, 0.2431, 0.2510],
          ...,
          [0.7647, 0.7882, 0.8196,  ..., 0.1647, 0.1686, 0.1686],
          [0.7725, 0.7961, 0.8196,  ..., 0.1608, 0.1765, 0.1804],
          [0.7647, 0.7882, 0.8118,  ..., 0.1529, 0.1725, 0.1765]],
 
         [[0.4039, 0.3647, 0.3373,  ..., 0.2510, 0.2588, 0.2549],
          [0.3922, 0.3608, 0.3373,  ..., 0.2471, 0.2510, 0.2510],
          [0.3804, 0.3529, 0.3333,  ..., 0.2471, 0.2510, 0.2471],
          ...,
          [0.7647, 0.7922, 0.8275,  ..., 0.1647, 0.1608, 0.1608],
          [0.7843, 0.8039, 0.8392,  ..., 0.1608, 0.1608, 0.1569],
          [0.7961, 0.8196, 0.8471,  ..., 0.1529, 0.1490, 0.1373]],
 
         [[0.4235, 0.3961, 0.3843,  ..., 0.2745, 0.2745, 0.2706],
          [0.4392, 0.4157, 0.3961,  ..., 0.2784, 0.2784, 0.2745],
          [0.4235, 0.4078, 0.39

In [36]:
device_torch = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device_torch)
next(model.parameters()).is_cuda

False

In [37]:
for epoch in range(EPOCHS):
    running_loss = 0.0
    for idx in range(len(train_dataset)):
        sample = train_dataset[idx]
        x, y = sample['image'], sample['label']
        x = x.to(device)
        x = x.unsqueeze(0)
        # x = x.view(-1, 224, 224, 3)
        # x = x.permute(0, 3, 1, 2)
        # print(x.shape)

        # Forward pass
        y_pred = model(x)
        y_pred = y_pred.to(device)
        label = torch.tensor(class_index_mapping[y['class_name']], device=device)

        loss = loss_fn(y_pred, label.unsqueeze(0))
        # Zero the gradients
        optimizer.zero_grad()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {running_loss / len(train_dataset)}')

Epoch [1/10], Loss: 1.7683257964941173
Epoch [2/10], Loss: 1.604168334374061
Epoch [3/10], Loss: 1.4922878485459548
Epoch [4/10], Loss: 1.3488183470872732
Epoch [5/10], Loss: 1.1874702765391423
Epoch [6/10], Loss: 1.001362753372926
Epoch [7/10], Loss: 0.8920926997294792
Epoch [8/10], Loss: 0.6930442250691927
Epoch [9/10], Loss: 0.5241159890706723
Epoch [10/10], Loss: 0.4001792924908491


### Evaluating the Model

In [52]:
model = model.to(device)

model.eval()
with torch.no_grad():
    for epoch in range(EPOCHS):
        running_loss_test = 0.0
        for idx in range(len(test_dataset)):
            sample = test_dataset[idx]
            x, y = sample['image'], sample['label']

            # Forward pass
            y_pred = model(x.unsqueeze(0))
            # print(y_pred)
            #tensor([[-2.3828,  0.5092,  1.6678,  0.0214, -0.8732,  0.6362]],
            y_pred = y_pred.to(device)
            label = torch.tensor(class_index_mapping[y['class_name']])\
                    .to(device)
            loss = loss_fn(y_pred, label.unsqueeze(0))

            running_loss_test += loss.item()

        print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {running_loss_test / len(test_dataset)}')

Epoch [1/10], Loss: 0.8505194678902626
Epoch [2/10], Loss: 0.8505194678902626
Epoch [3/10], Loss: 0.8505194678902626
Epoch [4/10], Loss: 0.8505194678902626
Epoch [5/10], Loss: 0.8505194678902626
Epoch [6/10], Loss: 0.8505194678902626
Epoch [7/10], Loss: 0.8505194678902626
Epoch [8/10], Loss: 0.8505194678902626
Epoch [9/10], Loss: 0.8505194678902626
Epoch [10/10], Loss: 0.8505194678902626


### Saving the Weights of the Model

In [133]:
state = model.state_dict()
state['classifier.1.weight'] = state['classifier.weight']
del state['classifier.weight']
state['classifier.1.bias'] = state['classifier.bias']
del state['classifier.bias']

In [160]:
for key, value in state.items():
    print(key)

features.0.0.weight
features.0.1.weight
features.0.1.bias
features.0.1.running_mean
features.0.1.running_var
features.0.1.num_batches_tracked
features.1.0.block.0.0.weight
features.1.0.block.0.1.weight
features.1.0.block.0.1.bias
features.1.0.block.0.1.running_mean
features.1.0.block.0.1.running_var
features.1.0.block.0.1.num_batches_tracked
features.1.0.block.1.fc1.weight
features.1.0.block.1.fc1.bias
features.1.0.block.1.fc2.weight
features.1.0.block.1.fc2.bias
features.1.0.block.2.0.weight
features.1.0.block.2.1.weight
features.1.0.block.2.1.bias
features.1.0.block.2.1.running_mean
features.1.0.block.2.1.running_var
features.1.0.block.2.1.num_batches_tracked
features.2.0.block.0.0.weight
features.2.0.block.0.1.weight
features.2.0.block.0.1.bias
features.2.0.block.0.1.running_mean
features.2.0.block.0.1.running_var
features.2.0.block.0.1.num_batches_tracked
features.2.0.block.1.0.weight
features.2.0.block.1.1.weight
features.2.0.block.1.1.bias
features.2.0.block.1.1.running_mean
feat

In [53]:
from pathlib import Path

In [161]:
MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)
MODEL_NAME = 'model.pth'
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

print(f'Saving model to: {MODEL_SAVE_PATH}')
torch.save(obj=state, f=MODEL_SAVE_PATH)

Saving model to: models/model.pth


### Loading the Model

In [162]:
loaded_model = efficientnet_b0()

- Shape of original model's weight is ([1000, 1280]), need it to be ([6, 1280])
- Shape of original model's bias is ([1000]), need it to be ([6])

In [164]:
loaded_model.classifier

Sequential(
  (0): Dropout(p=0.2, inplace=True)
  (1): Linear(in_features=1280, out_features=1000, bias=True)
)

In [165]:
loaded_model.classifier = nn.Linear(loaded_model.classifier[-1].in_features, NUM_CLASSES)
loaded_model.classifier

Linear(in_features=1280, out_features=6, bias=True)

In [166]:
loaded_model.load_state_dict(torch.load(MODEL_SAVE_PATH), strict=False)
loaded_model.classifier

Linear(in_features=1280, out_features=6, bias=True)

In [39]:
### Convert XML Images to CSV

# def xml_to_csv(path):
#     xml_list = []
#     for xml_file in glob.glob(path + '/*.xml'):
#         tree = ET.parse(xml_file)
#         root = tree.getroot()
#         for child in root.findall('object'):
#             # print(child)
#             # Following the structure of the XML images at /images
#             value = (root.find('filename').text,
#                      int(root.find('size')[0].text),
#                      int(root.find('size')[0].text),
#                      child[0].text,
#                      int(child[4][0].text),
#                      int(child[4][1].text),
#                      int(child[4][2].text),
#                      int(child[4][3].text)
#                      )
#             #print(f'\nValues: \n{value}')
#             xml_list.append(value)

#     column_name = ['filename', 'width', 'height',
#                    'class', 'xmin', 'ymin', 'xmax', 'ymax']
#     xml_df = pd.DataFrame(xml_list, columns=column_name)
#     return xml_df