In [1]:
import cv2
import glob
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torch import nn, optim
import torch.nn.functional as F
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights, vgg11, VGG11_Weights
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url
from torchsummary import summary
from PIL import Image
from google.colab import drive

### Create a Label Map

In [2]:
labels = [
    {'name': 'hello', 'id': 1},
    {'name': 'yes', 'id': 2},
    {'name': 'no', 'id': 3},
    {'name': 'thanks', 'id': 4},
    {'name': 'i love you', 'id': 5},
]

In [3]:
!mkdir "/content/annotations/"

In [4]:
with open('/content/annotations/label_map.pbtxt', 'w') as f:
    for label in labels:
        f.write('item { \n')
        f.write(f'\tname:\'{label["name"]}\'\n')
        f.write(f'\tid:{label["id"]}\n')
        f.write('}\n')

### Connecting to Colab to get Images

In [5]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
!unzip "/content/gdrive/MyDrive/IA/SignLanguageImages.zip" -d "/content/images"

Archive:  /content/gdrive/MyDrive/IA/SignLanguageImages.zip
   creating: /content/images/test/
  inflating: /content/images/test/2024-01-22-122318_1.jpg  
  inflating: /content/images/test/2024-01-22-122318_1.xml  
  inflating: /content/images/test/2024-01-22-122318_2.jpg  
  inflating: /content/images/test/2024-01-22-122318_2.xml  
  inflating: /content/images/test/2024-01-22-122318_3.jpg  
  inflating: /content/images/test/2024-01-22-122318_3.xml  
  inflating: /content/images/test/2024-01-22-122318_4.jpg  
  inflating: /content/images/test/2024-01-22-122318_4.xml  
  inflating: /content/images/test/2024-01-22-122536_1.jpg  
  inflating: /content/images/test/2024-01-22-122536_1.xml  
  inflating: /content/images/test/2024-01-22-122536_2.jpg  
  inflating: /content/images/test/2024-01-22-122536_2.xml  
  inflating: /content/images/test/2024-01-22-122536_3.jpg  
  inflating: /content/images/test/2024-01-22-122536_3.xml  
  inflating: /content/images/test/2024-01-22-122536_4.jpg  
  inf

In [7]:
class SignDataset(Dataset):
    def __init__(self, root_dir, split='train', transform=None,
                 device='cpu'):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.device = device

        self.image_label_pairs = []
        split_dir = os.path.join(root_dir, split)

        for filename in os.listdir(split_dir):
            if filename.endswith('.jpg'):
                img_path = os.path.join(split_dir, filename)
                label_filename = os.path.splitext(filename)[0] + '.xml'
                label_path = os.path.join(split_dir, label_filename)
                if os.path.exists(label_path):
                    self.image_label_pairs.append((img_path, label_path))

    def __len__(self):
        return len(self.image_label_pairs)

    def __getitem__(self, idx):
        img_path, label_path = self.image_label_pairs[idx]

        def normalize_bbox(bbox, image_size):
            l = []
            for item in bbox:
                l.append((item / image_size) * 2 - 1)
            return l

        # Load Image
        img = Image.open(img_path).convert('RGB')

        # Convert to numpy array
        # img = np.array(img)

        # # Normalize image
        # img = img / 255.0

        # Load XML file and extract label information
        tree = ET.parse(label_path)
        root = tree.getroot()

        # Extract label information
        object_elem = root.find('object')
        class_name = object_elem.find('name').text
        bbox_elem = object_elem.find('bndbox')
        xmin = int(bbox_elem.find('xmin').text)
        ymin = int(bbox_elem.find('ymin').text)
        xmax = int(bbox_elem.find('xmax').text)
        ymax = int(bbox_elem.find('ymax').text)

        label = class_name
        bbox = [xmin, ymin, xmax, ymax]

        # Original image dimensions
        original_width, original_height = img.size

        # Apply transformations
        if self.transform:
            img = self.transform(img)

        # Resized image dimensions
        resized_width = img.shape[1]
        resized_height = img.shape[2]

        width_ratio = resized_width / original_width
        height_ratio = resized_height / original_height

        # Adjusting bbox coordinates to the new image shape
        adjusted_bbox_xmin = xmin * width_ratio
        adjusted_bbox_ymin = ymin * height_ratio
        adjusted_bbox_xmax = xmax * width_ratio
        adjusted_bbox_ymax = ymax * height_ratio

        bbox = [adjusted_bbox_xmin, adjusted_bbox_ymin,
                adjusted_bbox_xmax, adjusted_bbox_ymax]

        # Normalize bounding box coordinates
        image_size = img.shape[1]  # Get image size for normalization
        # print(image_size, bbox)
        bbox = normalize_bbox(bbox, image_size)
        # print(bbox)

        return img, label, bbox

### Data Preprocessing

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=(0, 30)),
    transforms.RandomPerspective(distortion_scale=0.3, p=1.0),
    transforms.ColorJitter(brightness=.5, hue=.3),
    transforms.ToTensor(),
])

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Custom Dataset

In [10]:
train_dataset = SignDataset(root_dir='/content/images', split='train',
                            transform=transform, device=device)
test_dataset = SignDataset(root_dir='/content/images', split='test',
                           transform=transform, device=device)

In [11]:
test_dataset[0]

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]),
 'no',
 [-0.928125, -0.55, -0.38750000000000007, 0.23750000000000004])

In [12]:
test_dataset[0][0]

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [13]:
len(train_dataset)

105

### Getting the Pretrained Model

In [36]:
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url

def get_state_dict(self, *args, **kwargs):
    kwargs.pop("check_hash")
    return load_state_dict_from_url(self.url, *args, **kwargs)
WeightsEnum.get_state_dict = get_state_dict

class CustomEfficientNetB0(nn.Module):
    def __init__(self, num_classes, num_coordinates=4):
        super(CustomEfficientNetB0, self).__init__()
        efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
        self.model = efficientnet_b0(weights="DEFAULT")
        in_features = self.model.classifier[-1].in_features

        # Altering last layer's output size
        self.model.classifier[1] = nn.Linear(1280, 1280, bias=True)

        # Freeze all layers
        for param in self.model.parameters():
            param.requires_grad = False

        self.regressor_head = nn.Sequential(
            nn.Linear(in_features, num_coordinates),
            nn.ReLU()
        )
        self.classifier_head = nn.Sequential(
            nn.Linear(in_features, num_classes),
            nn.Sigmoid()
        )

    def forward(self, x):
        y = self.model(x)
        # print(f'x: {x.shape}')
        # print(f'y: {y.shape}')

        class_logits = self.classifier_head(y)
        bbox_regression = self.regressor_head(y)
        return class_logits, bbox_regression

In [37]:
model = CustomEfficientNetB0(num_classes=6, num_coordinates=4)

In [38]:
model

CustomEfficientNetB0(
  (model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (

In [39]:
print(model.classifier_head)
print(model.regressor_head)

Sequential(
  (0): Linear(in_features=1280, out_features=6, bias=True)
  (1): Sigmoid()
)
Sequential(
  (0): Linear(in_features=1280, out_features=4, bias=True)
  (1): ReLU()
)


### Setting Up Loss Function and Optimizer

In [40]:
loss_fn_classification = nn.CrossEntropyLoss()
loss_fn_regression = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training Loop

In [41]:
EPOCHS = 10

In [42]:
class_index_mapping = {label['name']: label['id'] for label in labels}
class_index_mapping

{'hello': 1, 'yes': 2, 'no': 3, 'thanks': 4, 'i love you': 5}

In [43]:
train_dataset[0][1], train_dataset[0][2]

('thanks',
 [-0.4625, -0.11249999999999993, 0.7437499999999999, 0.8291666666666668])

In [44]:
device_torch = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device_torch)
next(model.parameters()).is_cuda

False

In [45]:
for idx in range(len(train_dataset)):
    sample = train_dataset[idx]
print(sample)

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]), 'yes', [-0.34687500000000004, -0.9958333333333333, 0.4375, -0.3375])


In [46]:
for epoch in range(EPOCHS):
    running_classification_loss = 0.0
    running_regression_loss = 0.0

    for idx in range(len(train_dataset)):
        sample = train_dataset[idx]
        # Getting the image (0), the label (1) and the bounding box (2)
        x, y, bbox = sample[0], sample[1], sample[2]
        x = x.to(device)
        x = x.unsqueeze(0)

        # Forward pass
        class_logits, bbox_regression = model(x)
        class_logits = class_logits.to(device)
        bbox_regression = bbox_regression.to(device)

        # Classification loss
        label = torch.tensor(class_index_mapping[y], device=device)
        classification_loss = loss_fn_classification(class_logits, label.unsqueeze(0))

        # Regression loss
        bbox_gt = torch.tensor(bbox, dtype=torch.float32, device=device)
        regression_loss = loss_fn_regression(bbox_regression, bbox_gt.unsqueeze(0))

        # Giving an increased weight to regression
        cls_weight = 0.5
        reg_weight = 1.0

        # Total loss
        total_loss = cls_weight * classification_loss + reg_weight * regression_loss

        # Zero the gradients
        optimizer.zero_grad()

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        running_classification_loss += classification_loss.item()
        running_regression_loss += regression_loss.item()

    print(f'Epoch [{epoch+1}/{EPOCHS}], '
          f'Classification Loss: {running_classification_loss / len(train_dataset)}, '
          f'Regression Loss: {running_regression_loss / len(train_dataset)}')

Epoch [1/10], Classification Loss: 1.7720766635168166, Regression Loss: 0.36473258407343
Epoch [2/10], Classification Loss: 1.7400719165802, Regression Loss: 0.3536955484322139
Epoch [3/10], Classification Loss: 1.7223409925188338, Regression Loss: 0.34906217462959743
Epoch [4/10], Classification Loss: 1.7080015579859416, Regression Loss: 0.34872622340917586
Epoch [5/10], Classification Loss: 1.697907040232704, Regression Loss: 0.34674246765318373
Epoch [6/10], Classification Loss: 1.6925312030883062, Regression Loss: 0.34334436576990857
Epoch [7/10], Classification Loss: 1.6902865909394764, Regression Loss: 0.34291196217139563
Epoch [8/10], Classification Loss: 1.6861977849687848, Regression Loss: 0.3467420072782607
Epoch [9/10], Classification Loss: 1.6848551954541888, Regression Loss: 0.3449827002627509
Epoch [10/10], Classification Loss: 1.68471113159543, Regression Loss: 0.3475455240834327


### Evaluating the Model

In [47]:
model = model.to(device)

model.eval()
with torch.no_grad():
    for epoch in range(EPOCHS):
        running_classification_loss = 0.0
        running_regression_loss = 0.0

        for idx in range(len(test_dataset)):
            sample = test_dataset[idx]
            # Getting the image (0), the label (1) and the bounding box (2)
            x, y, bbox = sample[0], sample[1], sample[2]
            x = x.to(device)
            x = x.unsqueeze(0)

            # Forward pass
            class_logits, bbox_regression = model(x)
            class_logits = class_logits.to(device)
            bbox_regression = bbox_regression.to(device)

            # Classification loss
            label = torch.tensor(class_index_mapping[y], device=device)
            classification_loss = loss_fn_classification(class_logits, label.unsqueeze(0))

            # Regression loss
            bbox_gt = torch.tensor(bbox, dtype=torch.float32, device=device)
            regression_loss = loss_fn_regression(bbox_regression, bbox_gt.unsqueeze(0))

            # Total loss
            total_loss = classification_loss + regression_loss

            running_classification_loss += classification_loss.item()
            running_regression_loss += regression_loss.item()

        print(f'Epoch [{epoch+1}/{EPOCHS}], '
            f'Classification Loss: {running_classification_loss / len(test_dataset)}, '
            f'Regression Loss: {running_regression_loss / len(test_dataset)}')

Epoch [1/10], Classification Loss: 1.6965954303741455, Regression Loss: 0.41345249662796657
Epoch [2/10], Classification Loss: 1.7003987669944762, Regression Loss: 0.43323517392079036
Epoch [3/10], Classification Loss: 1.6993077556292215, Regression Loss: 0.41030975927909213
Epoch [4/10], Classification Loss: 1.7031237403551738, Regression Loss: 0.4191895763079325
Epoch [5/10], Classification Loss: 1.6932000557581584, Regression Loss: 0.42276677836974463
Epoch [6/10], Classification Loss: 1.690177365144094, Regression Loss: 0.4331310580174128
Epoch [7/10], Classification Loss: 1.7012104630470275, Regression Loss: 0.43160213232040406
Epoch [8/10], Classification Loss: 1.7063844442367553, Regression Loss: 0.4360866367816925
Epoch [9/10], Classification Loss: 1.6943241953849792, Regression Loss: 0.42082010507583617
Epoch [10/10], Classification Loss: 1.7114178895950318, Regression Loss: 0.4406295508146286


### Saving the Weights of the Model

In [48]:
state = model.state_dict()
# state['classifier.1.weight'] = state['classifier.weight']
# del state['classifier.weight']
# state['classifier.1.bias'] = state['classifier.bias']
# del state['classifier.bias']

In [49]:
for key, value in state.items():
    print(key)

model.features.0.0.weight
model.features.0.1.weight
model.features.0.1.bias
model.features.0.1.running_mean
model.features.0.1.running_var
model.features.0.1.num_batches_tracked
model.features.1.0.block.0.0.weight
model.features.1.0.block.0.1.weight
model.features.1.0.block.0.1.bias
model.features.1.0.block.0.1.running_mean
model.features.1.0.block.0.1.running_var
model.features.1.0.block.0.1.num_batches_tracked
model.features.1.0.block.1.fc1.weight
model.features.1.0.block.1.fc1.bias
model.features.1.0.block.1.fc2.weight
model.features.1.0.block.1.fc2.bias
model.features.1.0.block.2.0.weight
model.features.1.0.block.2.1.weight
model.features.1.0.block.2.1.bias
model.features.1.0.block.2.1.running_mean
model.features.1.0.block.2.1.running_var
model.features.1.0.block.2.1.num_batches_tracked
model.features.2.0.block.0.0.weight
model.features.2.0.block.0.1.weight
model.features.2.0.block.0.1.bias
model.features.2.0.block.0.1.running_mean
model.features.2.0.block.0.1.running_var
model.fea

In [50]:
from pathlib import Path

In [51]:
MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)
MODEL_NAME = 'model.pth'
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

print(f'Saving model to: {MODEL_SAVE_PATH}')
torch.save(obj=state, f=MODEL_SAVE_PATH)

Saving model to: models/model.pth


### Loading the Model

In [52]:
NUM_CLASSES = 6

In [53]:
loaded_model = CustomEfficientNetB0(num_classes=NUM_CLASSES)

In [54]:
loaded_model.classifier_head, loaded_model.regressor_head

(Sequential(
   (0): Linear(in_features=1280, out_features=6, bias=True)
   (1): Sigmoid()
 ),
 Sequential(
   (0): Linear(in_features=1280, out_features=4, bias=True)
   (1): ReLU()
 ))

In [55]:
loaded_model.load_state_dict(torch.load(MODEL_SAVE_PATH), strict=False)
loaded_model

CustomEfficientNetB0(
  (model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (

def predict_on_webcam(model, labels, transform):
    cap = cv2.VideoCapture(0)  # Open the webcam

    while True:
        ret, frame = cap.read()

        # You may need to preprocess the frame, depending on the model's input requirements
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame).unsqueeze(0)

        with torch.no_grad():
            model.eval()
            output = model(frame_tensor)

        _, predicted_class = torch.max(output, 1)
        predicted_label = labels[predicted_class.item()]

        # Draw bounding box on the frame
        bbox = [100, 100, 300, 300]  # Replace this with your model's prediction
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv2.putText(frame, predicted_label, (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow('Prediction', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

In [56]:
### Convert XML Images to CSV

# def xml_to_csv(path):
#     xml_list = []
#     for xml_file in glob.glob(path + '/*.xml'):
#         tree = ET.parse(xml_file)
#         root = tree.getroot()
#         for child in root.findall('object'):
#             # print(child)
#             # Following the structure of the XML images at /images
#             value = (root.find('filename').text,
#                      int(root.find('size')[0].text),
#                      int(root.find('size')[0].text),
#                      child[0].text,
#                      int(child[4][0].text),
#                      int(child[4][1].text),
#                      int(child[4][2].text),
#                      int(child[4][3].text)
#                      )
#             #print(f'\nValues: \n{value}')
#             xml_list.append(value)

#     column_name = ['filename', 'width', 'height',
#                    'class', 'xmin', 'ymin', 'xmax', 'ymax']
#     xml_df = pd.DataFrame(xml_list, columns=column_name)
#     return xml_df