In [25]:
import cv2
import glob
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torch import nn, optim
import torch.nn.functional as F
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights, vgg11, VGG11_Weights
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url
from torchsummary import summary
from PIL import Image
from google.colab import drive

### Create a Label Map

In [2]:
labels = [
    {'name': 'hello', 'id': 1},
    {'name': 'yes', 'id': 2},
    {'name': 'no', 'id': 3},
    {'name': 'thanks', 'id': 4},
    {'name': 'i love you', 'id': 5},
]

In [3]:
!mkdir "/content/annotations/"

In [4]:
with open('/content/annotations/label_map.pbtxt', 'w') as f:
    for label in labels:
        f.write('item { \n')
        f.write(f'\tname:\'{label["name"]}\'\n')
        f.write(f'\tid:{label["id"]}\n')
        f.write('}\n')

### Connecting to Colab to get Images

In [5]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!unzip "/content/gdrive/MyDrive/IA/signLanguageImages.zip" -d "/content/images"

In [7]:
class SignDataset(Dataset):
    def __init__(self, root_dir, split='train', transform=None,
                 device='cpu'):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.device = device

        self.image_label_pairs = []
        split_dir = os.path.join(root_dir, split)

        for filename in os.listdir(split_dir):
            if filename.endswith('.jpg'):
                img_path = os.path.join(split_dir, filename)
                label_filename = os.path.splitext(filename)[0] + '.xml'
                label_path = os.path.join(split_dir, label_filename)
                if os.path.exists(label_path):
                    self.image_label_pairs.append((img_path, label_path))

    def __len__(self):
        return len(self.image_label_pairs)

    def __getitem__(self, idx):
        img_path, label_path = self.image_label_pairs[idx]

        # Load Image
        img = Image.open(img_path).convert('RGB')

        # Load XML file and extract label information
        tree = ET.parse(label_path)
        root = tree.getroot()

        # Extract label information
        object_elem = root.find('object')
        class_name = object_elem.find('name').text
        bbox_elem = object_elem.find('bndbox')
        xmin = int(bbox_elem.find('xmin').text)
        ymin = int(bbox_elem.find('ymin').text)
        xmax = int(bbox_elem.find('xmax').text)
        ymax = int(bbox_elem.find('ymax').text)

        label = class_name
        bbox = [xmin, ymin, xmax, ymax]

        # Apply transformations
        if self.transform:
            img = self.transform(img)

        return img, label, bbox

### Data Preprocessing

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Custom Dataset

In [10]:
train_dataset = SignDataset(root_dir='/content/images', split='train',
                            transform=transform, device=device)
test_dataset = SignDataset(root_dir='/content/images', split='test',
                           transform=transform, device=device)

In [11]:
test_dataset[0]

(tensor([[[0.2784, 0.2784, 0.2824,  ..., 0.2275, 0.2431, 0.2353],
          [0.2549, 0.2667, 0.2824,  ..., 0.2275, 0.2314, 0.2314],
          [0.2510, 0.2627, 0.2784,  ..., 0.2196, 0.2275, 0.2314],
          ...,
          [0.0980, 0.1059, 0.1098,  ..., 0.0941, 0.0902, 0.0980],
          [0.0863, 0.1098, 0.1176,  ..., 0.1020, 0.0941, 0.0980],
          [0.1020, 0.1216, 0.1216,  ..., 0.0941, 0.0980, 0.0980]],
 
         [[0.3490, 0.3294, 0.3098,  ..., 0.2353, 0.2431, 0.2314],
          [0.3451, 0.3255, 0.3216,  ..., 0.2314, 0.2314, 0.2275],
          [0.3451, 0.3294, 0.3216,  ..., 0.2196, 0.2275, 0.2314],
          ...,
          [0.1059, 0.1137, 0.1059,  ..., 0.0824, 0.0941, 0.1098],
          [0.0980, 0.1098, 0.1059,  ..., 0.0980, 0.0941, 0.1059],
          [0.1020, 0.1137, 0.1059,  ..., 0.0902, 0.0941, 0.0980]],
 
         [[0.4000, 0.3686, 0.3765,  ..., 0.2314, 0.2510, 0.2471],
          [0.3608, 0.3412, 0.3569,  ..., 0.2314, 0.2353, 0.2353],
          [0.3412, 0.3255, 0.3216,  ...,

In [23]:
test_dataset[0][0].shape

torch.Size([3, 224, 224])

In [24]:
len(train_dataset)

65

### Getting the Pretrained Model

In [141]:
from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url

def get_state_dict(self, *args, **kwargs):
    kwargs.pop("check_hash")
    return load_state_dict_from_url(self.url, *args, **kwargs)
WeightsEnum.get_state_dict = get_state_dict

class CustomEfficientNetB0(nn.Module):
    def __init__(self, num_classes, num_coordinates=4):
        super(CustomEfficientNetB0, self).__init__()
        efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
        self.model = efficientnet_b0(weights="DEFAULT")
        in_features = self.model.classifier[-1].in_features

        # Altering last layer's output size
        self.model.classifier[1] = nn.Linear(1280, 1280, bias=True)

        self.classifier_head = nn.Linear(in_features, num_classes)
        self.regressor_head = nn.Linear(in_features, num_coordinates)

    def forward(self, x):
        y = self.model(x)
        # print(f'x: {x.shape}')
        # print(f'y: {y.shape}')

        class_logits = self.classifier_head(y)
        bbox_regression = self.regressor_head(y)
        return class_logits, bbox_regression

In [142]:
model = CustomEfficientNetB0(num_classes=6, num_coordinates=4)

In [143]:
model

CustomEfficientNetB0(
  (model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (

In [128]:
print(model.classifier_head)
print(model.regressor_head)

Linear(in_features=1280, out_features=6, bias=True)
Linear(in_features=1280, out_features=4, bias=True)


### Setting Up Loss Function and Optimizer

In [76]:
loss_fn_classification = nn.CrossEntropyLoss()
loss_fn_regression = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training Loop

In [16]:
EPOCHS = 10

In [17]:
class_index_mapping = {label['name']: label['id'] for label in labels}
class_index_mapping

{'hello': 1, 'yes': 2, 'no': 3, 'thanks': 4, 'i love you': 5}

In [30]:
train_dataset[0][1], train_dataset[0][2]

('hello', [347, 149, 553, 405])

In [19]:
device_torch = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device_torch)
next(model.parameters()).is_cuda

False

In [31]:
for idx in range(len(train_dataset)):
    sample = train_dataset[idx]
print(sample)

(tensor([[[0.2902, 0.2863, 0.2275,  ..., 0.1804, 0.1882, 0.1922],
         [0.2941, 0.2706, 0.2314,  ..., 0.1922, 0.1961, 0.2000],
         [0.2941, 0.2510, 0.2392,  ..., 0.2078, 0.2078, 0.2196],
         ...,
         [0.0941, 0.1059, 0.1098,  ..., 0.0902, 0.0980, 0.1059],
         [0.1020, 0.0980, 0.0941,  ..., 0.0902, 0.1020, 0.1020],
         [0.1059, 0.0941, 0.0980,  ..., 0.0980, 0.0980, 0.0941]],

        [[0.3333, 0.3176, 0.3176,  ..., 0.2431, 0.2392, 0.2235],
         [0.3333, 0.3176, 0.3137,  ..., 0.2392, 0.2392, 0.2196],
         [0.3373, 0.3137, 0.3059,  ..., 0.2314, 0.2275, 0.2275],
         ...,
         [0.0863, 0.1059, 0.1059,  ..., 0.1098, 0.1137, 0.1137],
         [0.1020, 0.1098, 0.1059,  ..., 0.0941, 0.1020, 0.0980],
         [0.1176, 0.1137, 0.1176,  ..., 0.0941, 0.0980, 0.0863]],

        [[0.4157, 0.3725, 0.3569,  ..., 0.2314, 0.2314, 0.2431],
         [0.3922, 0.3686, 0.3490,  ..., 0.2431, 0.2431, 0.2510],
         [0.3765, 0.3608, 0.3529,  ..., 0.2588, 0.2510, 0

In [144]:
for epoch in range(EPOCHS):
    running_classification_loss = 0.0
    running_regression_loss = 0.0

    for idx in range(len(train_dataset)):
        sample = train_dataset[idx]
        # Getting the image (0), the label (1) and the bounding box (2)
        x, y, bbox = sample[0], sample[1], sample[2]
        x = x.to(device)
        x = x.unsqueeze(0)

        # Forward pass
        class_logits, bbox_regression = model(x)
        class_logits = class_logits.to(device)
        bbox_regression = bbox_regression.to(device)

        # Classification loss
        label = torch.tensor(class_index_mapping[y], device=device)
        classification_loss = loss_fn_classification(class_logits, label.unsqueeze(0))

        # Regression loss
        bbox_gt = torch.tensor(bbox, dtype=torch.float32, device=device)
        regression_loss = loss_fn_regression(bbox_regression, bbox_gt.unsqueeze(0))

        # Total loss
        total_loss = classification_loss + regression_loss

        # Zero the gradients
        optimizer.zero_grad()

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        running_classification_loss += classification_loss.item()
        running_regression_loss += regression_loss.item()

    print(f'Epoch [{epoch+1}/{EPOCHS}], '
          f'Classification Loss: {running_classification_loss / len(train_dataset)}, '
          f'Regression Loss: {running_regression_loss / len(train_dataset)}')

Epoch [1/10], Classification Loss: 1.8047388737018293, Regression Loss: 292.68226764385514
Epoch [2/10], Classification Loss: 1.8037292938966019, Regression Loss: 292.68321533203124
Epoch [3/10], Classification Loss: 1.7931077021818895, Regression Loss: 292.68114084097056
Epoch [4/10], Classification Loss: 1.7982135240848247, Regression Loss: 292.6808086688702
Epoch [5/10], Classification Loss: 1.7940700604365423, Regression Loss: 292.6849308894231
Epoch [6/10], Classification Loss: 1.7977200728196365, Regression Loss: 292.6791687011719
Epoch [7/10], Classification Loss: 1.7976170283097488, Regression Loss: 292.67867595966044
Epoch [8/10], Classification Loss: 1.808316837824308, Regression Loss: 292.6788862961989
Epoch [9/10], Classification Loss: 1.8022050619125367, Regression Loss: 292.68149907038764
Epoch [10/10], Classification Loss: 1.796836024064284, Regression Loss: 292.6804692195012


### Evaluating the Model

In [148]:
model = model.to(device)

model.eval()
with torch.no_grad():
    for epoch in range(EPOCHS):
        running_classification_loss = 0.0
        running_regression_loss = 0.0

        for idx in range(len(test_dataset)):
            sample = test_dataset[idx]
            # Getting the image (0), the label (1) and the bounding box (2)
            x, y, bbox = sample[0], sample[1], sample[2]
            x = x.to(device)
            x = x.unsqueeze(0)

            # Forward pass
            class_logits, bbox_regression = model(x)
            class_logits = class_logits.to(device)
            bbox_regression = bbox_regression.to(device)

            # Classification loss
            label = torch.tensor(class_index_mapping[y], device=device)
            classification_loss = loss_fn_classification(class_logits, label.unsqueeze(0))

            # Regression loss
            bbox_gt = torch.tensor(bbox, dtype=torch.float32, device=device)
            regression_loss = loss_fn_regression(bbox_regression, bbox_gt.unsqueeze(0))

            # Total loss
            total_loss = classification_loss + regression_loss

            running_classification_loss += classification_loss.item()
            running_regression_loss += regression_loss.item()

        print(f'Epoch [{epoch+1}/{EPOCHS}], '
            f'Classification Loss: {running_classification_loss / len(test_dataset)}, '
            f'Regression Loss: {running_regression_loss / len(test_dataset)}')

Epoch [1/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [2/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [3/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [4/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [5/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [6/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [7/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [8/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [9/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094
Epoch [10/10], Classification Loss: 1.7800987601280212, Regression Loss: 293.6971130371094


### Saving the Weights of the Model

In [160]:
state = model.state_dict()
# state['classifier.1.weight'] = state['classifier.weight']
# del state['classifier.weight']
# state['classifier.1.bias'] = state['classifier.bias']
# del state['classifier.bias']

In [151]:
for key, value in state.items():
    print(key)

model.features.0.0.weight
model.features.0.1.weight
model.features.0.1.bias
model.features.0.1.running_mean
model.features.0.1.running_var
model.features.0.1.num_batches_tracked
model.features.1.0.block.0.0.weight
model.features.1.0.block.0.1.weight
model.features.1.0.block.0.1.bias
model.features.1.0.block.0.1.running_mean
model.features.1.0.block.0.1.running_var
model.features.1.0.block.0.1.num_batches_tracked
model.features.1.0.block.1.fc1.weight
model.features.1.0.block.1.fc1.bias
model.features.1.0.block.1.fc2.weight
model.features.1.0.block.1.fc2.bias
model.features.1.0.block.2.0.weight
model.features.1.0.block.2.1.weight
model.features.1.0.block.2.1.bias
model.features.1.0.block.2.1.running_mean
model.features.1.0.block.2.1.running_var
model.features.1.0.block.2.1.num_batches_tracked
model.features.2.0.block.0.0.weight
model.features.2.0.block.0.1.weight
model.features.2.0.block.0.1.bias
model.features.2.0.block.0.1.running_mean
model.features.2.0.block.0.1.running_var
model.fea

In [152]:
from pathlib import Path

In [153]:
MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)
MODEL_NAME = 'model.pth'
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

print(f'Saving model to: {MODEL_SAVE_PATH}')
torch.save(obj=state, f=MODEL_SAVE_PATH)

Saving model to: models/model.pth


### Loading the Model

In [161]:
NUM_CLASSES = 6

In [164]:
loaded_model = CustomEfficientNetB0(num_classes=NUM_CLASSES)

In [165]:
loaded_model.classifier_head, loaded_model.regressor_head

(Linear(in_features=1280, out_features=6, bias=True),
 Linear(in_features=1280, out_features=4, bias=True))

In [163]:
loaded_model.load_state_dict(torch.load(MODEL_SAVE_PATH), strict=False)
loaded_model

CustomEfficientNetB0(
  (model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (

def predict_on_webcam(model, labels, transform):
    cap = cv2.VideoCapture(0)  # Open the webcam

    while True:
        ret, frame = cap.read()

        # You may need to preprocess the frame, depending on the model's input requirements
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame).unsqueeze(0)

        with torch.no_grad():
            model.eval()
            output = model(frame_tensor)

        _, predicted_class = torch.max(output, 1)
        predicted_label = labels[predicted_class.item()]

        # Draw bounding box on the frame
        bbox = [100, 100, 300, 300]  # Replace this with your model's prediction
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv2.putText(frame, predicted_label, (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow('Prediction', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

In [None]:
### Convert XML Images to CSV

# def xml_to_csv(path):
#     xml_list = []
#     for xml_file in glob.glob(path + '/*.xml'):
#         tree = ET.parse(xml_file)
#         root = tree.getroot()
#         for child in root.findall('object'):
#             # print(child)
#             # Following the structure of the XML images at /images
#             value = (root.find('filename').text,
#                      int(root.find('size')[0].text),
#                      int(root.find('size')[0].text),
#                      child[0].text,
#                      int(child[4][0].text),
#                      int(child[4][1].text),
#                      int(child[4][2].text),
#                      int(child[4][3].text)
#                      )
#             #print(f'\nValues: \n{value}')
#             xml_list.append(value)

#     column_name = ['filename', 'width', 'height',
#                    'class', 'xmin', 'ymin', 'xmax', 'ymax']
#     xml_df = pd.DataFrame(xml_list, columns=column_name)
#     return xml_df