# Imports

In [9]:
import numpy as np
import torch
from torch import nn
import pandas as pd
import pickle

import torchvision
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import os

# CUDA cores go brrrrr

In [10]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
# The availability flag is echoed as the cell output.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
cuda_available

True

# Loading data

## Loading the videos and annotations

In [None]:
# Build the raw dataset: for every letter, walk video_0.mp4, video_1.mp4, ...
# and sample `frames_per_video` random frames from each 480x640 video together
# with that frame's keypoint annotations.
letters = ["A", "B", "C", "L", "R", "U"]
data = []             # list of (image tensor, flat [x0, y0, x1, y1, ...] float32 tensor)
removed_videos = {}   # letter -> indices of videos skipped because of their resolution

frames_per_video = 10

# Dedicated, seeded generator so the sampled frames (and therefore the cached
# dataset) are identical on every Restart & Run All.
frame_rng = np.random.default_rng(0)

for letter in letters:

    removed_videos[letter] = []
    letter_dir = "data/ASL_letter_" + letter
    anno_df = pd.read_csv(letter_dir + "/annotations.csv")
    nr = 0
    while True:

        # A RuntimeError from read_video is used as the "no more videos for
        # this letter" signal. Only the read itself is guarded, so an error in
        # the annotation handling below surfaces instead of silently ending
        # the letter early (which would desynchronise the later split).
        try:
            frames, _, _ = torchvision.io.read_video(letter_dir + "/videos/video_" + str(nr) + ".mp4")
        except RuntimeError:
            break

        if frames.shape[1] == 480 and frames.shape[2] == 640:
            # Filter this video's annotation rows once, not once per sampled frame.
            video_annotations = anno_df.loc[anno_df['video_idx'] == nr]

            for _ in range(frames_per_video):
                frame = int(frame_rng.integers(0, frames.shape[0]))
                frame_df = video_annotations.loc[video_annotations['frame'] == frame]

                # Interleave the coordinates as [x0, y0, x1, y1, ...].
                # NOTE(review): a sampled frame without annotation rows yields an
                # empty label, while the networks below emit 42 values - confirm
                # that every frame is annotated.
                annotations = np.column_stack(
                    (frame_df['x'].values, frame_df['y'].values)
                ).astype(np.float64).ravel()

                # Reverse the axis order of the (480, 640, C) frame -> (C, 640, 480).
                image = torch.movedim(frames[frame, :, :, :].float(), (0,1,2), (2,1,0))
                data.append((image, torch.tensor(annotations, dtype=torch.float32)))
        else:
            removed_videos[letter].append(nr)

        nr += 1

print("Success!")

## Splitting the data

In [None]:
# Split per letter at video granularity: the first `split` share of each
# letter's usable videos goes to training, the rest to validation. This relies
# on `data` holding exactly `frames_per_video` consecutive samples per usable
# video, letter after letter, as produced by the loading cell.
split = 0.8
training_data = []
validation_data = []

total_videos = 0
for letter in letters:
    _, _, files = next(os.walk("data/ASL_letter_" + letter + "/videos"))
    # Count only the files the loading cell reads (video_<n>.mp4), so stray
    # directory entries cannot shift the slice boundaries.
    video_files = [name for name in files if name.startswith("video_") and name.endswith(".mp4")]
    file_count = len(video_files) - len(removed_videos[letter])
    train_videos = int(split*file_count)

    letter_start = total_videos*frames_per_video
    train_end = letter_start + frames_per_video*train_videos
    letter_end = letter_start + frames_per_video*file_count

    training_data.extend(data[letter_start:train_end])
    validation_data.extend(data[train_end:letter_end])
    total_videos += file_count

# If loading stopped early for some letter, the offsets above are misaligned.
assert total_videos*frames_per_video == len(data), (
    "video count on disk does not match the number of loaded samples"
)

len(training_data), len(validation_data)

In [None]:
# Cache both splits on disk so later sessions can skip the slow video loading.
# `with` closes each handle even if pickling raises part-way through.
with open('training_data', 'wb') as file:
    pickle.dump(training_data, file)

with open('validation_data', 'wb') as file:
    pickle.dump(validation_data, file)

In [11]:
# Restore the cached splits written by the previous cell.
# SECURITY: pickle.load can execute arbitrary code - only load files that this
# notebook itself produced, never files from an untrusted source.
with open('training_data', 'rb') as file:
    training_data = pickle.load(file)

with open('validation_data', 'rb') as file:
    validation_data = pickle.load(file)

## Creating Dataset

In [12]:
class GesturesDataset(torch.utils.data.Dataset):
    """Map-style dataset over an in-memory sequence of samples.

    Every element of the wrapped sequence is read as ``sample[0]`` and
    ``sample[1]``; in this notebook those are the image tensor and its flat
    keypoint-annotation tensor.
    """

    def __init__(self, d):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.data = d

    def __len__(self):
        """Number of samples in the wrapped sequence."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the first two entries of sample ``idx`` as a pair."""
        sample = self.data[idx]
        first, second = sample[0], sample[1]
        return first, second

## Initializing DataLoader

In [13]:
# Training mini-batch size; the validation loader below uses batches of 100.
batch_size = 200

# Wrap the two sample lists so DataLoader can index them.
training_set, validation_set = (
    GesturesDataset(samples) for samples in (training_data, validation_data)
)

# Both loaders reshuffle their samples on every pass.
train_dataloader = torch.utils.data.DataLoader(
    dataset=training_set,
    batch_size=batch_size,
    shuffle=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    dataset=validation_set,
    batch_size=100,
    shuffle=True,
)

# Creating the model

## The model

In [14]:
class SimpleCNN(nn.Module):
    """Small convolutional regressor producing 42 values per input image.

    Two conv/ReLU/max-pool stages (the second one collapsing to a single
    channel) are followed by a three-layer fully connected head. The first
    linear layer expects 18369 flattened features, which fixes the spatial
    size of the images the network accepts.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        feature_layers = [
            nn.Conv2d(3, 6, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(6, 1, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]

        # Fully connected regression head on the flattened feature map.
        head_layers = [
            nn.Flatten(),
            nn.Linear(18369, 1000),
            nn.ReLU(),
            nn.Linear(1000, 1000),
            nn.ReLU(),
            nn.Linear(1000, 42),
        ]

        # One flat Sequential, so parameter names stay cnn.0, cnn.3, cnn.7, ...
        self.cnn = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Run the batch ``x`` through the whole stack and return the result."""
        prediction = self.cnn(x)
        return prediction
    
# Build the network, move its parameters to the selected device, and echo the
# layer summary as the cell output.
model = SimpleCNN()
model = model.to(device)
model

SimpleCNN(
  (cnn): Sequential(
    (0): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 1, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=18369, out_features=1000, bias=True)
    (8): ReLU()
    (9): Linear(in_features=1000, out_features=1000, bias=True)
    (10): ReLU()
    (11): Linear(in_features=1000, out_features=42, bias=True)
  )
)

In [None]:
class UNet(nn.Module):
    """U-Net style encoder/decoder followed by a dense head with 42 outputs.

    The encoder halves the spatial size four times while widening the channels
    3 -> 6 -> 12 -> 24 -> 48. Each decoder step upsamples by two, applies a 2x2
    "mid" convolution, concatenates the encoder feature map of the same
    resolution and reduces the channels again. The last decoder step emits one
    channel at the input resolution, which is flattened and passed through
    three linear layers.

    Args:
        input_size: the two trailing (spatial) dimensions of the input batch.
            Each must be a multiple of 16 (four 2x poolings) and at least 32
            (the deepest reflect-padded convolution needs a 2x2 map). The
            default reproduces the original 307200-feature dense layer.

    Raises:
        ValueError: if ``input_size`` violates the constraints above.
    """

    def __init__(self, input_size=(640, 480)):
        super().__init__()

        size_a, size_b = input_size
        if any(side % 16 != 0 or side < 32 for side in (size_a, size_b)):
            raise ValueError(
                "input_size entries must be multiples of 16 and at least 32, got "
                + repr(tuple(input_size))
            )

        self.maxpool = nn.MaxPool2d(2,2)
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear')
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

        # Encoder: one 3x3 convolution per resolution level.
        self.group1layer1 = nn.Conv2d(3,3,3, padding=1, padding_mode='reflect')
        self.group2layer1 = nn.Conv2d(3,6,3, padding=1, padding_mode='reflect')
        self.group3layer1 = nn.Conv2d(6,12,3, padding=1, padding_mode='reflect')
        self.group4layer1 = nn.Conv2d(12,24,3, padding=1, padding_mode='reflect')
        self.group5layer1 = nn.Conv2d(24,48,3, padding=1, padding_mode='reflect')

        # Decoder: a 2x2 "mid" convolution after each upsampling, then a 3x3
        # convolution over the concatenation [skip, upsampled].
        self.midlayer1 = nn.Conv2d(48,48,2, padding=1, padding_mode='reflect')
        self.group6layer1 = nn.Conv2d(72,24,3, padding=1, padding_mode='reflect')

        self.midlayer2 = nn.Conv2d(24,24,2, padding=1, padding_mode='reflect')
        self.group7layer1 = nn.Conv2d(36,12,3, padding=1, padding_mode='reflect')

        self.midlayer3 = nn.Conv2d(12,12,2, padding=1, padding_mode='reflect')
        self.group8layer1 = nn.Conv2d(18,6,3, padding=1, padding_mode='reflect')

        self.midlayer4 = nn.Conv2d(6,6,2, padding=1, padding_mode='reflect')
        self.group9layer1 = nn.Conv2d(9,1,3, padding=1, padding_mode='reflect')

        # Dense head on the flattened single-channel, full-resolution map.
        self.ann_layer1 = nn.Linear(size_a * size_b, 1000)
        self.ann_layer2 = nn.Linear(1000, 500)
        self.ann_layer3 = nn.Linear(500, 42)

    def _expand_and_merge(self, deep, skip, midlayer):
        """Upsample ``deep``, apply ``midlayer`` and concatenate it after ``skip``."""
        grown = midlayer(self.upsample(deep))
        # A 2x2 kernel with padding=1 adds one row and one column; drop them so
        # the result lines up with the skip connection.
        return torch.cat((skip, grown[:, :, :-1, :-1]), dim=1)

    def forward(self, x):
        """Return a ``(batch, 42)`` tensor for a ``(batch, 3, *input_size)`` batch."""
        # Encoder path; every level is kept for its skip connection.
        x1 = self.relu(self.group1layer1(x))
        x2 = self.relu(self.group2layer1(self.maxpool(x1)))
        x3 = self.relu(self.group3layer1(self.maxpool(x2)))
        x4 = self.relu(self.group4layer1(self.maxpool(x3)))
        x5 = self.relu(self.group5layer1(self.maxpool(x4)))

        # Decoder path.
        y1 = self.relu(self.group6layer1(self._expand_and_merge(x5, x4, self.midlayer1)))
        y2 = self.relu(self.group7layer1(self._expand_and_merge(y1, x3, self.midlayer2)))
        y3 = self.relu(self.group8layer1(self._expand_and_merge(y2, x2, self.midlayer3)))
        y4 = self.relu(self.group9layer1(self._expand_and_merge(y3, x1, self.midlayer4)))

        hidden = self.relu(self.ann_layer1(self.flatten(y4)))
        hidden = self.relu(self.ann_layer2(hidden))
        return self.ann_layer3(hidden)

# Alternative architecture, bound to its own name. Assigning it to `model`
# would, on Restart & Run All, replace the SimpleCNN that the training cell
# trains and saves as "first_good_simplecnn.pth" and that is later reloaded
# into a fresh SimpleCNN. To experiment with the U-Net, assign
# `model = unet_model` explicitly before the training cell.
unet_model = UNet().to(device)

In [15]:
def L2Loss(Y, Y_pred):
    """Sum of squared element-wise differences between ``Y`` and ``Y_pred``.

    The squared differences are added up over every element (not averaged),
    so the result is a single scalar tensor whose size grows with the number
    of elements compared. The two arguments only need to be broadcastable.
    """
    residual = Y - Y_pred
    return (residual ** 2).sum()


# Loss used by the training, validation and visualisation cells.
loss_fn = L2Loss

# Training

In [None]:
optim = torch.optim.Adam(model.parameters(), lr=0.0001)

n_epochs = 100

# One entry per optimisation step. Collected in a plain list (cheap appends)
# and converted to an array once training has finished.
losses = []

print("Starting training...")

model.train()  # set model in training mode

# Each epoch is one full pass over the training loader.
for epoch in range(n_epochs):
    for i, (images, labels) in enumerate(train_dataloader):

        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = loss_fn(outputs, labels)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # .item() copies the value back to the host; read it once per step.
        batch_loss = loss.item()
        losses.append(batch_loss)
        print(f'Epoch: {epoch+1} \t Batch: {i+1} \t Loss: {batch_loss}')

losses = np.array(losses)

print('Finished Training')

In [None]:
# Training curve: one point per optimisation step, in the order recorded.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(
    title="Training loss",
    xlabel="Optimisation step (batch)",
    ylabel="Summed squared error per batch",
)
plt.show()

In [None]:
# Persist only the learned parameters (the state dict), not the module object.
weights = model.state_dict()
torch.save(weights, "first_good_simplecnn.pth")

In [16]:
# Rebuild the architecture and restore the saved weights. map_location remaps
# the stored tensors onto the active device, so a checkpoint written on a GPU
# also loads on a CPU-only machine.
model = SimpleCNN().to(device)
state_dict = torch.load('first_good_simplecnn.pth', map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

# Validation

In [17]:
# A sample counts as "correct" when its summed squared keypoint error is
# below this threshold.
max_squared_error = 15000

model.eval()  # inference mode for any layers that behave differently in training

correct = []
# No gradients are needed for evaluation, so the forward pass is included in
# the no_grad block as well.
with torch.no_grad():
    for images, labels in validation_dataloader:
        outputs = model(images.to(device))
        labels = labels.to(device)

        for j in range(outputs.shape[0]):
            sample_error = loss_fn(outputs[j,:], labels[j,:])
            correct.append(1 if sample_error < max_squared_error else 0)

correct = np.array(correct, dtype=float)

# The printed value is the fraction of correct samples, in [0, 1].
print("Correct percent: ", np.mean(correct))

Correct percent:  0.5142857142857142


In [18]:
# Converter from an image tensor to a PIL image; the visualisation cell below
# uses it before drawing the keypoints.
transform = torchvision.transforms.ToPILImage()

In [None]:
# Draw one random validation sample with its ground-truth keypoints (green)
# and the model's predicted keypoints (blue).
image, annotations = validation_set[np.random.randint(0,len(validation_set))]

# Inference only: no autograd graph is needed for the prediction or its loss.
with torch.no_grad():
    prediction = model(image[None,:,:,:].to(device))
    loss = loss_fn(prediction, annotations.to(device))
print(loss.item())

# Plain Python floats on the host, so PIL never sees device tensors.
truth = annotations.tolist()
predicted = prediction[0].cpu().tolist()

# The loading cell stores frames as float32 with 0-255 values and reversed
# axes, (C, W, H). ToPILImage reads a tensor as (C, H, W) and rescales float
# input by 255, so swap the spatial axes back and hand it uint8 data.
# NOTE(review): assumes the annotation x/y are the horizontal/vertical pixel
# coordinates of the original 640x480 frame - confirm against annotations.csv.
img = transform(image.permute(0, 2, 1).to(torch.uint8))
drawing = ImageDraw.Draw(img)
for i in range(0,42,2):
    drawing.ellipse([(truth[i] - 2, truth[i+1] - 2), (truth[i] + 2, truth[i+1] + 2)], outline=(0,255,0))
    drawing.ellipse([(predicted[i] - 2, predicted[i+1] - 2), (predicted[i] + 2, predicted[i+1] + 2)], outline=(0,0,255))

img.show()