## Packages

In [1]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from tqdm import tqdm
import torchvision.models as models
from torchvision.transforms import transforms as T
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import time

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Dataset Loading

In [3]:
path = '../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/annotations/manual/PHOENIX-2014-T.train-complex-annotation.corpus.csv'
dataframe = pd.read_csv(path, sep='|')

# Removing start and end columns
dataframe.drop(columns=['start', 'end'], inplace=True)

# Working on signer 1 alone
signer1_dataframe = dataframe[dataframe['speaker']=='Signer01']

signer1_dataframe.shape

(1862, 5)

### Analysing Dataset

In [4]:
if False:
    m = 0
    max_folder = ''
    list_folders = []
    list_len = []

    for folder in os.listdir('../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/'):
        l = len(os.listdir('../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/' + folder))
        list_folders.append(folder)
        list_len.append(l)

    temp_df = pd.DataFrame(list(zip(list_folders, list_len)), columns=['Folder', 'No. of images'])
    
    plt.hist(temp_df['No. of images'], bins=100)
    plt.show()
    

## Tokenizing Dataset

In [5]:
from torchnlp.encoders.text import StaticTokenizerEncoder, SpacyEncoder, pad_tensor
loaded_data = np.array(signer1_dataframe['orth'])
encoder = StaticTokenizerEncoder(loaded_data, tokenize=lambda s: s.split(), append_eos=True)

encoded_data = [encoder.encode(example) for example in loaded_data]
encoded_data = [pad_tensor(x, length=35) for x in encoded_data]

example_encode = encoder.encode(loaded_data[1])
example_pad = pad_tensor(example_encode, length=35)

print('actual: ', loaded_data[1])
print('encoded: ',example_pad)
print('decoded: ', encoder.decode(example_pad))

actual:  __ON__ ITALIEN IX TIEF DRUCK cl-KOMMEN HEUTE NACHT BERG SCHNEE REGEN REGEN __OFF__
encoded:  tensor([ 5, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 24, 14,  2,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
decoded:  __ON__ ITALIEN IX TIEF DRUCK cl-KOMMEN HEUTE NACHT BERG SCHNEE REGEN REGEN __OFF__ </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


## Image Transformations

In [6]:
normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor(),normalize])

### Filtering Sequences less than 50 images and more than 250 images

In [7]:
x = []
signer1_path = '../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/'
signer1_main = signer1_dataframe.copy(deep=True)
for folder in tqdm(signer1_main['name']):
    sequence_length = len(os.listdir(signer1_path + folder))
    
    if sequence_length > 250 or sequence_length < 50:
        signer1_dataframe = signer1_dataframe[signer1_dataframe['name']!=folder]

signer1_train, signer1_test = train_test_split(signer1_dataframe, test_size=0.3, random_state=42)
signer1_test, signer1_val = train_test_split(signer1_test, test_size=0.5, random_state=42)

signer1_train.shape, signer1_test.shape, signer1_val.shape

100%|██████████| 1862/1862 [00:00<00:00, 6047.08it/s]


((1148, 5), (246, 5), (247, 5))

In [8]:
class SLRT_Signer(Dataset):
    """SLRT dataset."""

    def __init__(self, data_frame, root_dir, transform, tokenizer):
        self.images_frame = data_frame['name']
        self.glosses = data_frame['orth']
        self.root_dir = root_dir
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.images_frame)

    def __getitem__(self, idx):
        global device
        if torch.is_tensor(idx):
            idx = idx.tolist()
        training_example = torch.zeros(250, 3, 224, 224)
        for files in os.listdir(os.path.join(self.root_dir, self.images_frame.iloc[idx])):
            img_name = self.root_dir + self.images_frame.iloc[idx] + '/' + files
            image = Image.open(img_name)
            image = self.transform(image)
            training_example[0:len(files), :] = image

        gloss = self.glosses.iloc[idx]
        encoded_gloss = self.tokenizer.encode(gloss)
        encoded_gloss = pad_tensor(encoded_gloss, 250)
        
        return training_example, encoded_gloss


In [9]:
signer1_train_dataset = SLRT_Signer(signer1_train,
                   root_dir='../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/',
                   transform=transform,
                   tokenizer=encoder
                   )
                  
signer1_test_dataset = SLRT_Signer(signer1_test,
                   root_dir='../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/',
                   transform=transform,
                   tokenizer=encoder
                   )

signer1_val_dataset = SLRT_Signer(signer1_val,
                   root_dir='../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/',
                   transform=transform,
                   tokenizer=encoder
                  )

In [10]:
params = {
    'batch_size': 8,
    'shuffle': True,
    'num_workers': 0
}
max_epochs = 100

In [11]:
train_gen = DataLoader(signer1_train_dataset, **params)
test_gen = DataLoader(signer1_test_dataset, **params)
val_gen = DataLoader(signer1_val_dataset, **params)

## Squeeznet Model

In [57]:
class SqueezeNet(nn.Module):
    
    def __init__(self, pre_train=False):
        super(SqueezeNet, self).__init__()
        
        self.model = torch.hub.load('pytorch/vision:v0.6.0', 'squeezenet1_1', pretrained=True)
        
        if pre_train == True:
            for param in self.model.parameters():
                param.requires_grad = False
            self.model.eval()
        children = [child for child in self.model.children()]
        for child in children:
            print('Name: 123',child)
        
        for child in children[0]:
            sub_children = [c for c in child.children()]
                print('Name ABCD', sub_child)
                for param in sub_child.parameters():
                    param.requires_grad = False
        
                
        self.fc1 = nn.Linear(1000, 512)
        
    def forward(self, src):
        
        output = self.model(src)
        output = self.fc1(output)
        output = F.relu(output)
        output = output.to(torch.device('cpu'))
        return output
        


In [58]:
### Testing Squeezenet

path=r'../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/01April_2010_Thursday_heute-6694/images0010.png'

squeeze_model = SqueezeNet().to(device)
summary(squeeze_model, (3, 224, 224))


Name: 123 Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2))
  (1): ReLU(inplace=True)
  (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (3): Fire(
    (squeeze): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
    (squeeze_activation): ReLU(inplace=True)
    (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
    (expand1x1_activation): ReLU(inplace=True)
    (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (expand3x3_activation): ReLU(inplace=True)
  )
  (4): Fire(
    (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
    (squeeze_activation): ReLU(inplace=True)
    (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
    (expand1x1_activation): ReLU(inplace=True)
    (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (expand3x3_activation): ReLU(inplace=True)
  )
  (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mo

Using cache found in /hdd/transformers/.cache/torch/hub/pytorch_vision_v0.6.0


In [54]:
# image = Image.open(path)
# image = transform(image)
# image = image.unsqueeze(0).to(device)


# output = squeeze_model(image)
# output.shape

## Transformer Model

In [15]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.tgt_mask = None
        self.nopeakmask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout).to(device)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        #self.encoder = nn.Embedding(ntoken, ninp)
        self.decoder_embedding = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)
        #self.softmax = nn.Softmax(1)
        
        #self.init_weights()

    def _generate_square_subsequent_mask(self, src, trt, sz):
        mask = (torch.triu(torch.ones(250, 250)) == 1).transpose(0, 1).half().to(device)
        nopeakmask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).half().to(device)
        zeros = torch.zeros(self.ninp).half().to(device)
        src_msk = (src == zeros).half().to(device)
        target_msk = (trt == 0).unsqueeze(0).half().to(device)
        return src_msk, target_msk, nopeakmask

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, trt):
#         trt = trt.to(torch.device('cuda'))
        trt = trt.to(src.device)
        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
            device = src.device
            src_mask, tgt_mask, self.nopeakmask = self._generate_square_subsequent_mask(src, trt, src.size(1))
            self.src_mask = src_mask
            #self.tgt_mask = tgt_mask
        #src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
#         print ("source",src.shape)
        src = src.permute(1,0,2)
#         print ("source",src.shape)
        output = self.transformer_encoder(src)
#         trt = trt.to(torch.cuda.device('cpu'))
#         trt = torch.cuda.LongTensor(trt)
        trt = trt.type(torch.cuda.LongTensor)
        trgt = self.decoder_embedding(trt)
        trgt = trgt.permute(1,0,2)
#         print (trgt.shape), print (output.shape)
        output = self.transformer_decoder(trgt, output, tgt_mask = self.nopeakmask) #tgt_key_padding_mask = tgt_mask)
        output = self.decoder(output)
#         print (output.shape)  shape: 250, batchsize, 719
        output = output.permute(1,2,0)
        #output = output.reshape(-1, output.shape[2])
#         output = self.softmax(output)
        return output


In [16]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [17]:
ntokens = len(encoder.vocab) # the size of vocabulary
emsize = 512 # embedding dimension
nhid = 1024 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # the number of heads in the multiheadattention models
dropout = 0.3 # the dropout value
src_pad_index = 0

In [18]:
class Sign2Gloss_Model(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super (Sign2Gloss_Model, self).__init__()
        self.TransformerModel = TransformerModel(ntokens, ninp, nhead, nhid, nlayers, \
                                                 dropout).half()
        self.SqueezeNet = SqueezeNet(pre_train=False).half().to(device)
        self.softmax = nn.Softmax(2)
        self.ninp = ninp
    
    def forward(self, src, trt):
        batch_vector = torch.Tensor(src.shape[0], src.shape[1], self.ninp)
        batch_batch_size = 4
        for i in range (src.shape[0]):
            for batch in range (0, 250, batch_batch_size):
                inp = src[i, batch:batch+batch_batch_size].half().to(device)
                transformer_source = self.SqueezeNet(inp)
                batch_vector[i, batch:batch+batch_batch_size]=transformer_source
                del transformer_source, inp
                gc.collect
#             batch_vector[i] = transformer_source 
        batch_vector = batch_vector.half().to(device)
        self.TransformerModel = self.TransformerModel.to(device)
        output = self.TransformerModel(batch_vector, trt)
        output = self.softmax(output)
        del batch_vector
        
        return output

In [19]:
# eval_model  = TransformerModel(ntoken=ntokens, ninp=emsize, nhid=nhid, nhead=nhead, nlayers=nlayers, dropout=dropout).to(device)

# summary(eval_model, [(-1,250, 512), (8, 250,)])

### Training Loop

In [20]:
#model training
model = Sign2Gloss_Model(ntoken = ntokens, ninp = emsize, nhid = nhid, nlayers = nlayers, nhead = nhead, dropout = dropout)
criterion = nn.CrossEntropyLoss(ignore_index = src_pad_index)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
 # enumerate epochs
for epoch in tqdm(range(10)):
# enumerate mini batches
    epoch_loss = 0
#     print ("Epoch: ", epoch, "in progress...")
    for i, (inputs, targets) in enumerate(train_gen):   
        inputs, targets = inputs, targets
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output
        yhat = model(inputs, targets)
        
        yhat = yhat.to(torch.device('cpu'))
        yhat = yhat.type(torch.Tensor)
        targets = targets.type(torch.LongTensor)
        # calculate loss
        loss = criterion(yhat, targets)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()
        epoch_loss += loss.item()
        torch.cuda.empty_cache()
        del inputs, targets
        gc.collect()
    
    torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': 100
    }, 'squeezenetmode2.pt')
    
    try:
        with open('SqueezeNet.txt', 'at') as file:
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            file.write("Epoch {}, Logs:{}, Time: {}".format(epoch, epoch_loss, current_time))
    except:
        pass

    print ("Epoch {} : {}".format(epoch, epoch_loss))

Using cache found in /hdd/transformers/.cache/torch/hub/pytorch_vision_v0.6.0
  0%|          | 0/10 [00:06<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.79 GiB total capacity; 6.62 GiB already allocated; 13.12 MiB free; 34.70 MiB cached)

In [None]:
import torch
import gc
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass


In [None]:
torch.save({
            'epoch': 10,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': 100
            }, 'squeezenetmodel-2_backup.pt')

## Validation

In [None]:
# eval_model = Sign2Gloss_Model(ntoken = ntokens, ninp = emsize, nhid = nhid, nlayers = nlayers, nhead = nhead, dropout = dropout)
# criterion = nn.CrossEntropyLoss(ignore_index = src_pad_index)
# eval_optimizer = torch.optim.SGD(eval_model.parameters(), lr=0.01, momentum=0.9)

In [None]:
# checkpoint = torch.load('squeezenetmodel-1.pt')
# eval_model.load_state_dict(checkpoint['model_state_dict'])
# eval_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# eval_model.eval()
# pass

In [None]:
# epoch_loss = 0.0
# with torch.no_grad():
#     for i, (inputs, targets) in enumerate(test_gen):   
#         try:
#             inputs, targets = inputs, targets.to(device)
#             # compute the model output
#             yhat = eval_model(inputs, targets)
#             # calculate loss
#             loss = criterion(yhat, targets)
#             # credit assignment
#             # update model weights
#             epoch_loss += loss.item()
#             torch.cuda.empty_cache()

#             print('Actual: ', encoder.decode(targets[0]))
#             print(yhat.shape)
#             yhat = yhat.permute(0, 2, 1)
#             print(yhat.shape)
#             yhat_pred = torch.argmax(yhat, dim=2)
#             print('Predicted: ', encoder.decode(yhat_pred[0]))
#             print()
#         except Exception as e:
#             print(e)

#         torch.cuda.empty_cache()
#         del inputs, targets, yhat, yhat_pred
#         gc.collect()

