In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Packages

In [0]:
import random
import sys
import torch.nn as nn 
import numpy as np
import cv2
from glob import glob
import os
from sklearn import preprocessing
import scipy.io
from torch.utils.data import Dataset
from time import time
from torch import nn, optim
from torchvision.models import vgg16
from torch.nn import functional as F
import torch
from torchvision import  datasets, transforms, models, get_image_backend
from torchvision.transforms import Compose
from torch.utils.data import SubsetRandomSampler, DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from PIL import Image
from matplotlib import pyplot as plt
import zipfile

## Optical Flow

In [0]:
!git clone https://github.com/sniklaus/pytorch-liteflownet
!bash /content/pytorch-liteflownet/download.bash
!pip install flowiz -U
!pip install numpy tqdm matplotlib eel
!apt install ffmpeg

In [0]:
!git clone https://github.com/georgegach/flowiz.git
import flowiz as fz

In [0]:
def get_op_flow(frame1,frame2):
    
    imgc =np.array( cv2.imread('/content/sec.png', cv2.IMREAD_UNCHANGED))
    imgc = np.array(cv2.imread('/content/sec.png', cv2.IMREAD_UNCHANGED))
    width = 1024
    height = 436 # keep original height
    dim = (width, height)
    resized1 = cv2.resize(frame1, dim, interpolation = cv2.INTER_AREA)
    resized2 = cv2.resize(frame2, dim, interpolation = cv2.INTER_AREA)
    count = 1
    cv2.imwrite("frame%d.png" % count, resized1)
    count = 2
    cv2.imwrite("frame%d.png" % count, resized2)

    !python /content/pytorch-liteflownet/run.py --model sintel --first /content/frame1.png --second /content/frame2.png --out ./outfull.flo
    files = glob('/content/outfull.flo')
    optical_frame_org = fz.convert_from_file(files[0])
    
    return optical_frame_org

## Cropping

In [0]:
sys.path.append ("/content/drive/My Drive/crop")

In [0]:
from crop import crop_dims

def cropping(image,box):
    
    croped_img = image[ box[0][1]:box[0][3],box[0][0]:box[0][2]]
    return croped_img


# Data

## Download & preparing data dirs

In [0]:
data_path = "/content/drive/My Drive/data"

In [0]:
if not os.path.exists(data_path):
    os.mkdir(data_path)
    !wget ftp://ftp.merl.com/pub/tmarks/MERL_Shopping_Dataset/Videos_MERL_Shopping_Dataset.zip
    with zipfile.ZipFile("/content/Videos_MERL_Shopping_Dataset.zip", 'r') as zip_ref:
        zip_ref.extractall(data_path)
    !wget ftp://ftp.merl.com/pub/tmarks/MERL_Shopping_Dataset/Labels_MERL_Shopping_Dataset.zip
    with zipfile.ZipFile("/content/Labels_MERL_Shopping_Dataset.zip", 'r') as zip_ref:
        zip_ref.extractall(data_path)
  

In [0]:
videos_path = os.path.join(data_path,'/content/drive/My Drive/data/Videos_MERL_Shopping_Dataset')
test_path = os.path.join(data_path,'test')
valid_path = os.path.join(data_path,'valid')
labels_path = os.path.join(data_path,'Labels_MERL_Shopping_Dataset')


## Transformation 

In [27]:
train_transforms = transforms.Compose([ transforms.ToPILImage(),
                                        transforms.Scale((224,224)),
                                        transforms.RandomRotation(10),
                                        transforms.ToTensor()])

test_transforms = transforms.Compose([transforms.ToPILImage(),
                                      transforms.Scale((224,224)),
                                      transforms.ToTensor()])

  "please use transforms.Resize instead.")


## Generate Data files

In [0]:
def video_frames(path):
    
    vidcap = cv2.VideoCapture(path)
    fv = []
    success,image = vidcap.read()
    fv.append(image)
    i = 0
    while success:
        i+=1
        success,image = vidcap.read()
        fv.append(image)
        print('\r Frame no. {}'.format(i),end='')
    fv = np.asarray(fv)
    return fv


In [0]:
actions = ["Reach To Shelf","Retract From Shelf","Hand In Shelf","Inspect Product","Inspect Shelf"] 

# Data Loader

In [0]:
def data_len(mat_file):
    num_seq = 0
    for f in mat_file:
        for i,j in f[0]:
            num_seq += int((j-i+1)/6.5)

    return num_seq
  
def video_frames(path):
    vidcap = cv2.VideoCapture(path)
    fv = []
    success,image = vidcap.read()
    fv.append(image)
    i = 0
    while success:
        i+=1
        success,image = vidcap.read()
        fv.append(image)
        print('\r Frame no. {}'.format(i),end='')
    fv = np.asarray(fv)
    return fv

In [0]:
class Data_loader(Dataset):
    
    def __init__(self,labels_path,vidoes_path,transform):
        self.transform = transform
        self.labels_path = labels_path
        self.videos_path = vidoes_path
        self.mat_files =[]
        for file in os.listdir(self.labels_path):
            if file[-4:] =='.mat':
                self.mat_files.append(file)
        self.file_indx = 0
        self.video_frames = None
        self.mat_file = None
        self.class_num = 0
        self.class_subset = 0
        self.start = -1
        self.end = -1
        
    def __len__(self):  
        total = 0
        for mat_file in self.mat_files :
            file = scipy.io.loadmat(os.path.join(self.labels_path, mat_file))['tlabs']
            total += data_len(file)
        return total
    def __getitem__(self,indx):
        
        if self.video_frames == None:
            self.video_frames = video_frames(os.path.join(self.videos_path,self.mat_files[self.file_indx][:-9]+'crop.mp4'))
          
       
        if self.mat_file == None:
            self.mat_file = scipy.io.loadmat(os.path.join(self.labels_path, self.mat_files[self.file_indx]))['tlabs']
        
        if self.start == -1 or self.end == -1:
#             print(str(os.path.join(self.labels_path, self.mat_files[self.file_indx]))+'\t'+str(self.class_num)+'\t'+str(self.class_subset))
            self.start, self.end = self.mat_file[self.class_num][0][self.class_subset]
            
        if (int(self.start)-int(self.end)) >= 0:
            self.start, self.end = -1, -1
            self.class_subset +=1
           
            if self.class_subset >= len(self.mat_file[self.class_num][0]) -1:
                self.class_subset = 0
                self.class_num += 1
                if self.class_num > len(self.mat_file)-1:
                    self.class_num = 0
                    del self.video_frames
                    self.video_frames = None
                    self.mat_file = None
                    self.file_indx +=1
        
        if self.video_frames == None:
#             print(os.path.join(self.videos_path,self.mat_files[self.file_indx][:-9]+'crop.mp4'))
            self.video_frames = video_frames(os.path.join(self.videos_path,self.mat_files[self.file_indx][:-9]+'crop.mp4'))
         
        
        if self.mat_file == None:
            self.mat_file = scipy.io.loadmat(os.path.join(self.labels_path, self.mat_files[self.file_indx]))['tlabs']
        if self.start == -1 or self.end == -1:
            print(str(os.path.join(self.labels_path, self.mat_files[self.file_indx]))+'\t'+str(self.class_num)+'\t'+str(self.class_subset))
            self.start, self.end = self.mat_file[self.class_num][0][self.class_subset]
  
        seq = []
        for i in range(6):
            ###Code###  
            frame = self.video_frames[self.start]
            cropping_dim = crop_dims(frame)
            cropped_img = cropping(frame, cropping_dim)
            
            frame_aux = self.video_frames[max(self.start-5,0)]
            
            optical_frame_org =  get_op_flow(frame,frame_aux) 
            cropped_opt_img = cropping(optical_frame_org, cropping_dim)

            
            frame = self.transform(frame)
            cropped_img = self.transform(cropped_img)
            optical_frame_org = self.transform(optical_frame_org)
            cropped_opt_img = self.transform(cropped_opt_img)
            
            imgs = torch.stack([optical_frame_org,cropped_opt_img,cropped_img,frame])
            seq.append(imgs)

            self.start +=1
        seq = torch.stack(seq)
        return seq, np.ones(6)*self.class_num

In [0]:
train_data = Data_loader(labels_path,
                         videos_path,train_transforms)
valid_data = Data_loader(valid_path,
                         videos_path,train_transforms)
test_data = Data_loader(test_path,
                         videos_path,test_transforms)

# Displayers 

In [0]:
def imshow(img):
    
    plt.imshow(np.transpose(img, (1, 2, 0)))  # convert from Tensor image

In [0]:
def display_graph(train_losses, valid_losses, train_accs, valid_accs):
    plt.plot(train_losses, label='Training loss')
    plt.plot(valid_losses, label='Validation loss')
    plt.plot(train_accs, label='Train accuracy')
    plt.plot(valid_accs, label='Validation accuracy')
    plt.legend(frameon=False)

# Model

In [0]:
class Encoder(nn.Module):
    
    def __init__(self, latent_dim,num_projection_img):
        super(Encoder, self).__init__()
        
        vgg = vgg16(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(vgg.children())[:-1])
        for param in self.feature_extractor.parameters():
            param.requires_grad = False

        self.final = nn.Sequential(
            nn.Linear(vgg.classifier[0].in_features, latent_dim), nn.BatchNorm1d(latent_dim, momentum=0.01)
        )
        self.final = nn.Sequential(
            nn.Linear(vgg.classifier[0].in_features*num_projection_img, latent_dim, nn.BatchNorm1d(latent_dim, momentum=0.01)
        ))
    def forward(self,x):
        batch, _, c, h, w = x.shape 
        x = x.view(-1, c,h,w)
        x=self.feature_extractor(x)
        x = x.view(batch,-1)        
        x=self.final(x)
        return x
        

In [0]:
class Attention(nn.Module):
    def __init__(self, latent_dim, hidden_dim, attention_dim):
        super(Attention, self).__init__()
        self.latent_attention = nn.Linear(latent_dim, attention_dim)
        self.hidden_attention = nn.Linear(hidden_dim, attention_dim)
        self.joint_attention = nn.Linear(attention_dim, 1)

    def forward(self, latent_repr, hidden_repr):
        if hidden_repr is None:
            hidden_repr = [
                Variable(
                    torch.zeros(latent_repr.size(0), 1, self.hidden_attention.in_features), requires_grad=False
                ).float()
            ]
        h_t = hidden_repr[0]
        latent_att = self.latent_attention(latent_att)
        hidden_att = self.hidden_attention(h_t)
        joint_att = self.joint_attention(F.relu(latent_att + hidden_att)).squeeze(-1)
        attention_w = F.softmax(joint_att, dim=-1)
        return attention_w


In [0]:
class ConvLSTM(nn.Module):
    def __init__(self, num_classes, num_projection_img=4, latent_dim=512, lstm_layers=1, hidden_dim=1024, bidirectional=False, attention=False):
        super(ConvLSTM, self).__init__()
        self.latent_dim=latent_dim
        self.encoder = Encoder(latent_dim,num_projection_img)
        self.lstm = nn.LSTM(latent_dim, hidden_dim,lstm_layers,bidirectional)

        self.output_layers = nn.Sequential(
            nn.Linear(2 * hidden_dim if bidirectional else hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim, momentum=0.01),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )
        self.attention = attention
        self.attention_layer = nn.Linear(2 * hidden_dim if bidirectional else hidden_dim, 1)

    def forward(self, x):
      
        batch_size, seq_length, num_projection_img, c, h, w = x.shape
        x = x.view(batch_size * seq_length , num_projection_img, c, h, w)
        x = self.encoder(x)
        
        x = x.view(batch_size, seq_length, -1)
        x,_ = self.lstm(x)
        
        if self.attention:
            self.attention_layer(x).squeeze(-1)
            attention_w = F.softmax(self.attention_layer(x).squeeze(-1), dim=-1)
            x = torch.sum(attention_w.unsqueeze(-1) * x, dim=1)
        x = x.view(batch_size*seq_length,-1)
        x = self.output_layers(x)
        
        return x



In [0]:
model = ConvLSTM(num_classes=5, latent_dim=512)
model.cuda();

In [0]:
if not os.path.exists('/content/drive/My Drive/Models'):
    os.mkdir('/content/drive/My Drive/Models')
model_file = "/content/drive/My Drive/Models/action_recognation.pth"
back_up_model_file = "/content/drive/My Drive/Models/action_recognation_backup.pth"

# Workers

In [0]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

def train(model, train_loader, valid_loader, n_epochs, model_file, back_up_model_file):
    
    train_losses = []
    train_accs = []

    valid_losses = []
    valid_accs = []
    min_valid_loss = np.Inf
    
    for e in range(1,n_epochs+1):
  
        epoch_start = time()
        batch_number = 0

        train_loss = 0
        train_acc = 0
        batch_start = time()
        model.train()   
        for x,y in train_loader:
            batch_number += 1
           
            if torch.cuda.is_available() :
                x, y = x.cuda(), y.cuda()
            y =  y.long()
            batchsize, seq,img_no,c,h,w = x.shape
            optimizer.zero_grad()
            y_ = model(x) 

            y = y.view(-1,1)
            y = y.squeeze()
            y_ = y_.view(batchsize*seq,-1)
            loss = criterion(y_,y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            ps = torch.exp(y_)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == y.view(*top_class.shape)
            train_acc += torch.mean(equals.type(torch.FloatTensor))
            delay = time()-batch_start
            
            print("\rbatch size : {}  batch per total no of batches : {}/{} \
            \ntrain batch finished : {:.3f} % \ntime left : {}s \ndelay : {}s \
            \nloss : {}\n\n".format(len(x), batch_number, len(train_loader),
            batch_number/len(train_loader) *100., delay * (len(train_loader)-\
            batch_number), delay, loss.item()))
            
            batch_start = time()
            torch.save(model.state_dict(), back_up_model_file)

            
            
        valid_loss = 0
        valid_acc = 0
        model.eval()

        with torch.no_grad():
            batch_number = 0
            batch_start = time()
            acc = 0
            for x,y in valid_loader:
                batch_number += 1
                if torch.cuda.is_available() :
                    x, y = x.cuda(), y.cuda()
                y =  y.long()
                batchsize, seq,img_no,c,h,w = x.shape

                y_ = model(x)       
                y = y.view(-1,1)
                y = y.squeeze()
                y_ = y_.view(batchsize*seq,-1)
                loss = criterion(y_,y)
                valid_loss += loss.item()
                
               
                ps = torch.exp(y_)
                top_p, top_class = ps.topk(1, dim=1)
                equals = top_class == y.view(*top_class.shape)
                valid_acc += torch.mean(equals.type(torch.FloatTensor))
                delay = time()-batch_start
      
                print("batch size : {}\nbatch per total no of batches : {}/{} \
                \ntrain batch finished : {:.3f} % \ntime lift : {} s\
                \ndelay : {}s \nloss : {}\n\n".format(len(x),
                batch_number, len(valid_loader), batch_number/len(valid_loader)\
                      *100., delay * (len(valid_loader)-batch_number), delay,\
                                                      loss.item()))
                batch_start = time()
                

        train_loss /= len(train_loader)     
        train_acc /= len(train_loader)  

        valid_loss /= len(valid_loader)
        valid_acc /= len(valid_loader)

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)

        if min_valid_loss > valid_loss:
            print ('Validation loss decreased ({:.6f} --> {:.6f}). \
            Saving model ...\n'.format(min_valid_loss, valid_loss))
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), model_file)
        
        delay = time() - epoch_start


        print("Epoch : {} \nTrain Finished : {:.3f} %\nTime Left : {:.3f} s\
        \nTraining Loss : {:.6f} \nValidation Loss : {:.6f} \nTrain Accuracy :\
        {:.3f} %\nValidation Accuracy : {:.6f} %\nDelay : {:.6f} s \n\n".format(
            e,e / n_epochs * 100., delay * (n_epochs - e) ,train_loss, valid_loss
            ,train_acc * 100.,valid_acc * 100.,delay))
        
    return train_losses, train_accs, valid_losses, valid_accs

#Run

In [0]:
batch_size = 1
train_loader = DataLoader(train_data,batch_size=batch_size)
valid_loader = DataLoader(valid_data,batch_size=batch_size)
test_loader = DataLoader(test_data,batch_size=batch_size)

In [0]:
train_losses, train_accs, valid_losses, valid_accs = train(model, train_loader, valid_loader,100,model_file,back_up_model_file)