<a href="https://colab.research.google.com/github/MellowMello/EpicKitchens/blob/main/EpicKitchens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pretrainedmodels



In [None]:
import torch.hub
# GitHub repository that provides the torch.hub entrypoints loaded below.
repo = 'epic-kitchens/action-models'

# Passed to every entrypoint as a 2-tuple; the entrypoint help text printed
# further down describes a 2-tuple as the verb+noun multi-task configuration.
class_counts = (125, 352)
segment_count = 8
base_model = 'resnet50'
# Only this first load passes force_reload=True; the loads that follow do not
# request a refresh of the hub repository.
tsn = torch.hub.load(repo, 'TSN', class_counts, segment_count, 'RGB',
                     base_model=base_model, 
                     pretrained='epic-kitchens', force_reload=True)
trn = torch.hub.load(repo, 'TRN', class_counts, segment_count, 'RGB',
                     base_model=base_model, 
                     pretrained='epic-kitchens')
mtrn = torch.hub.load(repo, 'MTRN', class_counts, segment_count, 'RGB',
                     base_model=base_model, 
                      pretrained='epic-kitchens')
# TSM variant is disabled (kept commented out together with its entry in the loop below).
#tsm = torch.hub.load(repo, 'TSM', class_counts, segment_count, 'RGB',
 #                    base_model=base_model, 
  #                   pretrained='epic-kitchens')

# Show all entrypoints and their help strings
for entrypoint in torch.hub.list(repo):
    print(entrypoint)
    print(torch.hub.help(repo, entrypoint))

batch_size = 1
# NOTE(review): re-declares segment_count with the same value assigned above.
segment_count = 8
snippet_length = 1  # Number of frames composing the snippet, 1 for RGB, 5 for optical flow
snippet_channels = 3  # Number of channels in a frame, 3 for RGB, 2 for optical flow
height, width = 224, 224

# Random tensor used purely as a stand-in input for the shape demonstration below.
inputs = torch.randn(
    [batch_size, segment_count, snippet_length, snippet_channels, height, width]
)
# The segment and snippet length and channel dimensions are collapsed into the channel
# dimension
# Input shape: N x TC x H x W
inputs = inputs.reshape((batch_size, -1, height, width))
for model in [tsn, trn, mtrn]: #, tsm]:
    # You can get features out of the models
    features = model.features(inputs)
    # and then classify those features
    verb_logits, noun_logits = model.logits(features)
    
    # or just call the object to classify inputs in a single forward pass
    # (this overwrites the logits obtained from the two-step path above)
    verb_logits, noun_logits = model(inputs)
    print(verb_logits.shape, noun_logits.shape)

Downloading: "https://github.com/epic-kitchens/action-models/archive/master.zip" to /root/.cache/torch/hub/master.zip
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master


Multi-Scale Temporal Relation Network Module in use ['8-frame relation', '7-frame relation', '6-frame relation', '5-frame relation', '4-frame relation', '3-frame relation', '2-frame relation']


Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master
Using cache found in /root/.cache/torch/hub/epic-kitchens_action-models_master


MTRN

    Multi-scale Temporal Relational Network

    See https://arxiv.org/abs/1711.08496 for more details.
    Args:
        num_class:
            Number of classes, can be either a single integer,
            or a 2-tuple for training verb+noun multi-task models
        num_segments:
            Number of frames/optical flow stacks input into the model
        modality:
            Either ``RGB`` or ``Flow``.
        base_model:
            Backbone model architecture one of ``resnet18``, ``resnet30``,
            ``resnet50``, ``BNInception``, ``InceptionV3``, ``VGG16``.
            ``BNInception`` and ``resnet50`` are the most thoroughly tested.
        new_length:
            The number of channel inputs per snippet
        consensus_type:
            The consensus function used to combined information across segments.
            One of ``avg``, ``max``, ``TRN``, ``TRNMultiscale``.
        before_softmax:
            Whether to output class score before or after softmax.
     

In [None]:
from tsn import TSN, TRN, MTRN
import torch

# Checkpoint to restore. To load a different variant, switch both the path and
# the constructor below (the commented alternatives are kept for that purpose).
#model_path = '/content/drive/My Drive/2tw6gdvmfj3f12papdy24flvmo/TSN_arch=resnet50_modality=RGB_segments=8-3ecf904f.pth.tar'
model_path = '/content/drive/My Drive/2tw6gdvmfj3f12papdy24flvmo/MTRN_arch=resnet50_modality=RGB_segments=8-46337796.pth.tar'

verb_class_count, noun_class_count = 125, 352
class_count = (verb_class_count, noun_class_count)
# map_location='cpu' lets the checkpoint load on a runtime without a GPU even if
# its tensors were saved from CUDA; the model is built and used on the CPU here.
# NOTE(review): torch.load unpickles arbitrary Python objects (ckpt['args'] is one),
# so only load checkpoints that come from a trusted source.
ckpt = torch.load(model_path, map_location='cpu')
#model = TSN(
#model = TRN(
model = MTRN(
    num_class=class_count,
    # Architecture settings are read back from the checkpoint itself so the
    # rebuilt network matches the stored weights.
    num_segments=ckpt['segment_count'],
    modality=ckpt['modality'],
    base_model=ckpt['arch'],
    dropout=ckpt['args'].dropout
)
# Left as the last expression so the key-matching report is shown as cell output.
model.load_state_dict(ckpt['state_dict'])

Multi-Scale Temporal Relation Network Module in use ['8-frame relation', '7-frame relation', '6-frame relation', '5-frame relation', '4-frame relation', '3-frame relation', '2-frame relation']


<All keys matched successfully>

In [None]:
import tarfile
# Unpack the frame archive stored on Drive into ./P01_13/ under the working directory.
tar_file = tarfile.open('/content/drive/My Drive/P01_13.tar')
with tar_file:
  tar_file.extractall('./P01_13/')

In [None]:
from PIL import Image
import os
import numpy as np
import torchvision.transforms as transforms

def create_dataset(image_dir, frame_limit = 1000):
    """Load up to ``frame_limit`` frames from ``image_dir`` into a single tensor.

    Frames are taken in sorted filename order, converted once each with
    ``transforms.ToTensor()`` and concatenated along dimension 0.

    Args:
        image_dir: Directory containing the frame image files (with or without
            a trailing separator).
        frame_limit: Maximum number of frames to load.

    Returns:
        One tensor holding every loaded frame concatenated along dim 0; for
        equally sized C-channel frames this is ``(num_frames * C, H, W)``.
        NOTE(review): confirm this layout is what the consuming model expects.

    Raises:
        ValueError: If no frame could be selected from ``image_dir``.
    """
    # os.listdir returns entries in arbitrary order; sort so frames keep their
    # temporal order, then keep at most frame_limit of them.
    frame_names = sorted(os.listdir(image_dir))[:max(frame_limit, 0)]
    if not frame_names:
        raise ValueError('no frames found in {!r}'.format(image_dir))

    to_tensor = transforms.ToTensor()
    frames = []
    for frame_name in frame_names:
        # The context manager closes the file once the pixel data is converted.
        with Image.open(os.path.join(image_dir, frame_name)) as image:
            frames.append(to_tensor(image))

    # A single concatenation at the end avoids re-copying the growing tensor
    # on every iteration.
    return torch.cat(frames)

In [None]:
# Directory the frame archive was extracted into.
img_dir = 'P01_13/'
# Load the frames into one tensor, using create_dataset's default frame_limit.
images = create_dataset(img_dir)

In [None]:
# Inference only: run the forward pass without autograd so no computation graph
# is retained for the whole frame tensor.
with torch.no_grad():
    output = model(images)

In [None]:
def obtain_ids(output):
  """Return the arg-max index of every row in both score collections.

  Args:
    output: Indexable pair; element 0 holds the verb scores and element 1 the
      noun scores, each iterated row by row.

  Returns:
    ``(verbids, nounids)``: two lists of Python ints, one index per row.
  """
  def _top_indices(scores):
    # One arg-max per row, converted to a plain int.
    return [int(torch.argmax(row)) for row in scores]

  verbids = _top_indices(output[0])
  nounids = _top_indices(output[1])
  return verbids, nounids

In [None]:
import csv

# Default class-list locations; kept identical to the paths this notebook has always used.
_NOUN_CLASSES_FILE = '/content/drive/My Drive/epic-kitchens-55-annotations-master/EPIC_noun_classes.csv'
_VERB_CLASSES_FILE = '/content/drive/My Drive/epic-kitchens-55-annotations-master/EPIC_verb_classes.csv'


def _lookup_class_names(class_file, class_ids):
  """Return the second-column value of ``class_file`` for each id in ``class_ids``."""
  # newline='' is the csv-module recommended way to open files for csv.reader.
  with open(class_file, newline='') as handle:
    rows = list(csv.reader(handle))
  # Row 0 is skipped as the header, so class id k is read from row k + 1.
  return [rows[class_id + 1][1] for class_id in class_ids]


def obtain_action(verbids, nounids, noun_file=_NOUN_CLASSES_FILE, verb_file=_VERB_CLASSES_FILE):
  """Translate verb and noun class ids into the names listed in the class CSVs.

  Args:
    verbids: Iterable of integer verb class ids.
    nounids: Iterable of integer noun class ids.
    noun_file: CSV with one header row followed by one row per noun class in
      id order; the name is taken from the second column.
    verb_file: Same layout as ``noun_file``, for verb classes.

  Returns:
    ``(verbs, nouns)``: two lists of names, in the order of the given ids.

  Raises:
    IndexError: If an id has no corresponding row in its CSV file.
    OSError: If a class file cannot be opened.
  """
  nouns = _lookup_class_names(noun_file, nounids)
  verbs = _lookup_class_names(verb_file, verbids)
  return verbs, nouns

In [None]:
# Reduce the model output to class indices, then map the indices to class names.
testverbids, testnounids = obtain_ids(output)
testverb, testnoun = obtain_action(testverbids, testnounids)

# Print the indices first, followed by the names they map to.
print(testverbids)
print(testnounids)
print(testverb)
print(testnoun)

[1, 4, 4, 4, 4, 1, 3, 0, 2, 1, 0, 0, 0, 0, 0, 1, 4, 4, 2]
[72, 11, 4, 19, 7, 1, 8, 11, 43, 16, 1, 11, 1, 4, 4, 63, 32, 11, 82]
['put', 'wash', 'wash', 'wash', 'wash', 'put', 'close', 'take', 'open', 'put', 'take', 'take', 'take', 'take', 'take', 'put', 'wash', 'wash', 'open']
['leaf', 'lid', 'plate', 'board:chopping', 'spoon', 'pan', 'cupboard', 'lid', 'milk', 'glass', 'pan', 'lid', 'pan', 'plate', 'plate', 'tray', 'cloth', 'lid', 'machine:washing']


In [None]:
from torchvision.transforms import Compose
from transforms import GroupScale, GroupCenterCrop, GroupOverSample, Stack, ToTorchFormatTensor, GroupNormalize

crop_count = 10
net = model
backbone_arch = 'resnet50'

# Fail early on anything other than the two crop counts that have a pipeline.
if crop_count not in (1, 10):
    raise ValueError("Only 1 and 10 crop_count are supported while we got {}".format(crop_count))

if crop_count == 10:
    # Oversampling path, configured with the network's input and scale sizes.
    cropping = GroupOverSample(net.input_size, net.scale_size)
else:
    # Single-crop path: scale first, then centre-crop to the network input size.
    cropping = Compose([GroupScale(net.scale_size), GroupCenterCrop(net.input_size)])

# Stack rolls and ToTorchFormatTensor skips its division only for the BNInception backbone.
uses_bninception = backbone_arch == 'BNInception'

transform = Compose([
    cropping,
    Stack(roll=uses_bninception),
    ToTorchFormatTensor(div=not uses_bninception),
    GroupNormalize(net.input_mean, net.input_std),
])

  "please use transforms.Resize instead.")


In [None]:
from torch.utils.data import Dataset, DataLoader

class CookingDataset(Dataset):
  """Clips of consecutive frames paired with the two label values of each clip.

  Every data row of the label CSV supplies, in its first four columns, a start
  frame index, a stop frame index (exclusive) and two label values. Frames are
  addressed by their position in the sorted listing of ``cooking_dir``. All
  clips are opened and transformed eagerly in the constructor.

  Attributes:
    x: List with one transformed clip per label row.
    y: Float tensor of shape ``(nsamples, 2)`` built from label columns 2 and 3.
    nsamples: Number of label rows.
  """

  DEFAULT_LABEL_FILE = '/content/drive/My Drive/COOKING_action_labels.csv'
  DEFAULT_COOKING_DIR = '/content/drive/My Drive/COOKING POV/'

  def __init__(self, label_file=DEFAULT_LABEL_FILE, cooking_dir=DEFAULT_COOKING_DIR,
               clip_transform=None):
    """Read the label file and build every transformed clip.

    Args:
      label_file: CSV with one header row; columns 0-3 are start frame, stop
        frame, and the two label values.
      cooking_dir: Directory containing the frame images.
      clip_transform: Callable applied to each list of PIL images. When omitted,
        the notebook-level ``transform`` pipeline is used.
    """
    if clip_transform is None:
      clip_transform = transform
    image_names = sorted(os.listdir(cooking_dir))

    # ndmin=2 keeps the array two-dimensional even when the file has a single
    # data row, so the column slicing below always works.
    xy = np.loadtxt(label_file, delimiter=',', dtype=np.float32, skiprows=1,
                    usecols=(0, 1, 2, 3), ndmin=2)
    self.y = torch.from_numpy(xy[:, [2, 3]])
    self.nsamples = xy.shape[0]

    start_frames = xy[:, 0].astype(int)
    stop_frames = xy[:, 1].astype(int)
    self.x = []
    for start, stop in zip(start_frames, stop_frames):
      # Explicit indexing (not slicing) so an out-of-range frame index still
      # raises instead of silently shortening the clip.
      images = [
          Image.open(os.path.join(cooking_dir, image_names[j]))
          for j in range(start, stop)
      ]
      self.x.append(clip_transform(images))

  def __getitem__(self, index):
    """Return the ``(clip, labels)`` pair stored at ``index``."""
    return self.x[index], self.y[index]

  def __len__(self):
    """Return the number of labelled clips."""
    return self.nsamples

In [None]:
# Constructing the dataset opens and transforms every labelled clip up front.
cooking_dataset = CookingDataset()

In [None]:
# One clip per batch, served in dataset order, with loading done in the main process.
cooking_dataloader = DataLoader(dataset=cooking_dataset, batch_size=1, shuffle=False, num_workers=0)

In [None]:
def custom_loss(prediction, label):
  """Mean of the verb and noun MSE between the scores and one-hot targets.

  Only the first row of each score tensor and the first row of ``label`` are
  used. The returned loss stays connected to the autograd graph of
  ``prediction``, so calling ``backward()`` on it produces gradients for the
  model that generated the scores.

  Args:
    prediction: Pair ``(verb_scores, noun_scores)``; the first row of each is
      a 1-D score vector over the classes.
    label: Indexable such that ``label[0][0]`` is the verb class index and
      ``label[0][1]`` the noun class index.

  Returns:
    Scalar tensor: the average of the two mean-squared errors.
  """
  verb_scores = prediction[0][0]
  noun_scores = prediction[1][0]

  # One-hot targets built with the dtype/device of the scores, so the loss is
  # not promoted to float64 and no device mismatch can occur.
  verb_target = torch.zeros_like(verb_scores)
  noun_target = torch.zeros_like(noun_scores)
  verb_target[int(label[0][0])] = 1
  noun_target[int(label[0][1])] = 1

  verb_mse = torch.mean((verb_target - verb_scores) ** 2)
  noun_mse = torch.mean((noun_target - noun_scores) ** 2)

  # Combine with tensor arithmetic: wrapping the two values in a new
  # torch.tensor(...) would create a leaf detached from the model's graph.
  return (verb_mse + noun_mse) / 2

In [None]:
import torch.nn as nn
from statistics import mean

# Hyper-parameters for the fine-tuning run below.
learning_rate = 0.01
num_epochs = 10

#loss = nn.MSELoss()
# Plain SGD over every parameter of the model.
optimiser = torch.optim.SGD(model.parameters(), lr = learning_rate)

for epoch in range(num_epochs):
  # Per-batch loss values, collected to report the epoch average.
  epoch_loss = []
  for i, (actions, label) in enumerate(cooking_dataloader):
    prediction = model(actions)

    # NOTE(review): the step below only changes the weights if custom_loss
    # returns a tensor that is still connected to the model's autograd graph — verify.
    loss = custom_loss(prediction, label)
    epoch_loss.append(float(loss))
    loss.backward()

    # Apply the update, then clear the gradients so they do not accumulate
    # into the next batch.
    optimiser.step()
    optimiser.zero_grad()

  # NOTE(review): epoch_loss is rebound from list to ndarray to scalar mean here.
  epoch_loss = np.array(epoch_loss)
  epoch_loss = mean(epoch_loss)
  print(f'epoch {epoch+1}/{num_epochs}: loss = {epoch_loss:.8f}')
