In [1]:
import os
import pandas as pd
import torch
import clip
from PIL import Image
import numpy as np
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale
)
import torchvision.models as models

In [2]:
torch.cuda.empty_cache()

### CLIP model

In [14]:
device0 = "cuda:0" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device0)

### Resnet-152 model

In [18]:
device1 = "cuda:1" if torch.cuda.is_available() else "cpu"

# Block fc layer
class Identity(torch.nn.Module):
    def forward(self, input_):
        return input_

resnet152 = models.resnet152(pretrained=True)
resnet152.fc = Identity()

resnet_model = resnet152.to(device1)

## Prepare data

In [4]:
video_path = '../data/HowTo100M_sample/1Wz7zVy4nPw.mp4'
video = EncodedVideo.from_path(video_path)


In [5]:
video_data = video.get_clip(0, video.duration)

In [6]:
video_data["video"].shape

torch.Size([3, 1400, 256, 455])

In [7]:
mean=(0.48145466, 0.4578275, 0.40821073)
std=(0.26862954, 0.26130258, 0.27577711)

transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            ShortSideScale(
                size=224
            ),
            CenterCropVideo(224),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std)
        ]
    ),
)

# preprocess.transforms
# [Resize(size=224, interpolation=bicubic),
#  CenterCrop(size=(224, 224)),
#  <function clip.clip._convert_image_to_rgb(image)>,
#  ToTensor(),
#  Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))]


In [8]:
video_data = transform(video_data)

In [9]:
fps = int(video_data["video"].shape[1] / video.duration)
fps

30

In [10]:
indices = list(range(fps//2, video_data["video"].shape[1], fps))

video_data = video_data["video"][:, indices].permute(1,0,2,3)

In [11]:
video_data.shape

torch.Size([47, 3, 224, 224])

## Extract CLIP Features

In [19]:
video_data = video_data.to(device0)

video_features = []
for idx in range(0, video_data.shape[0], 100):
    image_features = model.encode_image(video_data[idx:idx+100])
    video_features.append(image_features.detach().cpu().numpy())

In [20]:
video_features = np.concatenate(video_features, axis=0)
video_features.shape

(47, 512)

## Extract Resnet-152 Features

In [21]:
video_data = video_data.to(device1)

resnet_model(video_data[0])

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 3, 7, 7], but got 3-dimensional input of size [3, 224, 224] instead

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("/ssd_scratch/users/mounika.k/sample_frames/EB-rxt8OLiY/03391.jpg")).unsqueeze(0).to(device)
print('image', image.shape)

with torch.no_grad():
    image_features = model.encode_image(image)
print('image_features', image_features.shape)



image torch.Size([1, 3, 224, 224])
image_features torch.Size([1, 512])


In [1]:
!ls /ssd_scratch/users/mounika.k/sample_frames/

*	     EB-rxt8OLiY  gbOyjWuszFo  IwsHKxqjDaw  oj7BrwGBL3I  rwh-Q0o3ncI
1Wz7zVy4nPw  _F-la_ujkog  IuyUiE0NC08  MVxSDIJJsPU  QjRBDa_i2ls


In [2]:
!ls ../data/HowTo100M_sample/

1Wz7zVy4nPw.mp4  gbOyjWuszFo.mp4  MVxSDIJJsPU.mp4  rwh-Q0o3ncI.mp4
EB-rxt8OLiY.mp4  IuyUiE0NC08.mp4  oj7BrwGBL3I.mp4
_F-la_ujkog.mp4  IwsHKxqjDaw.mp4  QjRBDa_i2ls.mp4


In [16]:
# video_path,feature_path
# absolute_path_video1.mp4,absolute_path_of_video1_features.npy
# absolute_path_video2.webm,absolute_path_of_video2_features.npy

absolute_path_video_dir = '../data/HowTo100M_sample/'
video_files = os.listdir(absolute_path_video_dir)

absolute_path_video_files = [absolute_path_video_dir + file for file in video_files]

absolute_path_video_features_dir = '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/'
absolute_path_video_features_files = [absolute_path_video_features_dir + filename.replace('mp4', 'npy') for filename in video_files]
absolute_path_video_features_files

['/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._gbOyjWuszFo.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/IuyUiE0NC08.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._rwh-Q0o3ncI.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/oj7BrwGBL3I.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/1Wz7zVy4nPw.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._QjRBDa_i2ls.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._MVxSDIJJsPU.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._oj7BrwGBL3I.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/EB-rxt8OLiY.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/gbOyjWuszFo.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/rwh-Q0o3ncI.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M_sample_resnext1013d/._IwsHKxqjDaw.npy',
 '/ssd_scratch/users/mounika.k/HowTo100M

In [17]:
df = pd.DataFrame({'video_path': absolute_path_video_files, 
                   'feature_path': absolute_path_video_features_files})

In [18]:
df.shape

(20, 2)

In [19]:
df.to_csv('../data/HowTo100M_sample_resnext1013d_input.csv', index=False)