In [37]:
import os
import io

import clip
import einops
import torch
import numpy as np
import torch
import webdataset as wds

from torch.utils.data import DataLoader


import pandas as pd

In [38]:
!pip3.8 install clip
!pip3.8 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip3.8 install webdataset
!pip3.8 install pandas
!pip3.8 install einops

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# DataLoader

In [40]:
# Dataset code:
"""
utils for processing datasets of format described in https://github.com/iejMac/clip-video-encode/pull/13
used https://github.com/rom1504/laion-prepro/blob/main/laion5B/usage_guide/dataloader_pytorch.py as template
"""


def standardize_embedding_shape(emb, seq_len):
    """
    Truncate or zero-pad an embedding sequence along its first axis to exactly seq_len entries.
    Input:
        emb: numpy array whose first axis is the sequence axis; trailing axes (if any) are kept
        seq_len: target length of the first axis
    Output:
        numpy array of shape (seq_len, *emb.shape[1:]) with the same dtype as emb
    """
    if len(emb) > seq_len:
        print(f"Warning: Raw embedding is longer than standard sequence length ({len(emb)} > {seq_len})")
        emb = emb[:seq_len]

    # Derive the padding from emb.shape[1:] (not emb.shape[1]) so that 1-D inputs and inputs
    # with more than two axes are padded instead of raising.
    pad = np.zeros((seq_len - len(emb), *emb.shape[1:]), dtype=emb.dtype)
    return np.concatenate([emb, pad])


def create_embeddingwebdataset(
    urls,
    embedding_transform=lambda emb: emb,
    standard_seq_len=-1,
    to_tensor=True,
    enable_text=True,
    enable_meta=True,
):
    """
    Build a WebDataset pipeline over a Frame Embedding Dataset.
    Input:
        urls: shard location(s), handed straight to wds.WebDataset
        embedding_transform: callable applied to each embedding after padding / tensor conversion
        standard_seq_len: sequence length to pad all embedding sequences to (for batching)
            -1: don't pad (dataset can't be used in DataLoader with batch_size > 1)
            any other value: pad to standard_seq_len
        to_tensor: convert the numpy embedding to a torch tensor before embedding_transform
        enable_text: include text captions (decoded string under "text", tokens under "text_tokens")
        enable_meta: include metadata (decoded string under "meta")
    Output:
        the WebDataset with per-sample preprocessing mapped over it; samples whose
        preprocessing raises are handled by wds.handlers.warn_and_continue
    """

    # TODO: different tokenizers??
    def tokenizer(text):
        return clip.tokenize([text], truncate=True)[0]

    def preprocess_dataset(item):
        # the "npy" entry holds the raw bytes of a .npy file
        emb = np.lib.format.read_array(io.BytesIO(item["npy"]))

        if standard_seq_len != -1:
            emb = standardize_embedding_shape(emb, standard_seq_len)
        if to_tensor:
            emb = torch.from_numpy(emb)

        output = {"embeddings": embedding_transform(emb)}

        if enable_text:
            caption = item["cap"].decode("utf-8")
            output["text"] = caption
            output["text_tokens"] = tokenizer(caption)
        if enable_meta:
            output["meta"] = item["json"].decode("utf-8")
        return output

    return wds.WebDataset(urls).map(preprocess_dataset, handler=wds.handlers.warn_and_continue)


def dataset_to_dataloader(dataset, batch_size, num_prepro_workers):
    """
    Converts WebDataset to PyTorch DataLoader.
    Input:
        dataset: dataset to iterate over
        batch_size: number of samples per batch
        num_prepro_workers: number of DataLoader worker processes (0 loads in the main process)
    Output:
        DataLoader over dataset (no shuffling, pin_memory enabled)
    """
    loader_kwargs = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": num_prepro_workers,
        "pin_memory": True,
    }
    # prefetch_factor only applies to worker processes; recent PyTorch releases raise a
    # ValueError when it is set together with num_workers=0, so pass it only when workers exist.
    if num_prepro_workers > 0:
        loader_kwargs["prefetch_factor"] = 2

    return DataLoader(dataset, **loader_kwargs)


class EmbeddingWebDatasetReader:
    """Iterable that serves batches of an embedding WebDataset through a DataLoader"""

    def __init__(
        self,
        urls,
        standard_seq_len,
        batch_size,
        num_prepro_workers,
        to_tensor=True,
        enable_text=True,
        enable_meta=False,
        embedding_transform=lambda emb: emb,
    ):
        """
        Input:
            urls, embedding_transform, standard_seq_len, to_tensor, enable_text, enable_meta:
                forwarded unchanged to create_embeddingwebdataset
            batch_size, num_prepro_workers: forwarded to dataset_to_dataloader
        """
        self.batch_size = batch_size
        self.dataloader = dataset_to_dataloader(
            create_embeddingwebdataset(
                urls,
                embedding_transform=embedding_transform,
                standard_seq_len=standard_seq_len,
                to_tensor=to_tensor,
                enable_text=enable_text,
                enable_meta=enable_meta,
            ),
            batch_size,
            num_prepro_workers,
        )

    def __iter__(self):
        # delegate to the underlying DataLoader, one batch at a time
        yield from self.dataloader

# Modeling

In [41]:
"""
Positional Encodings
"""
import math
import torch

from einops import rearrange, repeat
from torch import nn


class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal positional encodings to a sequence-first tensor, then applies dropout.

    Args:
        d_model: embedding dimension (even or odd)
        dropout: dropout probability applied after the encodings are added
        max_len: number of positions for which encodings are precomputed
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine slot than sine slot, so trim the
        # angles to d_model // 2 columns; for even d_model this slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        # registered as a buffer: follows .to(device) and is saved in the state dict, but is not trained
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        Returns:
            Tensor of the same shape: x plus the encodings of positions 0..seq_len-1, after dropout
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
    
"""
Transformer for encoding sequences of frame embeddings
"""

# source - https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py
class PreNorm(nn.Module):
    """Applies LayerNorm over the last `dim` features, then calls the wrapped callable `fn`."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        # extra keyword arguments are passed through to the wrapped callable untouched
        normed = self.norm(x)
        return self.fn(normed, **kwargs)


class FeedForward(nn.Module):
    """MLP block: Linear(dim -> hidden_dim) -> GELU -> Dropout -> Linear(hidden_dim -> dim) -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        # the two linear layers are created in forward order so parameter initialisation is unchanged
        expand = nn.Linear(dim, hidden_dim)
        contract = nn.Linear(hidden_dim, dim)
        stages = [expand, nn.GELU(), nn.Dropout(dropout), contract, nn.Dropout(dropout)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        return out


class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention over inputs laid out as (b, n, dim)."""

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = heads * dim_head
        # a single head whose width already equals dim needs no output projection
        needs_projection = heads != 1 or dim_head != dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # one bias-free linear layer produces queries, keys and values side by side
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        if needs_projection:
            self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
        else:
            self.to_out = nn.Identity()

    def forward(self, x):
        def split_heads(t):
            return rearrange(t, 'b n (h d) -> b h n d', h = self.heads)

        q, k, v = (split_heads(part) for part in self.to_qkv(x).chunk(3, dim = -1))

        # (b, h, n, n) similarity scores, scaled by dim_head ** -0.5
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.dropout(self.attend(scores))

        # weighted sum of values, then merge the heads back into the feature axis
        merged = rearrange(torch.matmul(weights, v), 'b h n d -> b n (h d)')
        return self.to_out(merged)


class Transformer(nn.Module):
    """
    Stack of `depth` layers; each layer is a pre-normalised attention sublayer followed by a
    pre-normalised feed-forward sublayer, and each sublayer's output is added to its input.
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()

        def build_layer():
            attention = PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))
            feed_forward = PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            return nn.ModuleList([attention, feed_forward])

        self.layers = nn.ModuleList([build_layer() for _ in range(depth)])

    def forward(self, x):
        for attention, feed_forward in self.layers:
            # residual connections around both sublayers
            x = attention(x) + x
            x = feed_forward(x) + x
        return x

class VideoEmbeddingTransformer(nn.Module):
    """
    Encodes a batch of frame-embedding sequences into one vector per sequence.

    Args:
        dim: feature size of each frame embedding (and of the transformer)
        depth, heads, dim_head, mlp_dim: forwarded to Transformer
        proj_dim: if not None, the pooled embedding is passed through a two-layer MLP
            (dim -> (dim + proj_dim) // 2 -> proj_dim); if None, no projection is applied
        dropout: dropout probability used by the transformer and the positional encoding
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, proj_dim=None, dropout = 0.):
        super().__init__()
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
        self.pos_encoding = PositionalEncoding(dim, dropout)

        self.proj = None if proj_dim is None else nn.Sequential(
            nn.Linear(dim, (dim+proj_dim)//2),
            nn.GELU(),
            nn.Linear((dim+proj_dim)//2, proj_dim),
        )

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, dim]
        Returns:
            Tensor, shape [batch_size, proj_dim] (or [batch_size, dim] when proj_dim is None)
        """
        # PositionalEncoding is sequence-first ([seq_len, batch_size, dim]) while the attention
        # layers are batch-first. Feeding the batch-first tensor directly would index the
        # encodings by batch position and broadcast them over every frame, so swap the first
        # two axes around the encoding step.
        x = self.pos_encoding(x.transpose(0, 1)).transpose(0, 1)
        x = self.transformer(x)

        x = x[..., 0, :] # first embed = video embedding

        if self.proj is not None:
            x = self.proj(x)

        return x

In [49]:
# Dataset root. Set the WDS_KINETICS_DIR environment variable to point somewhere else;
# the original hard-coded location stays the default so existing setups keep working.
DATA_DIR = os.environ.get("WDS_KINETICS_DIR", "/home/iejmac/wds_kinetics")
# splits.csv is read for its "split" and "tar_file" columns below
splits = pd.read_csv(os.path.join(DATA_DIR, "splits.csv"))

In [50]:
def _tars_for_split(split_name):
    """Return (tar names, full .tar paths under DATA_DIR) for the rows of `splits` in the given split."""
    names = splits[splits["split"] == split_name]["tar_file"].tolist()
    paths = [os.path.join(DATA_DIR, name + ".tar") for name in names]
    return names, paths


train_tars, train_tars_paths = _tars_for_split("train")
val_tars, val_tars_paths = _tars_for_split("val")

In [56]:
# Class vocabulary: the distinct values of the "label" column of the training annotations.
# A label's position in this list is used as its class index.
train_annotations = pd.read_csv(os.path.join(DATA_DIR, "annotations/train.csv"))
all_labels = train_annotations["label"].unique().tolist()
len(all_labels)

700

# Training

In [57]:
# MODEL PARAMS:
# (passed by keyword to VideoEmbeddingTransformer when the model is built)
DIM = 512  # feature size of the transformer (`dim`)
DEPTH = 12  # number of attention + feed-forward layers (`depth`)
HEADS = 8  # attention heads per layer (`heads`)
DIM_HEAD = 128  # width of each attention head (`dim_head`); heads * dim_head = 1024 inner features
MLP_DIM = 512  # hidden size of each feed-forward block (`mlp_dim`)
PROJ_DIM = 700  # output size of the projection head (`proj_dim`); equals the label count shown for all_labels
DROPOUT=0.0  # dropout probability (`dropout`)

In [61]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Architecture settings come from the MODEL PARAMS constants.
model_kwargs = dict(
    dim=DIM,
    depth=DEPTH,
    heads=HEADS,
    dim_head=DIM_HEAD,
    mlp_dim=MLP_DIM,
    proj_dim=PROJ_DIM,
    dropout=DROPOUT,
)
tf = VideoEmbeddingTransformer(**model_kwargs).to(device)

In [59]:
# from open_clip/training/scheduler.py
def assign_learning_rate(optimizer, new_lr):
    """Overwrite the "lr" entry of every parameter group of `optimizer` with `new_lr`."""
    for group in optimizer.param_groups:
        group.update(lr=new_lr)


def _warmup_lr(base_lr, warmup_length, step):
    return base_lr * (step + 1) / warmup_length


def cosine_lr(optimizer, base_lr, warmup_length, steps):
    """
    Build a schedule function: linear warmup for the first `warmup_length` steps, then
    half-cosine decay from `base_lr` towards 0 at step `steps`.

    The returned function takes a step index, writes the resulting learning rate into
    `optimizer` via assign_learning_rate, and also returns it.
    """
    decay_span = steps - warmup_length

    def _lr_adjuster(step):
        in_warmup = step < warmup_length
        if in_warmup:
            lr = _warmup_lr(base_lr, warmup_length, step)
        else:
            progress = step - warmup_length
            lr = 0.5 * (1 + np.cos(np.pi * progress / decay_span)) * base_lr
        assign_learning_rate(optimizer, lr)
        return lr

    return _lr_adjuster

In [64]:
LR = 5e-4  # base learning rate: initial Adam lr and peak of the cosine schedule
WEIGHT_DECAY = 0.0  # weight decay meant for the optimizer
GRAD_CLIP = 1.0  # max gradient norm used with clip_grad_norm_ in the training loop
LAMBDA = 0.8  # decay base of the exponential LambdaLR alternative (commented out in the optimizer cell)
EPOCHS = 20  # number of passes over train_reader

WARMUP_STEPS = 1000  # length of the linear warmup given to cosine_lr
ALL_STEPS = 120000  # total step count given to cosine_lr (where the cosine decay ends)

SEQ_LEN = 25  # every embedding sequence is padded/truncated to this many entries
BATCH_SIZE = 128  # samples per batch for both readers
NUM_PREPRO = 6  # DataLoader worker processes per reader

# global optimisation-step counter, advanced by the training loop; re-running this cell resets it
step = 0

In [62]:
# Classification loss over the logits produced by the projection head.
loss_f = torch.nn.CrossEntropyLoss()
# Take the weight decay from the WEIGHT_DECAY constant instead of a duplicated literal, so
# changing the constant in the hyperparameter cell actually takes effect.
opt = torch.optim.Adam(tf.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# Alternative exponential schedule (unused):
# lr_schedule = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda step: LAMBDA**step)
# lr_schedule(step) sets the learning rate of `opt` for that step and returns it.
lr_schedule = cosine_lr(opt, LR, WARMUP_STEPS, ALL_STEPS)

In [65]:
# Settings shared by the validation and training readers.
reader_kwargs = dict(
    standard_seq_len=SEQ_LEN,
    batch_size=BATCH_SIZE,
    num_prepro_workers=NUM_PREPRO,
    to_tensor=True,
    enable_text=True,
    embedding_transform=lambda emb: emb,
)

# Only the validation reader also yields the per-sample metadata.
val_reader = EmbeddingWebDatasetReader(urls=val_tars_paths, enable_meta=True, **reader_kwargs)
train_reader = EmbeddingWebDatasetReader(urls=train_tars_paths, enable_meta=False, **reader_kwargs)

In [None]:
LOG_EVERY = 100  # number of optimisation steps averaged into each printed loss

# Label -> class index lookup built once, replacing a linear all_labels.index() scan per sample.
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}

tf.train()  # make sure dropout is active even if the model was left in eval mode
running_loss = 0.0
for e in range(EPOCHS):
    print(f"Epoch {e}...")
    for b in train_reader:
        # The schedule is 0-based (_warmup_lr adds 1 itself), so set the learning rate
        # before advancing the global step counter.
        lr_schedule(step)

        embeddings = b["embeddings"].float().to(device)
        # captions are the class names; map them to integer targets for CrossEntropyLoss
        labs = torch.tensor([label_to_idx[l] for l in b["text"]], dtype=torch.long, device=device)

        opt.zero_grad()

        pred = tf(embeddings)
        loss = loss_f(pred, labs)

        loss.backward()

        # clip grads:
        torch.nn.utils.clip_grad_norm_(tf.parameters(), GRAD_CLIP)

        opt.step()
        step += 1

        running_loss += loss.item()

        # report every LOG_EVERY completed steps so the printed value is a true average
        # (the old `(step + 1) % 100` check fired after 99 steps but divided by 100)
        if step % LOG_EVERY == 0:
            print(f"epoch {e} : step {step} average loss = {running_loss/LOG_EVERY}")
            running_loss = 0.0



Epoch 0...
here




In [67]:
# Label -> class index lookup built once, replacing a linear all_labels.index() scan per sample.
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}

correct = 0  # number of validation samples whose arg-max prediction matches the label
all_ = 0  # number of validation samples seen

# Evaluate in eval mode (dropout off) and restore whatever mode the model was in afterwards,
# even if the loop is interrupted.
was_training = tf.training
tf.eval()
try:
    with torch.no_grad():
        for val_b in val_reader:
            embeddings = val_b["embeddings"].float().to(device)
            labs = torch.tensor([label_to_idx[l] for l in val_b["text"]], dtype=torch.long)

            pred = tf(embeddings).cpu()
            pred_cls = torch.argmax(pred, dim=-1)

            all_ += len(labs)
            # .item() keeps `correct` a plain Python int instead of a 0-dim tensor
            correct += (labs == pred_cls).sum().item()
finally:
    tf.train(was_training)



KeyboardInterrupt: 

In [69]:
# Validation accuracy: correct predictions divided by samples seen. Both names are set by the
# evaluation cell above, which must have run in the current kernel first (the recorded
# NameError shows `all_` was not defined when this cell was last executed).
print(correct/all_)

NameError: name 'all_' is not defined