In [1]:
import pandas as pd
import os
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import cv2

import wandb

In [None]:
wandb.init(project="Transforming_CV", entity="javiertham")

## Custom Dataset class

1. Choose sequence length for each video
2. sample frames from each video
    - Loop the video if it is too short

Create Dataset to make use of the batching function

In [2]:
class VideoDataset(Dataset):
    '''
    df - dataframe of path to each video and their labels
    '''
    
    def __init__(self, df, seq_len=100):
        super(VideoDataset, self).__init__()
        self.df = df
        self.seq_len = seq_len
        self.transform = self.get_transforms()
    
    def get_transforms(self):
        "for MobileNetv2" 
        return transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        
    def __getitem__(self, idx):
        '''
        adapted from bleedai
        '''
        
        frames_list = []
        video_classes = []
        
        video_path = self.df.iloc[idx, 0]
        video_reader = cv2.VideoCapture(video_path)
        video_class = self.df.iloc[idx, 1]
        
        video_classes.append(video_class)
        
        # Get the total number of frames in the video.
        video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))

        # Calculate the the interval after which frames will be added to the list.
        skip_frames_window = max(int(video_frames_count / self.seq_len), 1)

        for frame_counter in range(self.seq_len):
            # Set the current frame position of the video.
            frame_position = frame_counter * skip_frames_window % video_frames_count
            video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_position)
            success, frame = video_reader.read() 

            if not success:
                break

            processed_frame = self.transform(Image.fromarray(frame))
#             processed_frame = cv2.resize(frame, (224, 224))
            frames_list.append(processed_frame)
            
#             print(processed_frame.shape)
#             print(len(frames_list))

        video_reader.release()
        
#         print(len(frames_list))
            
        return torch.stack(frames_list), video_classes[0]
    
    def __len__(self):
        return len(self.df)

### testing

In [3]:
df = pd.read_csv("../data/cleaned_data.csv")
df = df.loc[:, ['id', 'vid_class']]
df['id'] = df.loc[:, 'id'].apply(lambda x: os.path.join("..", "data", "Charades_v1", f"{x}.mp4"))
df.head()

Unnamed: 0,id,vid_class
0,../data/Charades_v1/46GP8.mp4,147
1,../data/Charades_v1/N11GT.mp4,127
2,../data/Charades_v1/KRF68.mp4,150
3,../data/Charades_v1/MJO7C.mp4,15
4,../data/Charades_v1/S6MPZ.mp4,11


In [None]:
vid_dataset = VideoDataset(df)

In [None]:
dataloader = DataLoader(vid_dataset, batch_size = 2, shuffle = True, num_workers = 0, drop_last = True)

In [None]:
import time

In [None]:
start = time.time()
X, y = next(iter(dataloader))
end = time.time()
print("duration", str(end - start))

In [None]:
X.size()

In [None]:
y

In [None]:
cnn = torchvision.models.MobileNetV2()

In [None]:
cnn = cnn.features

In [None]:
for i, child in enumerate(cnn.children()):
    if i == 0:
        for param in child.parameters():
            print(param.requires_grad)

In [None]:
cnn[:15]

---

In [34]:
class CNNLSTM(nn.Module):
    """
    Creates a CNN-LSTM model from pretrained MobileNetv2
    
    @params
    ---
    freeze_layers: freeze the cnn model parameters from 0:freeze_layers
    lstm_hidden_size: hidden size for the lstm model
    lstm_num_layers: number of layers for the lstm model
    """
    
    def __init__(self, batch_size, freeze_layers, lstm_hidden_size, lstm_num_layers):
        super(CNNLSTM, self).__init__()
        self.cnn = torchvision.models.MobileNetV2().features
        self.lstm = nn.LSTM(1280*7*7, lstm_hidden_size, lstm_num_layers, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, 157)
        
        for param in self.cnn[:freeze_layers].parameters():
            param.requires_grad = False
                
       
    def forward(self, x):
        # batch_size, sequence_length, num_channels, height, width
        B, L, C, H, W = x.size()
        x = x.view(B * L, C, H, W)
        x = self.cnn(x)
        # x.size(0): B * L
        x = x.view(x.size(0), -1)
        x = x.view(B, L, x.size(-1))
        x, (hn, cn) = self.lstm(x)
        x = x[:, -1, :].view(batch_size, -1)
        x = self.fc(x)
        
        return x

In [48]:
def train(model, device, data_loader, criterion, optimizer):
    losses = []
    scores = []
    for batch_idx, data in enumerate(data_loader):
        X, y = data
#         X, y = X.to(device), y.to(device)
#         model.to(device)
        optimizer.zero_grad()
        output = model(X)
        
        print(output.size())
        print(y.size())
        
        loss = criterion(output, y)
        losses.append(loss.item())
        
#         y_pred = torch.max(output, 1)[1]  # y_pred != output
#         step_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy())
#         scores.append(step_score)
        
        loss.backward()
        optimizer.step()
        
#         wandb.log({"loss": loss})
        
    return losses, scores

In [49]:
learning_rate = 0.01
epochs = 1
batch_size = 5
freeze_layers = 26
lstm_hidden_size = 5
lstm_num_layers = 1

In [None]:
wandb.config = {
    "learning_rate": learning_rate,
    "epochs": epochs,
    "batch_size": batch_size,
    "freeze_layers": freeze_layers,
    "rnn_hidden_size": lstm_hidden_size,
    "rnn_num_layers": lstm_num_layers
}

In [50]:
cnnlstm = CNNLSTM(batch_size, freeze_layers, lstm_hidden_size, lstm_num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnnlstm.parameters(), lr=learning_rate)

In [51]:
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

# Data loading parameters
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0} if use_cuda else {}

In [52]:
train_data = VideoDataset(df)
dataloader = DataLoader(train_data, **params)

In [53]:
for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(cnnlstm, device, dataloader, criterion, optimizer)

torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
torch.Size([5, 157])
torch.Size([5])
t

KeyboardInterrupt: 

In [None]:
for i, data in enumerate(dataloader):
    print(data[0].size())
    break

In [None]:
wandb.finish()