In [1]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
from torchvision.io.video import read_video
from torchinfo import summary
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from torchmetrics.classification import BinaryPrecision, BinaryRecall
import os
import PIL
from tqdm.notebook import tqdm
import lightning.pytorch as L
from sklearn.metrics import classification_report

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Import from PyTorchVideo

In [3]:
import pytorchvideo
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RemoveKey,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    Resize,
)



# Import WandB

In [4]:
import wandb

wandb_logger = L.loggers.WandbLogger(
    project="3DCNN_attention_frame_error",
    tags=['frame error','capture/emulator'],
    config={
        "learning_rate": 0.001,
        "epochs": 100,
        "batch_size": 64,
    },
    name="testing_3DCNN_more_nonlocal_blocks"
)

[34m[1mwandb[0m: Currently logged in as: [33mngkhaiphu[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112911111109094, max=1.0…

In [5]:
def get_data(path):
    video_list = []
    
    for root, dirs, files in os.walk(path):
        for name in files:
            video_list.append(os.path.join(root, name))

    return video_list

In [6]:
root = './data/'
lst = get_data(root)
lst

['./data/black screen when trigger Siri/black screen when trigger Siri_585_600_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_180_195_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_640_655_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1245_1260_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_215_230_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_410_425_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_320_335_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1085_1100_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_775_790_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_860_875_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_75_90_0.mp4',
 './data/black screen when trigger Siri/b

In [7]:
df = pd.DataFrame(lst,columns=["path"])

In [8]:
df

Unnamed: 0,path
0,./data/black screen when trigger Siri/black sc...
1,./data/black screen when trigger Siri/black sc...
2,./data/black screen when trigger Siri/black sc...
3,./data/black screen when trigger Siri/black sc...
4,./data/black screen when trigger Siri/black sc...
...,...
6313,./data/CP map CP/CP map CP_95_110_0.mp4
6314,./data/CP map CP/CP map CP_225_240_0.mp4
6315,./data/CP map CP/CP map CP_105_120_0.mp4
6316,./data/CP map CP/CP map CP_310_325_0.mp4


In [9]:
index = np.random.choice(range(5717), 5717-603, replace=False)
index

array([3551, 1503, 5583, ...,  139, 4050, 5293])

In [10]:
for i in range(len(df)):
    df.loc[i,"label"] = df.loc[i,"path"].split('.')[-2][-1]
df

Unnamed: 0,path,label
0,./data/black screen when trigger Siri/black sc...,0
1,./data/black screen when trigger Siri/black sc...,0
2,./data/black screen when trigger Siri/black sc...,0
3,./data/black screen when trigger Siri/black sc...,0
4,./data/black screen when trigger Siri/black sc...,0
...,...,...
6313,./data/CP map CP/CP map CP_95_110_0.mp4,0
6314,./data/CP map CP/CP map CP_225_240_0.mp4,0
6315,./data/CP map CP/CP map CP_105_120_0.mp4,0
6316,./data/CP map CP/CP map CP_310_325_0.mp4,0


In [11]:
df2 = df[df['label'] == '0'].reset_index(drop = True)
df2

Unnamed: 0,path,label
0,./data/black screen when trigger Siri/black sc...,0
1,./data/black screen when trigger Siri/black sc...,0
2,./data/black screen when trigger Siri/black sc...,0
3,./data/black screen when trigger Siri/black sc...,0
4,./data/black screen when trigger Siri/black sc...,0
...,...,...
5712,./data/CP map CP/CP map CP_95_110_0.mp4,0
5713,./data/CP map CP/CP map CP_225_240_0.mp4,0
5714,./data/CP map CP/CP map CP_105_120_0.mp4,0
5715,./data/CP map CP/CP map CP_310_325_0.mp4,0


In [12]:
for i in index:
    df2.drop(index=i,axis=0,inplace=True)

In [13]:
df2 = df2.append(df[df['label'] == '1'])
df2


  df2 = df2.append(df[df['label'] == '1'])


Unnamed: 0,path,label
1,./data/black screen when trigger Siri/black sc...,0
26,./data/black screen when trigger Siri/black sc...,0
27,./data/black screen when trigger Siri/black sc...,0
52,./data/black screen when trigger Siri/black sc...,0
54,./data/black screen when trigger Siri/black sc...,0
...,...,...
6220,"./data/press phone, white screen/press phone, ...",1
6221,"./data/press phone, white screen/press phone, ...",1
6250,./data/CP map CP/CP map CP_210_225_1.mp4,1
6265,./data/CP map CP/CP map CP_200_215_1.mp4,1


In [14]:
df2.drop(columns='label',axis=1,inplace=True)

In [15]:
lst = list(df2['path'])

In [16]:
lst

['./data/black screen when trigger Siri/black screen when trigger Siri_180_195_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1285_1300_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_190_205_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1225_1240_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_720_735_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1260_1275_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_985_1000_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_230_245_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_375_390_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_1160_1175_0.mp4',
 './data/black screen when trigger Siri/black screen when trigger Siri_195_210_0.mp4',
 './data/black screen when trigger

In [17]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
            super().__init__()
            self.dataset = dataset
            self.targets = torch.FloatTensor([int(data.split('.')[-2][-1]) for data in self.dataset])
        
            mean = [0.485, 0.456, 0.406]
            std = [0.229, 0.224, 0.225]
            resize_to = (64, 64)
            num_frames = 16
        
            self.transform =  ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [
                        Lambda(lambda x: x/255.0),
                        Normalize(mean, std),
                        Resize(resize_to,antialias=True)
                    ]
                ),
            )
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        video_path = self.dataset[idx]
        label = self.targets[idx]

        # video_data, _, _ = read_video(video_path, output_format="TCHW")
        video = EncodedVideo.from_path(video_path, decode_audio=False)
        video_data = video.get_clip(start_sec=0,end_sec=0.99)
        video_data = self.transform(video_data)
        inputs = video_data["video"]
    
        return inputs, label

In [18]:
dataset = CustomDataset(dataset=lst)

In [19]:
train_size = int(0.8 * len(dataset))
val_size = int((len(dataset) - train_size) / 2)
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [20]:
batch_size = 64
train_loader = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,num_workers=12)
val_loader = DataLoader(dataset=val_dataset,batch_size=batch_size,shuffle=False,num_workers=12)
test_loader = DataLoader(dataset=test_dataset,batch_size=batch_size,shuffle=False,num_workers=12)

In [28]:
class Attention(nn.Module):
    def __init__(self, size):
        super(Attention, self).__init__()
        self.hidden_size = int(size/2)
        self.conv1=nn.Conv3d(size, self.hidden_size, kernel_size=1,stride=1)
        self.conv2=nn.Conv3d(size, self.hidden_size, kernel_size=1,stride=1)
        self.conv3=nn.Conv3d(size, self.hidden_size, kernel_size=1,stride=1)
        self.conv4=nn.Conv3d(self.hidden_size, size, kernel_size=1,stride=1)

    def forward(self,x):
        theta = torch.permute(self.conv1(x).squeeze(4).squeeze(3), (0, 2, 1))
        phi = self.conv2(x).squeeze(4).squeeze(3)
        g = torch.permute(self.conv3(x).squeeze(4).squeeze(3), (0, 2, 1))
        
        attn_scores = theta @ phi #THWxC @ CxTHW = THWxTHW
        weighted_scores = F.softmax(attn_scores @ g) # THWxTHW @ THWxC = THWxC -> softmax
        weighted_scores = self.conv4(torch.permute(weighted_scores, (0, 2, 1)).unsqueeze(3).unsqueeze(4)) # -> CTHW -> conv
        sum = x + weighted_scores # skip connection
        return sum

In [29]:
class Attn3DCNN(nn.Module):
    def __init__(self):
        super(Attn3DCNN, self).__init__()
        self.conv1=nn.Conv3d(3, 32, kernel_size=3, stride=(1, 2, 2),
                               padding=(0,0,0), bias=True)
        self.BN1=nn.BatchNorm3d(32)
        self.pool1=nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1,2,2), padding=0)
        self.conv2=nn.Conv3d(32, 64, kernel_size=3, stride=(1, 2, 2),
                               padding=(0,0,0), bias=True)
        self.BN2=nn.BatchNorm3d(64)
        self.pool2=nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1,2,2), padding=0)
        self.conv3=nn.Conv3d(64, 256, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.BN3=nn.BatchNorm3d(256)
        self.conv4=nn.Conv3d(256, 512, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.BN4=nn.BatchNorm3d(512)
        self.conv5=nn.Conv3d(512, 256, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.BN5=nn.BatchNorm3d(256)
        self.pool3=nn.AvgPool3d(kernel_size=(3,3,3), stride=(1,2,2), padding=0)
        self.attn1=Attention(256)
        self.attn2=Attention(256)
        self.attn3=Attention(256)
        self.fc=nn.Linear(1536, 1)
    
    def forward(self,x):
        x=F.leaky_relu(self.conv1(x))
        x=self.pool1(self.BN1(x))
        x=F.leaky_relu(self.conv2(x))
        x=self.pool2(self.BN2(x))
        x=self.BN3(F.leaky_relu(self.conv3(x)))
        x=self.BN4(F.leaky_relu(self.conv4(x)))
        x=self.BN5(F.leaky_relu(self.conv5(x)))
        x=self.pool3(x)
        x=self.attn1(x)
        x=self.attn2(x)
        x=self.attn3(x)
        x=F.dropout(x.view(x.size(0),-1),p=0.3,inplace=True)
        x=self.fc(x)
        return F.sigmoid(x)

In [30]:
class LitAttn3DCNN(L.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = nn.BCELoss()
        self.accuracy = torchmetrics.Accuracy(task='binary')
        self.prec = BinaryPrecision()
        self.rec = BinaryRecall()
        self.save_hyperparameters()
        
    def forward(self,x):
        return self.model(x)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.001)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=50,verbose=True)
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler, "monitor": "val/loss"}

    def training_step(self, train_batch, batch_idx):
        X_train, y_train = train_batch
        y_train = y_train.unsqueeze(-1)
        pred = self.model(X_train)
        loss = self.loss(pred, y_train)
        
        self.log('train/loss', loss, on_epoch=True, on_step=False)
        self.log('train/acc', self.accuracy(pred,y_train), on_epoch=True, on_step=False)
        return loss

    def validation_step(self, val_batch, batch_idx):
        X_val, y_val = val_batch
        y_val = y_val.unsqueeze(-1)
        pred = self.model(X_val)
        # predicted = torch.round(pred)
        # correct += (predicted == y_test.unsqueeze(dim=-1)).sum()
        loss = self.loss(pred, y_val)
        # self.log('precision', self.prec(pred, y_val).item(), on_epoch=True, on_step=False)
        # self.log('recall', self.rec(pred, y_val).item(), on_epoch=True, on_step=False)
        self.log('val/loss', loss)
        self.log('val/acc', self.accuracy(pred, y_val))

In [31]:
model = LitAttn3DCNN(Attn3DCNN())
print(summary(model, input_size=(64, 3, 16, 64, 64)))

  rank_zero_warn(


Layer (type:depth-idx)                   Output Shape              Param #
LitAttn3DCNN                             [64, 1]                   --
├─Attn3DCNN: 1-1                         [64, 1]                   --
│    └─Conv3d: 2-1                       [64, 32, 14, 31, 31]      2,624
│    └─BatchNorm3d: 2-2                  [64, 32, 14, 31, 31]      64
│    └─MaxPool3d: 2-3                    [64, 32, 12, 15, 15]      --
│    └─Conv3d: 2-4                       [64, 64, 10, 7, 7]        55,360
│    └─BatchNorm3d: 2-5                  [64, 64, 10, 7, 7]        128
│    └─MaxPool3d: 2-6                    [64, 64, 8, 3, 3]         --
│    └─Conv3d: 2-7                       [64, 256, 8, 3, 3]        442,624
│    └─BatchNorm3d: 2-8                  [64, 256, 8, 3, 3]        512
│    └─Conv3d: 2-9                       [64, 512, 8, 3, 3]        3,539,456
│    └─BatchNorm3d: 2-10                 [64, 512, 8, 3, 3]        1,024
│    └─Conv3d: 2-11                      [64, 256, 8, 3, 3]  

  weighted_scores = F.softmax(attn_scores @ g) # THWxTHW @ THWxC = THWxC -> softmax


In [32]:
early_stopper = L.callbacks.EarlyStopping(monitor='val/loss', mode='min', patience=15, min_delta=0.01, verbose=True)
wandb_logger.watch(model, log="all", log_freq=30)
checkpoint_callback = L.callbacks.ModelCheckpoint(
    dirpath="./3DCNN_Attn_3-11/",
    filename="{epoch:02d}-{val_loss:.2f}",
)
trainer = L.Trainer(
    logger=wandb_logger,
    callbacks=([early_stopper,checkpoint_callback]),
    max_epochs=100,
    default_root_dir='./3DCNN_Attn_3-11/'
)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model=model,train_dataloaders=train_loader,val_dataloaders=val_loader)
trainer.save_checkpoint("./3DCNN_Attn_3-11/final.ckpt")

  rank_zero_warn(f"attribute '{k}' removed from hparams because it cannot be pickled")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type            | Params
---------------------------------------------
0 | model    | Attn3DCNN       | 8.0 M 
1 | loss     | BCELoss         | 0     
2 | accuracy | BinaryAccuracy  | 0     
3 | prec     | BinaryPrecision | 0     
4 | rec      | BinaryRecall    | 0     
---------------------------------------------
8.0 M     Trainable params
0         Non-trainable params
8.0 M     Total params
31.913    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 1.0000e-03.


Sanity Checking: 0it [00:00, ?it/s]

  weighted_scores = F.softmax(attn_scores @ g) # THWxTHW @ THWxC = THWxC -> softmax
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Adjusting learning rate of group 0 to 9.9901e-04.


Validation: 0it [00:00, ?it/s]

Metric val/loss improved. New best score: 1.391


Adjusting learning rate of group 0 to 9.9606e-04.


Validation: 0it [00:00, ?it/s]

Metric val/loss improved by 0.671 >= min_delta = 0.01. New best score: 0.721


In [None]:
model.eval()
model.to(device='cuda')
pred = []
gt = []
with torch.no_grad():
    correct = 0
    for X_test, y_test in test_loader:
        X_test = X_test.to(device='cuda')
        y_test = y_test.to(device='cuda')
        y_val = model(X_test)
        predicted = torch.round(y_val)
        correct += (predicted == y_test.unsqueeze(dim=-1)).sum()
        pred = np.concatenate((pred, predicted.to('cpu').numpy()),axis=None)
        gt = np.concatenate((gt, y_test.to('cpu').numpy()),axis=None)
print(f'Test accuracy: {correct.item()}/{len(test_dataset)} = {correct.item()*100/len(test_dataset):7.3f}%')
print(classification_report(pred, gt))