In [17]:
import os
import gc
import time
import math
import torch
import hashlib
import numpy as np
import pandas as pd
import torch.nn as nn
from pathlib import Path
from itertools import product
import matplotlib.pyplot as plt
import torch.nn.functional as F

from PIL import Image
from tqdm import tqdm
from torchvision import models
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.transforms import v2 as transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [18]:
# Fix the random seeds so stochastic steps are repeatable between runs.
# The same SEED value is given to NumPy's global RNG and to PyTorch's default generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1a8fdfe1950>

In [24]:
# Set directories and file paths.
# Both locations can be overridden with an environment variable so the notebook is
# not tied to one machine's drive layout; when unset, the original paths are used.
DATASET_DIR = Path(os.environ.get("EXONET_DATASET_DIR", "F:/ExoNet_Images/ExoNet_Images"))
DATAFRAMES_DIR = Path(os.environ.get("EXONET_DATAFRAMES_DIR", "F:/Datasets/Tesis/"))

In [26]:
# Read the pickled test-split DataFrame and print a summary of its columns.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
test_df = pd.read_pickle(DATAFRAMES_DIR / "test_df.pkl")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108275 entries, 0 to 108274
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   108275 non-null  object
 1   frame   108275 non-null  int64 
 2   class   108275 non-null  object
 3   exist   108275 non-null  bool  
 4   path    108275 non-null  object
dtypes: bool(1), int64(1), object(3)
memory usage: 3.4+ MB


Usando modelos fundacionales (foundational models) para evaluar qué tan bien funcionan para clasificar los frames de los videos.

In [9]:
from dataset import ExoNetDataset

# Transforms
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224
# Per-channel normalisation statistics (one value per image channel).
# NOTE(review): presumably the ImageNet statistics matching the IMAGENET1K
# backbone weights used in this notebook — confirm.
MEAN_DATASET = [0.485, 0.456, 0.406]
STD_DATASET = [0.229, 0.224, 0.225]

# Encode the class-name strings found in test_df as integer ids.
# NOTE(review): the encoder is fitted on the test labels only; verify that this
# yields the same mapping that was used when the models were trained.
unique_labels = test_df['class'].unique()
encoder = LabelEncoder()
encoder.fit(unique_labels)

test_image_transform = transforms.Compose([
    # Resize takes (height, width); the original passed (width, height), which
    # only worked because both dimensions are equal.
    transforms.Resize((IMAGE_HEIGHT, IMAGE_WIDTH)),
    transforms.ToImage(),
    # Convert to float32 and rescale the value range before normalising.
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize(
        mean=MEAN_DATASET,
        std=STD_DATASET
    )
])

In [13]:
# Build the test dataset; the printed summary reports how many chunks of
# SEQUENCE_LENGTH frames it contains.
SEQUENCE_LENGTH = 1
test_dataset = ExoNetDataset(
    df_labels=test_df,
    img_dir=DATASET_DIR,
    seq_len=SEQUENCE_LENGTH,
    transform=test_image_transform,
    target_transform=encoder.transform,
)
print(f"[Test] Total video chunks of {SEQUENCE_LENGTH} frames: {len(test_dataset)}")

# Fetch the first sample to check the tensor shapes the dataset produces.
frames_tensor, labels_tensor = test_dataset[0]
print(f"Frames shape: {frames_tensor.shape}. Labels shape: {labels_tensor.shape}")

[Test] Total video chunks of 1 frames: 108275
Frames shape: torch.Size([1, 3, 224, 224]). Labels shape: torch.Size([1])


In [27]:
class CNN_LSTM(nn.Module):
    """Frozen CNN feature extractor followed by an LSTM and a linear classifier.

    Every frame of the input clip is encoded by ``feature_extractor`` under
    ``torch.no_grad()`` (so the extractor receives no gradients), the per-frame
    feature vectors are fed to the LSTM, and the final hidden state of the last
    LSTM layer is mapped to class logits.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        feature_extractor: Module applied to each ``(B, C, H, W)`` frame batch.
        feature_extractor_dim: Per-frame feature size expected by the LSTM.
        device: Device the feature extractor is moved to. The LSTM and the
            classifier are not moved here; the caller must call ``.to(...)``.
        name: Identifier stored on the instance.
    """

    def __init__(
        self,
        hidden_dim,
        num_classes,
        num_layers,
        feature_extractor,
        feature_extractor_dim,
        device,
        name
    ):
        super().__init__()
        self.name = name
        self.feature_extractor = feature_extractor.to(device=device)
        # The extractor is never trained, so keep it in inference mode.
        self.feature_extractor.eval()
        self.lstm = nn.LSTM(
            input_size=feature_extractor_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.classifier = nn.Linear(
            hidden_dim,
            num_classes
        )

    def train(self, mode=True):
        """Switch the LSTM/classifier mode but keep the frozen extractor in eval mode.

        Without this, ``model.train()`` would re-enable batch-norm statistic
        updates and stochastic layers inside an extractor that gets no gradients.
        """
        super().train(mode)
        self.feature_extractor.eval()
        return self

    def forward(self, x, h=None, c=None):
        """Classify a clip.

        Args:
            x: Tensor of shape ``(B, T, C, H, W)``.
            h: Optional initial LSTM hidden state; must be given together with ``c``.
            c: Optional initial LSTM cell state; must be given together with ``h``.

        Returns:
            Tuple ``(logits, h, c)`` where ``logits`` is ``(B, num_classes)`` and
            ``h``/``c`` are the final LSTM states.

        Raises:
            ValueError: If only one of ``h`` and ``c`` is provided.
        """
        if (h is None) != (c is None):
            raise ValueError("h and c must be provided together")

        # x: (B, T, C, H, W)
        B, T, _, _, _ = x.shape
        features = []
        with torch.no_grad():
            for t in range(T):
                f = self.feature_extractor(x[:, t])
                # A spatial map (B, C_feat, H', W') whose flattened size does not
                # match the LSTM input is reduced by global average pooling;
                # otherwise the output is flattened as-is.
                if f.dim() == 4 and f.shape[1:].numel() != self.lstm.input_size:
                    f = f.mean(dim=(-2, -1))
                features.append(f.reshape(B, -1))

        features = torch.stack(features, dim=1)  # (B, T, feature_dim)

        # `if h:` would raise on a multi-element tensor, so test against None.
        state = None if h is None else (h, c)
        _, (h, c) = self.lstm(features, state)

        # h[-1] is the final hidden state of the last LSTM layer: (B, hidden_dim)
        out = self.classifier(h[-1])

        return out, h, c

In [28]:
# EfficientNet-B0 backbone loaded with the IMAGENET1K_V1 weights.
# NOTE(review): 1280 is presumably the channel count of the backbone's final
# feature map — confirm against the model definition.
# NOTE(review): this cell uses an `_lstm` suffix while the MobileNetV2 and VGG16
# backbones use `_convLSTM`; the naming is inconsistent.
efficientnet_lstm = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficientnet_lstm_dim = 1280
efficientnet_name = "EfficientNetB0_lstm"

In [31]:
# MobileNetV2 backbone loaded with the IMAGENET1K_V1 weights.
# NOTE(review): 1280 is presumably the channel count of the backbone's final
# feature map — confirm against the model definition.
mobnet_convLSTM = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
mobnet_convLSTM_dim = 1280
mobnet_name = "MobileNetV2_lstm"

In [32]:
# VGG16 backbone loaded with the IMAGENET1K_V1 weights.
# NOTE(review): 512 is presumably the channel count of the backbone's final
# feature map — confirm against the model definition.
vgg16_convLSTM = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16_convLSTM_dim = 512
vgg16_name = "VGG16_lstm"

In [None]:
# define params grid
batch_sizes = [ 32 ]
device = "cuda"
lr = 1e-3
params = {
    "batches": batch_sizes,
    "epochs": [ 60 ],
    "layers": [ 1 ],
    "hidden_state_dim": [ 256 ]
}
model_names = ['MobileNetV2_lstm', 'EfficientNet_lstm', 'VGG16_lstm']

In [None]:
# Grid search: train one CNN_LSTM per (backbone, batch size, epochs, layers, hidden size)
# combination and keep, per backbone, the hyper-parameters with the lowest validation loss.
#
# NOTE(review): seq_train_df, seq_val_df, train_image_transform, val_image_transform,
# train_loss_weights and train_model are not defined in the cells above this one;
# they must exist in the kernel before this cell is run.
model_params = list(product(model_names, *params.values()))
best_results = {
    model_name: {
        'val_loss': float('inf'),
        'params': None
    }
    for model_name in model_names
}
checkpoints_prefixes = dict()
checkpoints_dir = "/mnt/f/Checkpoints/Tesis"

# Pretrained backbone and per-frame feature size for every supported model name.
# An unknown name raises KeyError instead of silently reusing a stale `model`.
backbones = {
    'EfficientNet_lstm': (efficientnet_lstm, efficientnet_lstm_dim),
    'MobileNetV2_lstm': (mobnet_convLSTM, mobnet_convLSTM_dim),
    'VGG16_lstm': (vgg16_convLSTM, vgg16_convLSTM_dim),
}

# The datasets do not depend on the hyper-parameters, so build them once.
train_dataset = ExoNetDataset(
    df_labels=seq_train_df,
    img_dir=DATASET_DIR,
    seq_len=SEQUENCE_LENGTH,
    target_transform=encoder.transform,
    transform=train_image_transform
)
val_dataset = ExoNetDataset(
    df_labels=seq_val_df,
    img_dir=DATASET_DIR,
    seq_len=SEQUENCE_LENGTH,
    target_transform=encoder.transform,
    transform=val_image_transform
)

for i, (model_name, *combo_params) in enumerate(model_params, start=1):
    # Positional layout follows the key order of `params`.
    batch_size, epochs, num_layers, hidden_dim = combo_params

    backbone, feature_dim = backbones[model_name]
    # Pool the convolutional feature map to one vector per frame so its size
    # equals feature_dim, which is what the LSTM expects as input.
    feature_extractor = nn.Sequential(
        backbone.features,
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten()
    )
    model = CNN_LSTM(
        hidden_dim=hidden_dim,
        num_classes=len(unique_labels),
        num_layers=num_layers,
        feature_extractor=feature_extractor,
        feature_extractor_dim=feature_dim,
        device=device,
        name=model_name
    ).to(device=device)

    criterion = nn.CrossEntropyLoss(weight=train_loss_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Built-in hash() of a str is salted per interpreter process, so it cannot
    # identify a checkpoint across kernel restarts; use a stable digest instead.
    checkpoint_prefix = hashlib.sha1(str(combo_params).encode("utf-8")).hexdigest()[:16]
    checkpoints_prefixes[f'{checkpoint_prefix}'] = combo_params
    torch.save(checkpoints_prefixes, f"{checkpoints_dir}/checkpoint_prefixes.pth")
    checkpoint_path = f"{checkpoints_dir}/{model_name}_{checkpoint_prefix}.pth"
    checkpoint_path_best = f"{checkpoints_dir}/{model_name}_{checkpoint_prefix}_best.pth"

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )

    model, history, val_loss = train_model(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        checkpoint_path=checkpoint_path,
        checkpoint_path_best=checkpoint_path_best,
        epochs=epochs,
        device=device,
        n_processes=len(model_params),
        i_process=i
    )
    if val_loss < best_results[model_name]['val_loss']:
        best_results[model_name]['val_loss'] = val_loss
        best_results[model_name]['params'] = combo_params

    # Persist after every combination so an interrupted search keeps its results.
    torch.save(best_results, f"{checkpoints_dir}/best_results.pth")

In [30]:
def train_model(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    checkpoint_path,
    checkpoint_path_best,
    epochs=10,
    device="cuda",
    n_processes=1,
    i_process=1
):
    """Train ``model`` and return it with the best-validation weights restored.

    Each epoch runs one pass over ``train_dataloader`` (with gradient-norm
    clipping at 1.0), saves a checkpoint to ``checkpoint_path``, then evaluates
    on ``val_dataloader``. Whenever the validation loss improves, a checkpoint
    is written to ``checkpoint_path_best`` and a copy of the weights is kept.

    Args:
        model: Module whose forward returns a 3-tuple with the logits first.
        train_dataloader: Iterable of ``(inputs, labels)`` training batches.
        val_dataloader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)`` labels.
        optimizer: Optimizer updating ``model``'s parameters.
        checkpoint_path: Destination of the per-epoch checkpoint.
        checkpoint_path_best: Destination of the best-so-far checkpoint.
        epochs: Number of epochs to run.
        device: Device the batches are moved to.
        n_processes: Total number of runs, shown in the progress bar only.
        i_process: Index of this run, shown in the progress bar only.

    Returns:
        Tuple ``(model, history, best_val_loss)``. ``history`` maps
        ``train_loss``/``val_loss``/``train_acc``/``val_acc`` to per-epoch lists;
        losses are means per sample.
    """
    # Local import: `copy` is only needed to snapshot the best weights.
    import copy

    best_val_loss = float("inf")
    best_weights = None
    history = {
        "train_loss": [],
        "val_loss": [],
        "train_acc": [],
        "val_acc": []
    }

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        seen = 0  # number of samples processed so far in this epoch

        loop = tqdm(train_dataloader, total=len(train_dataloader), leave=True)
        loop.set_description(f"Epoch [{epoch}/{epochs}] - Proc: {i_process}/{n_processes}")

        for xb, yb in loop:
            xb = xb.to(device)
            yb = yb.to(device).long()

            optimizer.zero_grad() # zero the parameter gradients
            logits, _, _ = model(xb)
            loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # gradient clipping
            optimizer.step() # update weights

            # The loss is accumulated weighted by batch size, so it must be
            # normalised by the sample count, not by the number of batches.
            running_loss += loss.item() * xb.size(0)
            seen += xb.size(0)

            # Flatten both sides: comparing (B,) predictions with (B, 1) labels
            # would broadcast to a (B, B) matrix and inflate the hit count.
            preds = torch.argmax(logits, dim=-1)
            correct += (preds.reshape(-1) == yb.reshape(-1)).sum().item()
            total += yb.numel()
            loop.set_postfix(loss=(running_loss / seen), acc=f"{(correct/total):.4f}")

        epoch_loss = running_loss / max(seen, 1)
        epoch_acc = correct / max(total, 1)
        history["train_loss"].append(epoch_loss)
        history["train_acc"].append(epoch_acc)

        # save_state is not defined in this cell; it must be available in the kernel.
        save_state(checkpoint_path, model, optimizer, epoch, history)

        # validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        val_seen = 0
        with torch.no_grad():
            val_loop = tqdm(val_dataloader, total=len(val_dataloader), leave=True)
            val_loop.set_description("Validation")

            for xb, yb in val_loop:
                xb = xb.to(device)
                yb = yb.to(device).long()

                logits, _, _ = model(xb)
                loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))
                val_loss += loss.item() * xb.size(0)
                val_seen += xb.size(0)

                preds = torch.argmax(logits, dim=-1)
                val_correct += (preds.reshape(-1) == yb.reshape(-1)).sum().item()
                val_total += yb.numel()

                val_loop.set_postfix(loss=val_loss / val_seen, acc=f"{(val_correct/val_total):.4f}")

        val_loss = val_loss / max(val_seen, 1)
        val_acc = val_correct / max(val_total, 1)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        if val_loss < best_val_loss:
            save_state(checkpoint_path_best, model, optimizer, epoch, history)
            best_val_loss = val_loss
            # state_dict() returns references to the live tensors; deep-copy them
            # so later epochs do not overwrite the snapshot.
            best_weights = copy.deepcopy(model.state_dict())

    # No snapshot exists when epochs == 0 or the validation loss never improved
    # (e.g. it was NaN); in that case keep the current weights.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model, history, best_val_loss

In [24]:
# Shape check for nn.LSTM with batch_first=True: feed a random batch and print
# the shapes of the per-step output and of the final hidden and cell states.
x = torch.randn(10, 512, 32)
print(x.shape)
lstm = nn.LSTM(input_size=x.size(-1), hidden_size=224, num_layers=2, batch_first=True)
output, (h, c) = lstm(x)
print(output.shape, h.shape, c.shape)

torch.Size([10, 512, 32])
torch.Size([10, 512, 224]) torch.Size([2, 10, 224]) torch.Size([2, 10, 224])


In [None]:
def test_cnn_model(
    model,
    test_dataloader,
    criterion,
    device="cuda",
    history=None
):
    """Evaluate ``model`` on ``test_dataloader`` and report loss and accuracy.

    Args:
        model: Module whose forward returns a 3-tuple with the logits first.
        test_dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)`` labels.
        device: Device the batches are moved to.
        history: Optional dict; when given, the results are appended to its
            ``"test_loss"`` and ``"test_acc"`` lists (created if missing).

    Returns:
        Tuple ``(test_loss, test_acc)`` with the mean loss per sample and the
        fraction of correctly predicted labels.
    """
    model.eval()
    test_loss = 0.0
    test_correct = 0
    test_total = 0
    test_seen = 0  # number of samples processed so far
    with torch.no_grad():
        test_loop = tqdm(test_dataloader, total=len(test_dataloader), leave=True)
        test_loop.set_description("Test")

        for xb, yb in test_loop:
            xb = xb.to(device)
            yb = yb.to(device).long()

            logits, _, _ = model(xb)
            loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))
            # Accumulated per sample, so it is normalised by the sample count below.
            test_loss += loss.item() * xb.size(0)
            test_seen += xb.size(0)

            # Flatten both sides: comparing (B,) predictions with (B, 1) labels
            # would broadcast to a (B, B) matrix and inflate the hit count.
            preds = torch.argmax(logits, dim=-1)
            test_correct += (preds.reshape(-1) == yb.reshape(-1)).sum().item()
            test_total += yb.numel()

            test_loop.set_postfix(loss=test_loss / test_seen, acc=f"{(test_correct/test_total):.4f}")

    test_loss = test_loss / max(test_seen, 1)
    test_acc = test_correct / max(test_total, 1)

    # The original appended to a `history` name that is not defined in this
    # function; it is now an optional argument and the metrics are returned.
    if history is not None:
        history.setdefault("test_loss", []).append(test_loss)
        history.setdefault("test_acc", []).append(test_acc)

    return test_loss, test_acc