In [1]:
from utils import (
    show_images_in_grid,
    get_weight_vector,
    load_state,
    save_state,
    train_model
)

from dataset import ExoNetDatasetCnn, ExoNetDatasetV2

import os
import gc
import time
import math
import torch
import hashlib
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F


from PIL import Image
from tqdm import tqdm
from itertools import product
from torchvision import models
from torch.utils.data import DataLoader
from torchvision.io import read_image
from torchvision.transforms import v2 as transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Define seed for random
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x74df877c7750>

# Read Datasets

In [3]:
# Set directories and file paths
DATASET_DIR = "/mnt/f/ExoNet_Images/ExoNet_Images"
DATAFRAMES_DIR = "/mnt/f/Datasets/Tesis/"

In [4]:
seq_train_df = pd.read_pickle(os.path.join(DATAFRAMES_DIR, "seq_train_df.pkl"))
seq_val_df = pd.read_pickle(os.path.join(DATAFRAMES_DIR, "seq_val_df.pkl"))
test_df = pd.read_pickle(os.path.join(DATAFRAMES_DIR, "test_df.pkl"))

In [5]:
seq_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639121 entries, 0 to 639120
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   video     639121 non-null  object
 1   frame     639121 non-null  int64 
 2   class     639121 non-null  object
 3   path      639121 non-null  object
 4   sequence  639121 non-null  object
dtypes: int64(1), object(4)
memory usage: 24.4+ MB


# Create transforms

In [6]:
# Transforms
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224
MEAN_DATASET = [0.485, 0.456, 0.406]
STD_DATASET = [0.229, 0.224, 0.225]

unique_labels = seq_train_df['class'].unique()
encoder = LabelEncoder()
encoder.fit(unique_labels)

train_image_transform = transforms.Compose([
    transforms.Resize((IMAGE_WIDTH, IMAGE_HEIGHT)),
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize(
        mean=MEAN_DATASET,
        std=STD_DATASET
    ),
    # transforms.ColorJitter(
    #     brightness=0.2,
    #     contrast=0.2,
    #     saturation=0.2,
    #     hue=0.05
    # ),
])

val_image_transform = transforms.Compose([
    transforms.Resize((IMAGE_WIDTH, IMAGE_HEIGHT)),
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize(
        mean=MEAN_DATASET,
        std=STD_DATASET
    )
])

test_image_transform = transforms.Compose([
    transforms.Resize((IMAGE_WIDTH, IMAGE_HEIGHT)),
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize(
        mean=MEAN_DATASET,
        std=STD_DATASET
    )
])

# Create dataset objects for CNN training

In [7]:
# Datasets objects
# Sequence length is 15. This value was used to generate sequence of images.
# If the secuence value is modified, the dataset pre-processing must be modified also.
# SEQUENCE_LENGTH = 15
training_dataset_cnn = ExoNetDataset(
    df_labels=seq_train_df,
    seq_len=SEQUENCE_LENGTH,
    target_transform=encoder.transform,
    transform=train_image_transform
)
validation_dataset_cnn = ExoNetDataset(
    df_labels=seq_val_df,
    seq_len=SEQUENCE_LENGTH,
    target_transform=encoder.transform,
    transform=val_image_transform
)
test_dataset = ExoNetDataset(
    df_labels=seq_val_df,
    seq_len=1,
    target_transform=encoder.transform,
    transform=test_image_transform
)

print(f"[Training] Total video chunks of {SEQUENCE_LENGTH} frames: {len(training_dataset)}")
print(f"[Validation] Total video chunks of {SEQUENCE_LENGTH} frames: {len(validation_dataset)}")
print(f"[Test] Total video chunks of {1} frames: {len(test_dataset)}")

frames_tensor, labels_tensor = training_dataset[0]
print(f"Frames shape: {frames_tensor.shape}. Labels shape: {labels_tensor.shape}")


NameError: name 'ExoNetDataset' is not defined

# Train CNN feature extractors

In [7]:
# Checking if encoder.classes_ and classes obtained after applying count method to df are in the same order
count_class_train = seq_train_df.groupby(['class'])['frame'].count()
classes_from_df = list(count_class_train.to_dict().keys())
print("Are classes equal? Answer:", str(classes_from_df).replace(',','') == str(encoder.classes_).replace('\n',''))
print(count_class_train.to_dict())
train_loss_weights = get_weight_vector(count_class_train.to_dict())

Are classes equal? Answer: True
{'DS-T-LG': 20240, 'DW-S': 24272, 'DW-T-O': 12733, 'IS-S': 12754, 'IS-T-DW': 21516, 'IS-T-LG': 7696, 'LG-S': 65353, 'LG-T-DS': 16212, 'LG-T-DW': 252116, 'LG-T-IS': 18782, 'LG-T-O': 101762, 'LG-T-SE': 85685}




In [8]:
count_class_val = seq_val_df.groupby(['class'])['frame'].count()
print(count_class_val)

class
DS-T-LG     4847
DW-S        6834
DW-T-O      4436
IS-S        2670
IS-T-DW     5688
IS-T-LG     1756
LG-S        9118
LG-T-DS     3342
LG-T-DW    80693
LG-T-IS     3976
LG-T-O     29926
LG-T-SE    21400
Name: frame, dtype: int64


In [9]:
count_class_test = test_df.groupby(['class'])['frame'].count()
print(count_class_test)

class
DS-T-LG     3590
DW-S        5601
DW-T-O      1923
IS-S        1933
IS-T-DW     4424
IS-T-LG     1588
LG-S        3014
LG-T-DS     3047
LG-T-DW    46280
LG-T-IS     3298
LG-T-O     21322
LG-T-SE    12255
Name: frame, dtype: int64


In [10]:
def setup_cnn_base(
    model,
    output_dropout,
    device,
    num_classes,
    in_features,
    finetuning=False,
    vgg=False
):
    # Freeze convolutional base
    if not finetuning:
        for param in model.parameters():
            param.requires_grad = False
    
    # Replace final classification head
    if not vgg:
        model.classifier = nn.Sequential(
            nn.Linear(in_features, 2048),
            nn.ReLU(),
            nn.Dropout(output_dropout),
            nn.Linear(2048, num_classes)
        )
    else:
        model.classifier[6] = nn.Linear(in_features, num_classes)
    
    model = model.to(device)
    
    return model

In [11]:
# efficient_net
efficientnet_base = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficientnet_base = setup_cnn_base(
    model=efficientnet_base,
    output_dropout=0.5,
    device=torch.accelerator.current_accelerator(),
    num_classes=len(unique_labels),
    in_features=efficientnet_base.classifier[1].in_features,
    finetuning=True
)

# mobilenet_v2
mobnet_base = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V2)
mobnet_base = setup_cnn_base(
    model=mobnet_base,
    output_dropout=0.5,
    device=torch.accelerator.current_accelerator(),
    num_classes=len(unique_labels),
    in_features=mobnet_base.classifier[1].in_features,
    finetuning=True
)
# VGG16
vgg16_base = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16_base = setup_cnn_base(
    model=vgg16_base,
    output_dropout=0.5,
    device=torch.accelerator.current_accelerator(),
    num_classes=len(unique_labels),
    in_features=vgg16_base.classifier[6].in_features,
    finetuning=True,
    vgg=True
)

In [12]:
# Train MobileNetV2 model
mobnet_train_dataset = ExoNetDatasetCnn(
    df_labels=seq_train_df,
    transform=train_image_transform,
    target_transform=encoder.transform
)
mobnet_val_dataset = ExoNetDatasetCnn(
    df_labels=seq_val_df,
    transform=val_image_transform,
    target_transform=encoder.transform
)
mobnet_train_dataloader = DataLoader(
    mobnet_train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    prefetch_factor=6
)
mobnet_val_dataloader = DataLoader(
    mobnet_val_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    prefetch_factor=12
)

lr = 1e-4
mobnet_criterion = nn.CrossEntropyLoss(weight=train_loss_weights)
mobnet_optimizer = torch.optim.SGD(mobnet_base.parameters(), lr=lr, momentum=0.9)
mobnet_checkpoint_path = f"/mnt/f/Checkpoints/Tesis/cnn/mobnet_base.pth"
mobnet_checkpoint_path_best = f"/mnt/f/Checkpoints/Tesis/cnn/mobnet_base_best.pth"

mobnet_checkpoint = load_state(mobnet_checkpoint_path, device=torch.accelerator.current_accelerator())
current_epoch = 1
current_history = None
if mobnet_checkpoint:
    mobnet_base.load_state_dict(mobnet_checkpoint['model_state_dict'])
    mobnet_optimizer.load_state_dict(mobnet_checkpoint['optimizer_state_dict'])
    current_epoch = mobnet_checkpoint['current_epoch']
    current_history = mobnet_checkpoint['history']


mobnet_model, mobnet_history, mobnet_val_loss = train_model(
    model=mobnet_base,
    train_dataloader=mobnet_train_dataloader,
    val_dataloader=mobnet_val_dataloader,
    criterion=mobnet_criterion,
    optimizer=mobnet_optimizer,
    checkpoint_path=mobnet_checkpoint_path,
    checkpoint_path_best=mobnet_checkpoint_path_best,
    epochs=30,
    device=torch.accelerator.current_accelerator(),
    debug=False,
    current_epoch=current_epoch,
    current_history=current_history
)

Epoch [30/30] - Proc: 1/1:   0%|                                                               | 0/4994 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [12]:
# Train EfficientNet model
efficient_train_dataset = ExoNetDatasetCnn(
    df_labels=seq_train_df,
    transform=train_image_transform,
    target_transform=encoder.transform
)
# frame, label = efficient_train_dataset[0]
# frame = frame.unsqueeze(0)
# label = label.unsqueeze(0)
# efficientnet_base.eval()
# logits = efficientnet_base(frame.to("cuda"))
# print(logits.shape, label.shape)
efficient_val_dataset = ExoNetDatasetCnn(
    df_labels=seq_val_df,
    transform=val_image_transform,
    target_transform=encoder.transform
)
efficient_train_dataloader = DataLoader(
    efficient_train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=7,
    pin_memory=True,
    prefetch_factor=4
)
efficient_val_dataloader = DataLoader(
    efficient_val_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    prefetch_factor=4
)
lr = 1e-4
efficient_criterion = nn.CrossEntropyLoss(weight=train_loss_weights)
efficient_optimizer = torch.optim.SGD(efficientnet_base.parameters(), lr=lr, momentum=0.9)
efficient_checkpoint_path = f"/mnt/f/Checkpoints/Tesis/cnn/efficient_net_b0_base.pth"
efficient_checkpoint_path_best = f"/mnt/f/Checkpoints/Tesis/cnn/efficient_net_b0_base_best.pth"

efficient_checkpoint = load_state(efficient_checkpoint_path, device=torch.accelerator.current_accelerator())
if checkpoint:
    efficientnet_base.load_state_dict(efficient_checkpoint['model_state_dict'])
    efficient_optimizer.load_state_dict(efficient_checkpoint['optimizer_state_dict'])
    current_epoch = efficient_checkpoint['current_epoch']
    current_history = efficient_checkpoint['history']

efficient_model, efficient_history, efficient_val_loss = train_model(
    model=efficientnet_base,
    train_dataloader=efficient_train_dataloader,
    val_dataloader=efficient_val_dataloader,
    criterion=efficient_criterion,
    optimizer=efficient_optimizer,
    checkpoint_path=efficient_checkpoint_path,
    checkpoint_path_best=efficient_checkpoint_path_best,
    epochs=30,
    device=torch.accelerator.current_accelerator(),
    debug=False,
    current_epoch=current_epoch,
    current_history=current_history
)

Epoch [6/30] - Proc: 1/1: 100%|█████████████████████████████| 4994/4994 [24:28<00:00,  3.40it/s, acc=0.6567, loss=0.885]
Validation: 100%|████████████████████████████████████████████| 1365/1365 [05:34<00:00,  4.08it/s, acc=0.5843, loss=1.49]
Epoch [7/30] - Proc: 1/1: 100%|█████████████████████████████| 4994/4994 [23:54<00:00,  3.48it/s, acc=0.6894, loss=0.726]
Validation: 100%|████████████████████████████████████████████| 1365/1365 [05:51<00:00,  3.88it/s, acc=0.5667, loss=1.59]
Epoch [8/30] - Proc: 1/1: 100%|█████████████████████████████| 4994/4994 [24:10<00:00,  3.44it/s, acc=0.7152, loss=0.619]
Validation: 100%|████████████████████████████████████████████| 1365/1365 [05:51<00:00,  3.88it/s, acc=0.6053, loss=1.77]
Epoch [9/30] - Proc: 1/1: 100%|█████████████████████████████| 4994/4994 [24:24<00:00,  3.41it/s, acc=0.7360, loss=0.537]
Validation: 100%|████████████████████████████████████████████| 1365/1365 [05:57<00:00,  3.82it/s, acc=0.5822, loss=1.87]
Epoch [10/30] - Proc: 1/1: 100%|

# CNN + LSTM

In [13]:
# load best models
best_efficientnet_params = load_state("/mnt/f/Checkpoints/Tesis/cnn/efficient_net_b0_base.pth", device="cuda")
best_mobilenet_params = load_state("/mnt/f/Checkpoints/Tesis/cnn/mobnet_base_best.pth", device="cuda")

efficientnet_best_model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficientnet_best_model = setup_cnn_base(
    model=efficientnet_best_model,
    output_dropout=0.5,
    device=torch.accelerator.current_accelerator(),
    num_classes=len(unique_labels),
    in_features=efficientnet_best_model.classifier[1].in_features
)
mobilenet_best_model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V2)
mobilenet_best_model = setup_cnn_base(
    model=mobilenet_best_model,
    output_dropout=0.5,
    device=torch.accelerator.current_accelerator(),
    num_classes=len(unique_labels),
    in_features=mobilenet_best_model.classifier[1].in_features
)

efficientnet_best_model.load_state_dict(best_efficientnet_params["model_state_dict"])
mobilenet_best_model.load_state_dict(best_mobilenet_params["model_state_dict"])

<All keys matched successfully>

In [14]:
# Create cnn-lstm pipeline
class CNN_LSTM(nn.Module):
    def __init__(
        self,
        feature_extractor,
        feature_dim,
        hidden_dim,
        num_layers,
        num_classes,
        device="cuda"
    ):
        super().__init__()
        self.cnn_feature_extractor = feature_extractor
        self.cnn_feature_extractor = self.cnn_feature_extractor.to(device)

        self.lstm = nn.LSTM(
            input_size=feature_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, h_prev=None, c_prev=None):
        
        batch_size, seq_len, C, H, W = x.size()

        # Flatten for CNN feature extraction
        x = x.view(batch_size * seq_len, C, H, W)
        self.cnn_feature_extractor.eval()
        with torch.no_grad():
            features = self.cnn_feature_extractor(x)  # (batch * seq_len, feature_dim)

        # Restore sequence dimension
        features = features.view(batch_size, seq_len, -1)

        # LSTM forward pass
        if h_prev and c_prev:
            lstm_out, (h, c) = self.lstm(features, (h_prev, c_prev))
        else:
            lstm_out, (h, c) = self.lstm(features)

        # Use last time step for classification
        out = lstm_out[:, -1, :]
        out = self.fc(out)

        return out, h, c

In [15]:
# test
cnnlstm = CNN_LSTM(
    feature_extractor=mobilenet_best_model.features,
    feature_dim=(mobilenet_best_model.classifier[0].in_features * (IMAGE_HEIGHT//32) * (IMAGE_WIDTH//32)),
    hidden_dim=128,
    num_layers=1,
    num_classes=len(unique_labels),
    device=torch.accelerator.current_accelerator()
).to(torch.accelerator.current_accelerator())

cnnlstm.eval()
x = torch.randn(64,15,3,224,224).to("cuda")
output, _, _ = cnnlstm(x)
print(output.size())

torch.Size([64, 12])


In [None]:
# training
efficientlstm = CNN_LSTM(
    feature_extractor=efficientnet_best_model.features,
    feature_dim=(efficientnet_best_model.classifier[0].in_features * (IMAGE_HEIGHT//32) * (IMAGE_WIDTH//32)),
    hidden_dim=128,
    num_layers=1,
    num_classes=len(unique_labels),
    device=torch.accelerator.current_accelerator()
).to(torch.accelerator.current_accelerator())

efficientlstm_train_dataset = ExoNetDatasetV2(
    df_labels=seq_train_df,
    seq_len=15,
    transform=train_image_transform,
    target_transform=encoder.transform
)
efficientlstm_val_dataset = ExoNetDatasetV2(
    df_labels=seq_val_df,
    seq_len=15,
    transform=val_image_transform,
    target_transform=encoder.transform
)
efficientlstm_train_dataloader = DataLoader(
    efficientlstm_train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=7,
    pin_memory=True,
    prefetch_factor=4
)
efficientlstm_val_dataloader = DataLoader(
    efficientlstm_val_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    prefetch_factor=4
)

lr = 1e-3
efficientlstm_criterion = nn.CrossEntropyLoss(weight=train_loss_weights)
efficientlstm_optimizer = torch.optim.SGD(efficientlstm.parameters(), lr=lr, momentum=0.9)
efficientlstm_checkpoint_path = f"/mnt/f/Checkpoints/Tesis/cnn_lstm/efficient_net_b0_base.pth"
efficientlstm_checkpoint_path_best = f"/mnt/f/Checkpoints/Tesis/cnn_lstm/efficient_net_b0_base_best.pth"
efficientlstm_current_epoch = 1
efficientlstm_current_history = None

efficientlstm_checkpoint = load_state(efficientlstm_checkpoint_path, device=torch.accelerator.current_accelerator())
if efficientlstm_checkpoint:
    efficientlstm.load_state_dict(efficientlstm_checkpoint['model_state_dict'])
    efficientlstm_optimizer.load_state_dict(efficientlstm_checkpoint['optimizer_state_dict'])
    efficientlstm_current_epoch = efficientlstm_checkpoint['current_epoch']
    efficientlstm_current_history = efficientlstm_checkpoint['history']

efficientlstm_model, efficientlstm_history, efficientlstm_val_loss = train_model(
    model=efficientlstm,
    train_dataloader=efficientlstm_train_dataloader,
    val_dataloader=efficientlstm_val_dataloader,
    criterion=efficientlstm_criterion,
    optimizer=efficientlstm_optimizer,
    checkpoint_path=efficientlstm_checkpoint_path,
    checkpoint_path_best=efficientlstm_checkpoint_path_best,
    epochs=30,
    device=torch.accelerator.current_accelerator(),
    debug=False,
    current_epoch=efficientlstm_current_epoch,
    current_history=efficientlstm_current_history,
    training_type="lstm"
)

Epoch [2/30] - Proc: 1/1:   0%|                           | 56/19973 [01:04<3:27:05,  1.60it/s, acc=0.9442, loss=0.0729]

## ConvLSTM n channels

In [10]:
class ConvLSTMCell(nn.Module):
    def __init__(
        self,
        input_channels,
        hidden_channels,
        kernel_size,
        bias=True,
        input_dropout=0.0,
        recurrent_dropout=0.0
    ):
        super().__init__()
        padding = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.hidden_channels = hidden_channels

        self.conv = nn.Conv2d(
            input_channels + hidden_channels,
            4 * hidden_channels,
            kernel_size,
            padding=padding,
            bias=bias,
        )
        self.input_dropout = nn.Dropout2d(input_dropout)
        self.recurrent_dropout = nn.Dropout2d(recurrent_dropout)

    def forward(self, x, h, c):
        if self.input_dropout.p > 0.0:
            x = self.input_dropout(x)
        if self.recurrent_dropout.p > 0.0:
            h = self.recurrent_dropout(h)
        
        combined = torch.cat([x, h], dim=1)
        conv_out = self.conv(combined)
        cc_i, cc_f, cc_o, cc_g = torch.chunk(conv_out, 4, dim=1)

        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c + i * g
        h_next = o * torch.tanh(c_next)
        return h_next, c_next

In [11]:
class ConvLSTM(nn.Module):
    def __init__(
        self,
        input_channels,
        hidden_channels_list,
        kernel_size,
        num_classes,
        device,
        input_dropout=0.0,
        recurrent_dropout=0.0,
        output_dropout=0.0,
    ):
        super().__init__()
        self.num_layers = len(hidden_channels_list)

        layers = []
        for i in range(self.num_layers):
            in_channels = input_channels if i == 0 else hidden_channels_list[i - 1]
            layers.append(
                ConvLSTMCell(
                    in_channels,
                    hidden_channels_list[i],
                    kernel_size[i],
                    input_dropout=input_dropout,
                    recurrent_dropout=recurrent_dropout
                ).to(device)
            )
        self.layers = nn.ModuleList(layers)
        self.avg_pool_2d = nn.AdaptiveAvgPool2d(output_size=(1,1))
        self.classifier = nn.Linear(hidden_channels_list[-1], num_classes)
        self.output_dropout = nn.Dropout2d(output_dropout)

    def forward(self, x, h=None, c=None):
        # x: (B, T, C, H, W) -> feature maps from EfficientNet
        B, T, _, H, W = x.size()

        if h is None:
            h = [torch.zeros(B, ch, H, W, device=x.device) for ch in self._hidden_channels]
            c = [torch.zeros(B, ch, H, W, device=x.device) for ch in self._hidden_channels]

        outputs = []
        for t in range(T): # For each timestep in the sequence
            input_t = x[:, t]

            for layer_idx, cell in enumerate(self.layers):
                h[layer_idx], c[layer_idx] = cell(input_t, h[layer_idx], c[layer_idx])
                input_t = h[layer_idx]
            
        dropped = self.output_dropout(h[-1])
        pooled = self.avg_pool_2d(dropped)  # (B, C_hidden, 1, 1)
        pooled = pooled.view(B, -1)  # (B, C_hidden)

        frame_output = self.classifier(pooled)  # (B, num_classes)
        outputs.append(frame_output)

        outputs = torch.stack(outputs, dim=1)  # (B, T, num_classes)
        return outputs, h, c

    @property
    def _hidden_channels(self):
        return [cell.hidden_channels for cell in self.layers]


In [12]:
class CnnExtractorConvLSTM(nn.Module):
    def __init__(
        self,
        hidden_channels_list,
        kernel_size,
        num_classes,
        device,
        feature_extractor,
        feature_dim,
        name,
        input_dropout,
        recurrent_dropout,
        output_dropout
    ):
        super().__init__()
        self.name = name
        self.feature_extractor = feature_extractor
        self.feature_extractor = self.feature_extractor.to(device=device)

        # ConvLSTM on top of feature maps
        self.convlstm = ConvLSTM(
            input_channels=feature_dim,
            hidden_channels_list=hidden_channels_list,
            kernel_size=kernel_size,
            num_classes=num_classes,
            input_dropout=input_dropout,
            recurrent_dropout=recurrent_dropout,
            output_dropout=output_dropout,
            device=device
        ).to(device)

    def forward(self, x, h=None, c=None):
        # x: (B, T, 3, H, W) raw RGB frames
        B, T, C, H, W = x.size()

        features = []
        for t in range(T):
            with torch.no_grad():
                f_t = self.feature_extractor(x[:, t])  # (B, 1280, H', W')
                features.append(f_t)

        features = torch.stack(features, dim=1)  # (B, T, C_feat, H', W')
        
        out, h_prev, c_prev = self.convlstm(features, h, c)  # (B, T, num_classes)
        return out, h_prev, c_prev


In [13]:
def test(m_test, batch_size, seq_length, channels=3, heigth=224, weigth=224):
    m_test.eval()
    x = torch.randn(batch_size, seq_length, 3, heigth, weigth).to("cuda")
    out, h, c = m_test(x)
    out, h, c = m_test(x, h, c)
    print(out.shape, len(h), len(c))

## EfficientNetB0 with convLSTM

In [14]:
efficientnet_convLSTM = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficientnet_convLSTM_dim = 1280
efficientnet_name = "EfficientNetB0_convLSTM"

In [15]:
# test
m_test = CnnExtractorConvLSTM(
    num_classes=len(unique_labels),
    hidden_channels_list=[128, 64],
    kernel_size=[(3,3),(3,3)],
    device="cuda",
    feature_extractor=efficientnet_convLSTM.features,
    feature_dim=efficientnet_convLSTM_dim,
    name=efficientnet_name,
    input_dropout=0.2,
    recurrent_dropout=0.1,
    output_dropout=0.3
).to(device="cuda")
test(m_test, 64, SEQUENCE_LENGTH)
test(m_test, 1, 1)

torch.Size([64, 1, 12]) 2 2
torch.Size([1, 1, 12]) 2 2


## MobileNetV2 with convLSTM

In [16]:
mobnet_convLSTM = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
mobnet_convLSTM_dim = 1280
mobnet_name = "MobileNetV2_convLSTM"

In [17]:
# test
m_test = CnnExtractorConvLSTM(
    num_classes=len(unique_labels),
    hidden_channels_list=[128, 64],
    kernel_size=[(3,3),(3,3)],
    device="cuda",
    feature_extractor=mobnet_convLSTM.features,
    feature_dim=mobnet_convLSTM_dim,
    name=mobnet_name,
    input_dropout=0.2,
    recurrent_dropout=0.1,
    output_dropout=0.3
).to(device="cuda")
test(m_test, 64, SEQUENCE_LENGTH)
test(m_test, 1, 1)

torch.Size([64, 1, 12]) 2 2
torch.Size([1, 1, 12]) 2 2


## VGG16 with convLSTM

In [18]:
vgg16_convLSTM = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16_convLSTM_dim = 512
vgg16_name = "VGG16_convLSTM"

In [19]:
# test
m_test = CnnExtractorConvLSTM(
    num_classes=len(unique_labels),
    hidden_channels_list=[128, 64],
    kernel_size=[(3,3),(3,3)],
    device="cuda",
    feature_extractor=vgg16_convLSTM.features,
    feature_dim=vgg16_convLSTM_dim,
    name=vgg16_name,
    input_dropout=0.2,
    recurrent_dropout=0.1,
    output_dropout=0.3
).to(device="cuda")
test(m_test, 64, SEQUENCE_LENGTH)
test(m_test, 1, 1)

torch.Size([64, 1, 12]) 2 2
torch.Size([1, 1, 12]) 2 2


# Training

In [20]:
# Checking if encoder.classes_ and classes obtained after applying count method to df are in the same order
count_class_train = seq_train_df.groupby(['class'])['frame'].count()
classes_from_df = list(count_class_train.to_dict().keys())
print("Are classes equal? Answer:", str(classes_from_df).replace(',','') == str(encoder.classes_).replace('\n',''))
print(count_class_train.to_dict())
train_loss_weights = get_weight_vector(count_class_train.to_dict())

Are classes equal? Answer: True
{'DS-T-LG': 2063, 'DW-S': 1950, 'DW-T-O': 612, 'IS-S': 1206, 'IS-T-DW': 1615, 'IS-T-LG': 490, 'LG-S': 4640, 'LG-T-DS': 1151, 'LG-T-DW': 28413, 'LG-T-IS': 1147, 'LG-T-O': 5504, 'LG-T-SE': 4555}




In [21]:
# define params grid
batch_sizes = [128]
device = "cuda"
lr = 1e-3
params = {
    "batches": batch_sizes,
    "epochs": [ 30 ],
    "dropout": [
        [0.2, 0.1, 0.2]
    ], # input, recurrent, output
    "layers": [[128, 64, 32]],
}
model_names = ['MobileNetV2_convLSTM', 'EfficientNet_convLSTM', 'VGG16_convLSTM']

In [22]:
model_params = list(product(model_names, *params.values()))
best_results = {
    'EfficientNet_convLSTM': {
        'val_loss': float('inf'),
        'params': None
    },
    'MobileNetV2_convLSTM': {
        'val_loss': float('inf'),
        'params': None
    },
    'VGG16_convLSTM': {
        'val_loss': float('inf'),
        'params': None
    }
}
checkpoints_prefixes = dict()

for i, (model_name, *combo_params) in enumerate(model_params, start=1):
    if model_name == 'EfficientNet_convLSTM':
        model = CnnExtractorConvLSTM(
            num_classes=len(unique_labels),
            hidden_channels_list=combo_params[3],
            kernel_size=[(3,3) for _ in combo_params[3]],
            device="cuda",
            feature_extractor=efficientnet_convLSTM.features,
            feature_dim=efficientnet_convLSTM_dim,
            name=model_name,
            input_dropout=combo_params[2][0],
            recurrent_dropout=combo_params[2][1],
            output_dropout=combo_params[2][2]
        ).to(device="cuda")
    if model_name == 'MobileNetV2_convLSTM':
        model = CnnExtractorConvLSTM(
            num_classes=len(unique_labels),
            hidden_channels_list=combo_params[3],
            kernel_size=[(3,3) for _ in combo_params[3]],
            device="cuda",
            feature_extractor= mobnet_convLSTM.features,
            feature_dim=mobnet_convLSTM_dim,
            name=model_name,
            input_dropout=combo_params[2][0],
            recurrent_dropout=combo_params[2][1],
            output_dropout=combo_params[2][2]
        ).to(device="cuda")
    if model_name == 'VGG16_convLSTM':
        model = CnnExtractorConvLSTM(
            num_classes=len(unique_labels),
            hidden_channels_list=combo_params[3],
            kernel_size=[(3,3) for _ in combo_params[3]],
            device="cuda",
            feature_extractor= vgg16_convLSTM.features,
            feature_dim=vgg16_convLSTM_dim,
            name=model_name,
            input_dropout=combo_params[2][0],
            recurrent_dropout=combo_params[2][1],
            output_dropout=combo_params[2][2]
        ).to(device="cuda")

    criterion = nn.CrossEntropyLoss(weight=train_loss_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    checkpoint_prefix = hash(str(combo_params))
    checkpoints_prefixes[f'{checkpoint_prefix}'] = combo_params
    torch.save(checkpoints_prefixes, f"/mnt/f/Checkpoints/Tesis/checkpoint_prefixes.pth")
    checkpoint_path = f"/mnt/f/Checkpoints/Tesis/{model_name}_{checkpoint_prefix}.pth"
    checkpoint_path_best = f"/mnt/f/Checkpoints/Tesis/{model_name}_{checkpoint_prefix}_best.pth"
    
    train_dataset = ExoNetDataset(
        df_labels=seq_train_df,
        seq_len=SEQUENCE_LENGTH,
        target_transform=encoder.transform,
        transform=train_image_transform
    )
    val_dataset = ExoNetDataset(
        df_labels=seq_val_df,
        seq_len=SEQUENCE_LENGTH,
        target_transform=encoder.transform,
        transform=val_image_transform
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=combo_params[0],
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=combo_params[0],
        shuffle=False,
        num_workers=6,
        pin_memory=True
    )

    model, history, val_loss = train_model(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        checkpoint_path=checkpoint_path,
        checkpoint_path_best=checkpoint_path_best,
        epochs=combo_params[1],
        device="cuda",
        n_processes=len(model_params),
        i_process=i
    )
    if val_loss < best_results[model_name]['val_loss']:
        best_results[model_name]['val_loss'] = val_loss
        best_results[model_name]['params'] = combo_params
        
    torch.save(best_results, f"/mnt/f/Checkpoints/Tesis/best_results.pth")

Epoch [1/30] - Proc: 1/3:   4%|█▎                               | 16/417 [01:31<38:11,  5.71s/it, acc=0.0410, loss=12.2]


KeyboardInterrupt: 