In [1]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used below live under it.
from google.colab import drive
# force_remount=True is passed so the call re-mounts rather than reusing an existing mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import torch

TORCH = torch.__version__.split('+')[0]
CUDA = 'cu' + torch.version.cuda.replace('.', '')

# Construct the installation command
install_command = f"pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html"

# Execute the command
!{install_command}

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html


In [3]:
import os
import torch
import numpy as np

# Seed every RNG this notebook draws from so runs are repeatable.
import random  # stdlib RNG used by the dataset's augmentation helpers

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Makes CUDA kernel launches synchronous so errors surface at the failing call (slower; debugging aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
import pandas as pd

DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/DGMD E-14 Project/Datasets/asl-signs"

# Train DataFrame
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")

# Define the list of signs to include
selected_signs = [
    'TV', 'after', 'airplane', 'all', 'alligator', 'bird', 'callonphone', 'cry',
    'dad', 'dance', 'dog', 'drink', 'duck', 'elephant', 'eye', 'feet', 'finger',
    'flower', 'food', 'face'
]

# Keep only the selected signs. `.copy()` makes the result an independent frame so
# later column assignments do not hit pandas' SettingWithCopyWarning.
train_df = train_df[train_df['sign'].isin(selected_signs)].copy()

display(train_df.head())
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in display().
train_df.info()

Unnamed: 0,path,participant_id,sequence_id,sign
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
5,train_landmark_files/26734/1000241583.parquet,26734,1000241583,duck
8,train_landmark_files/37055/100035691.parquet,37055,100035691,flower
39,train_landmark_files/53618/1001896056.parquet,53618,1001896056,finger
41,train_landmark_files/53618/100190623.parquet,53618,100190623,cry


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7641 entries, 3 to 94474
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   path            7641 non-null   object
 1   participant_id  7641 non-null   int64 
 2   sequence_id     7641 non-null   int64 
 3   sign            7641 non-null   object
dtypes: int64(2), object(2)
memory usage: 298.5+ KB


None

In [5]:
# Number of examples available for each sign in the filtered training table.
sign_counts = train_df['sign'].value_counts()

# Distinct signs, in order of first appearance in train_df.
CLASSES = train_df['sign'].unique()
NUMBER_OF_CLASSES = len(CLASSES)

# Spread of the per-sign example counts.
min_sign_count, max_sign_count, average_sign_count = (
    sign_counts.min(),
    sign_counts.max(),
    sign_counts.mean(),
)

for caption, value in (
    ("Count of unique signs:", NUMBER_OF_CLASSES),
    ("Minimum number of examples per sign:", min_sign_count),
    ("Maximum number of examples per sign:", max_sign_count),
    ("Average number of examples per sign:", average_sign_count),
):
    print(caption, value)

Count of unique signs: 20
Minimum number of examples per sign: 312
Maximum number of examples per sign: 405
Average number of examples per sign: 382.05


In [8]:
import random
import math
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pyarrow.parquet as pq
from sklearn.preprocessing import LabelEncoder

class ASLSignsDataset(Dataset):
    """Map-style dataset that loads one landmark parquet file per training row.

    Each item is ``(features, label)``: ``features`` is the 2-D tensor returned by
    ``_preprocess`` and ``label`` is the integer code ``label_encoder`` assigned to the
    row's ``sign``.

    Args:
        dataframe: Table with at least ``path`` and ``sign`` columns. A private copy is
            stored (with an added ``encoded_labels`` column); the caller's frame is not
            modified.
        root_dir: Directory that the relative ``path`` values are joined onto.
        rows_per_frame: Number of parquet rows that make up one frame.
        max_len: Number of frames every sequence is padded or truncated to. ``None``
            falls back to the notebook-level ``MAX_LEN`` constant when an item is loaded.

    Notes:
        ``_preprocess`` reads the notebook-level ``POINT_LANDMARKS`` and ``_flip_lr``
        reads ``LHAND``/``RHAND``/``LLIP``/``RLIP``/``LPOSE``/``RPOSE``/``LEYE``/
        ``REYE``/``LNOSE``/``RNOSE``; none of them are defined in this cell.
    """

    def __init__(self, dataframe, root_dir, rows_per_frame=543, max_len=None):
        # Copy first: adding a column to a filtered slice of the caller's frame would
        # mutate shared data and trigger SettingWithCopyWarning.
        self.dataframe = dataframe.copy()
        self.root_dir = root_dir
        self.rows_per_frame = rows_per_frame
        self.max_len = max_len
        self.label_encoder = LabelEncoder()
        self.dataframe['encoded_labels'] = self.label_encoder.fit_transform(self.dataframe['sign'])

    def __len__(self):
        """Return the number of rows (sequences) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load, pad/truncate and preprocess the sequence at position ``idx``.

        Returns:
            tuple: ``(features, label)`` with ``features`` a float tensor of shape
            ``[frames, flattened_features]`` and ``label`` a Python ``int``.
        """
        row = self.dataframe.iloc[idx]
        file_path = os.path.join(self.root_dir, row['path'])
        landmarks_df = pq.read_table(file_path).to_pandas()

        max_len = self.max_len if self.max_len is not None else MAX_LEN
        landmarks_tensor = self._load_relevant_data_subset(landmarks_df, max_len)
        landmarks_tensor = self._preprocess(landmarks_tensor)

        # Augmentation is intentionally not applied here; see `_augment`.

        # The codes were computed once in __init__, so no per-item encoder call is needed.
        label = int(row['encoded_labels'])
        return landmarks_tensor, label

    def _load_relevant_data_subset(self, df, max_len):
        """Turn a long-format landmark table into a ``[max_len, rows_per_frame, 3]`` tensor.

        Sequences longer than ``max_len`` keep their first ``max_len`` frames; shorter
        ones are zero-padded at the end.

        Raises:
            ValueError: If ``len(df)`` is not a multiple of ``rows_per_frame``.
        """
        n_frames = len(df) // self.rows_per_frame
        coords = df[['x', 'y', 'z']].values.reshape(n_frames, self.rows_per_frame, 3)
        landmarks = torch.tensor(coords, dtype=torch.float32)

        if n_frames > max_len:
            landmarks = landmarks[:max_len]
        elif n_frames < max_len:
            padding = landmarks.new_zeros((max_len - n_frames, self.rows_per_frame, 3))
            landmarks = torch.cat([landmarks, padding], dim=0)

        return landmarks

    def _preprocess(self, landmarks):
        """Normalise, select landmarks, append frame-to-frame differences and flatten.

        Args:
            landmarks: Tensor of shape ``[frames, landmarks, channels]``.

        Returns:
            Tensor of shape ``[frames, len(POINT_LANDMARKS) * channels * 3]`` with NaNs
            replaced by zeros.
        """
        sequence_length = landmarks.shape[0]

        # Per frame, subtract the mean of landmark 17's x and y from every landmark and
        # channel. NOTE(review): this is one scalar per frame applied to x, y and z
        # alike - confirm that is the intended "nose" normalisation.
        frame_offset = landmarks[:, 17, :2].mean(dim=1, keepdim=True).unsqueeze(1)
        landmarks = landmarks - frame_offset  # out-of-place: the caller's tensor is untouched

        selected_landmarks = landmarks[:, POINT_LANDMARKS, :]

        # First and second differences along the frame axis (dim 0). Prepending the first
        # frame keeps the length unchanged and makes the first difference zero.
        dx = torch.diff(selected_landmarks, dim=0, prepend=selected_landmarks[:1])
        dx2 = torch.diff(dx, dim=0, prepend=dx[:1])

        processed_landmarks = torch.cat([selected_landmarks, dx, dx2], dim=-1)
        processed_landmarks = torch.nan_to_num(processed_landmarks)
        return processed_landmarks.reshape(sequence_length, -1)

    def _augment(self, landmarks, max_len=None):
        """Apply the random augmentation pipeline to one sequence.

        Each step fires independently: resample (p=0.8), left/right flip (p=0.5),
        affine (p=0.75), temporal crop (only when ``max_len`` is given), temporal mask
        (p=0.5) and spatial mask (p=0.5). Not called from ``__getitem__``.
        """
        if random.random() < 0.8:
            landmarks = self._resample(landmarks)

        if random.random() < 0.5:
            landmarks = self._flip_lr(landmarks)

        if random.random() < 0.75:
            landmarks = self._spatial_random_affine(landmarks)

        if max_len is not None:
            landmarks = self._temporal_crop(landmarks, max_len=max_len)

        if random.random() < 0.5:
            landmarks = self._temporal_mask(landmarks)

        if random.random() < 0.5:
            landmarks = self._spatial_mask(landmarks)

        return landmarks

    def _resample(self, landmarks):
        """Stretch or squeeze the sequence in time by a random factor in [0.8, 1.2].

        Uses nearest-neighbour interpolation along the frame axis, so every output frame
        is a copy of one input frame.

        Returns:
            Tensor of shape ``[new_frames, -1, 3]``.
        """
        current_length = landmarks.size(0)
        rate = random.uniform(0.8, 1.2)
        target_length = max(1, int(current_length * rate))  # never resample to zero frames

        # F.interpolate resizes the LAST axis, so time has to be moved there with a real
        # transpose: [T, features] -> [1, features, T].
        channels_first = landmarks.reshape(current_length, -1).transpose(0, 1).unsqueeze(0).contiguous()
        resampled = F.interpolate(channels_first, size=(target_length,), mode='nearest')

        # Back to frame-major layout, then to [T', landmarks, 3].
        return resampled.squeeze(0).transpose(0, 1).reshape(target_length, -1, 3)

    def _flip_lr(self, landmarks):
        """Mirror the x coordinate (``x -> 1 - x``) and swap left/right landmark groups.

        The input is viewed as ``[frames, -1, 3]`` and modified in place where possible;
        the result is returned flattened to ``[frames, -1]``.
        """
        landmarks = landmarks.reshape(landmarks.size(0), -1, 3)

        landmarks[:, :, 0] = 1 - landmarks[:, :, 0]

        landmarks = self._swap_landmarks(landmarks, LHAND, RHAND)
        landmarks = self._swap_landmarks(landmarks, LLIP, RLIP)
        landmarks = self._swap_landmarks(landmarks, LPOSE, RPOSE)
        landmarks = self._swap_landmarks(landmarks, LEYE, REYE)
        landmarks = self._swap_landmarks(landmarks, LNOSE, RNOSE)

        return landmarks.reshape(landmarks.size(0), -1)

    def _swap_landmarks(self, landmarks, set1, set2):
        """Swap the landmarks at paired indices of ``set1`` and ``set2`` in place.

        Pairs with an index outside ``landmarks.shape[1]`` are skipped and reported.
        """
        for l_index, r_index in zip(set1, set2):
            if l_index < landmarks.shape[1] and r_index < landmarks.shape[1]:
                temp = landmarks[:, l_index].clone()
                landmarks[:, l_index] = landmarks[:, r_index]
                landmarks[:, r_index] = temp
            else:
                print(f"Index out of bounds: l_index={l_index}, r_index={r_index}")
        return landmarks

    def _spatial_random_affine(self, landmarks):
        """Randomly scale, shear, rotate and shift the x/y coordinates of every frame.

        NOTE(review): parameters are drawn independently for each frame, so consecutive
        frames receive different transforms - confirm that is intended.

        Returns:
            Tensor flattened to ``[frames, -1]``.
        """
        landmarks = landmarks.reshape(landmarks.size(0), -1, 3)

        base_center = torch.tensor([0.5, 0.5])

        for i in range(landmarks.shape[0]):
            frame = landmarks[i, :, :2]

            # Scaling
            scale = random.uniform(0.8, 1.2)
            frame = frame * scale

            # Shearing along exactly one axis
            shear_x = shear_y = random.uniform(-0.15, 0.15)
            if random.random() < 0.5:
                shear_x = 0.
            else:
                shear_y = 0.
            shear_mat = torch.tensor([[1., shear_x], [shear_y, 1.]])
            frame = torch.matmul(frame, shear_mat)
            # Rotation centre for THIS frame only; it must not carry over to later frames.
            center = base_center + torch.tensor([shear_y, shear_x])

            # Rotation about the (sheared) centre
            degree = random.uniform(-30, 30)
            radian = degree * math.pi / 180
            cos_val, sin_val = math.cos(radian), math.sin(radian)
            rotate_mat = torch.tensor([[cos_val, -sin_val], [sin_val, cos_val]])
            frame = torch.matmul(frame - center, rotate_mat) + center

            # Translation (same shift on x and y)
            shift_val = random.uniform(-0.1, 0.1)
            frame = frame + shift_val

            landmarks[i, :, :2] = frame

        return landmarks.reshape(landmarks.size(0), -1)

    def _temporal_crop(self, landmarks, max_len=384):
        """Return a random contiguous window of at most ``max_len`` frames.

        Works for any tensor whose first axis is the frame axis; sequences that already
        fit are returned unchanged.
        """
        sequence_length = landmarks.shape[0]
        if sequence_length <= max_len:
            return landmarks

        start = random.randint(0, sequence_length - max_len)
        return landmarks[start:start + max_len]

    def _temporal_mask(self, landmarks, mask_size_range=(0.2, 0.4), mask_value=float('nan')):
        """Overwrite a random contiguous run of frames with ``mask_value`` (in place).

        The run covers a random fraction of the sequence drawn from ``mask_size_range``.
        """
        sequence_length = landmarks.size(0)

        mask_size_fraction = random.uniform(*mask_size_range)
        mask_size = int(sequence_length * mask_size_fraction)

        if sequence_length - mask_size > 0:
            start = random.randint(0, sequence_length - mask_size)
        else:
            start = 0

        # Slicing the first axis covers both the 2-D and the 3-D layout.
        landmarks[start:start + mask_size] = mask_value
        return landmarks

    def _spatial_mask(self, landmarks, size=(0.2, 0.4), mask_value=float('nan')):
        """Blank out every landmark whose (x, y) lies strictly inside a random square.

        The last axis is read as consecutive ``(x, y, z)`` triples; trailing channels
        that do not complete a triple are left alone. The tensor is modified in place
        and returned flattened to ``[frames, channels]``.
        """
        if landmarks.ndim > 2:
            landmarks = landmarks.reshape(landmarks.size(0), -1)

        channels = landmarks.shape[1]
        used = (channels // 3) * 3  # channels that form complete (x, y, z) triples

        mask_size = random.uniform(*size)
        mask_offset_x = random.uniform(0, 1 - mask_size)
        mask_offset_y = random.uniform(0, 1 - mask_size)

        xs = landmarks[:, 0:used:3]
        ys = landmarks[:, 1:used:3]
        # Comparisons with NaN are False, so already-missing landmarks are never selected.
        inside = (
            (xs > mask_offset_x) & (xs < mask_offset_x + mask_size)
            & (ys > mask_offset_y) & (ys < mask_offset_y + mask_size)
        )
        # Expand the per-landmark decision to its three channels.
        channel_mask = inside.repeat_interleave(3, dim=1)
        landmarks[:, :used] = landmarks[:, :used].masked_fill(channel_mask, mask_value)

        return landmarks


In [9]:
# Build the dataset over the filtered training table.
dataset = ASLSignsDataset(train_df, DATA_DIR)

# Pull the first item to sanity-check the feature tensor and its label.
example_landmarks, example_label = dataset[0]
print("Tensor size:", example_landmarks.size(), "Label:", example_label)

# Dataset-level summary: row count and number of distinct encoded labels.
total_items = len(dataset)
unique_labels = dataset.dataframe['encoded_labels'].nunique()

print("Total items in the dataset:", total_items)
print("Number of unique labels:", unique_labels)

Tensor size: torch.Size([384, 3186]) Label: 5
Total items in the dataset: 7641
Number of unique labels: 20


In [10]:
def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into a zero-padded batch.

    Every sequence is padded at the END of its first (frame) axis up to the longest
    sequence in the batch, so real frames always start at index 0.

    Args:
        batch: Non-empty list of ``(x, y)`` where ``x`` is a 2-D tensor
            ``[frames, features]`` and ``y`` is an integer label.

    Returns:
        tuple: ``(batch_x, batch_y)`` with ``batch_x`` of shape
        ``[batch, max_frames, features]`` and ``batch_y`` of shape ``[batch]``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    sequences, labels = zip(*batch)
    max_len = max(seq.shape[0] for seq in sequences)

    # F.pad lists pads from the last axis backwards: (feat_left, feat_right, frame_left, frame_right).
    padded = [F.pad(seq, (0, 0, 0, max_len - seq.shape[0])) for seq in sequences]

    return torch.stack(padded), torch.tensor(labels)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold

class CausalDWConv1D(nn.Module):
    """Depthwise 1-D convolution whose output at step ``t`` depends only on inputs up to ``t``.

    The convolution pads ``(kernel_size - 1) * dilation_rate`` zeros on both sides and the
    same number of trailing outputs is then dropped, so the output length equals the
    input length.
    """

    def __init__(self, channels, kernel_size, dilation_rate):
        super().__init__()
        self.padding = dilation_rate * (kernel_size - 1)
        self.dw_conv = nn.Conv1d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=kernel_size,
            stride=1,
            padding=self.padding,
            dilation=dilation_rate,
            groups=channels,  # one filter per channel -> depthwise
            bias=False,
        )

    def forward(self, x):
        out = self.dw_conv(x)
        if self.padding == 0:
            return out
        # Drop the outputs that were computed from right-hand padding.
        return out[:, :, :-self.padding]


In [12]:
class ECA(nn.Module):
    """Channel gate: average over the last axis, 1-D conv across channels, sigmoid, rescale.

    ``forward`` expects a 3-D tensor and returns a tensor of the same shape.
    """

    def __init__(self, kernel_size=5):
        super().__init__()
        self.kernel_size = kernel_size
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=same_padding, bias=False)

    def forward(self, x):
        # One descriptor per channel: mean over the last axis -> [batch, channels, 1].
        pooled = x.mean(dim=-1, keepdim=True)

        # Put the channel axis last so the convolution slides across channels.
        gate = self.conv(pooled.transpose(1, 2))
        gate = torch.sigmoid(gate).transpose(1, 2)  # back to [batch, channels, 1]

        return x * gate

In [13]:
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``, applied element-wise."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate

In [14]:
class Conv1DBlock(nn.Module):
    """Residual block: optional 1x1 expansion, causal depthwise conv, SE gate, 1x1 projection.

    Args:
        in_channels: Channels of the input ``[batch, in_channels, length]``.
        out_channels: Channels of the output.
        kernel_size: Kernel size of the causal depthwise convolution.
        dilation_rate: Dilation of the depthwise convolution.
        drop_rate: Dropout probability applied to the projected features before the skip.
        expand_ratio: Width multiplier of the expansion stage; values ``<= 1`` skip it.
        se_ratio: Squeeze ratio of the squeeze-and-excitation gate; the gate is used only
            when ``0 < se_ratio <= 1``.
        activation: ``'swish'`` or the name of a function in ``torch.nn.functional``.

    The skip connection is added only when ``in_channels == out_channels``; otherwise the
    block is a plain feed-forward stage.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation_rate=1, drop_rate=0.0, expand_ratio=2, se_ratio=0.25, activation='relu'):
        super().__init__()

        self.use_expansion = expand_ratio > 1
        # Width of everything between the expansion and the projection.
        hidden_channels = in_channels * expand_ratio if self.use_expansion else in_channels

        # Expansion phase
        if self.use_expansion:
            self.expansion_conv = nn.Conv1d(in_channels, hidden_channels, kernel_size=1)
            self.expansion_bn = nn.BatchNorm1d(hidden_channels)
            self.expansion_activation = self._make_activation(activation)

        # Causal depthwise convolution
        self.dwconv = CausalDWConv1D(hidden_channels, kernel_size, dilation_rate)
        self.norm = nn.BatchNorm1d(hidden_channels)

        # Squeeze and Excitation
        self.use_se = 0 < se_ratio <= 1
        if self.use_se:
            num_squeezed_channels = max(1, int(in_channels * se_ratio))
            self.se = nn.Sequential(
                nn.AdaptiveAvgPool1d(1),
                nn.Conv1d(hidden_channels, num_squeezed_channels, kernel_size=1),
                nn.ReLU(),
                nn.Conv1d(num_squeezed_channels, hidden_channels, kernel_size=1),
                nn.Sigmoid()
            )

        # Output phase
        self.project_conv = nn.Conv1d(hidden_channels, out_channels, kernel_size=1)
        self.project_bn = nn.BatchNorm1d(out_channels)

        self.drop_rate = drop_rate
        # A residual sum needs matching shapes.
        self.use_residual = in_channels == out_channels
        self.activation = self._make_activation(activation)

    @staticmethod
    def _make_activation(activation):
        """Return a callable for ``activation`` (a ``Swish`` module or an ``F`` function)."""
        if activation == 'swish':
            return Swish()
        return getattr(F, activation)

    def forward(self, x):
        identity = x

        # Expansion
        if self.use_expansion:
            x = self.expansion_conv(x)
            x = self.expansion_bn(x)
            x = self.expansion_activation(x)

        # Depthwise convolution
        x = self.dwconv(x)
        x = self.norm(x)

        # Squeeze and Excitation: rescale each channel by a gate in (0, 1)
        if self.use_se:
            x = x * self.se(x)

        # Projection
        x = self.project_conv(x)
        x = self.project_bn(x)

        if self.drop_rate > 0:
            x = F.dropout(x, p=self.drop_rate, training=self.training)
        if self.use_residual:
            # Out-of-place add: leaves the projected tensor intact for autograd.
            x = x + identity
        return self.activation(x)


In [15]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over ``[batch, seq_len, dim]`` inputs.

    Args:
        dim: Feature size of the input and output; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0, "dimension must be divisible by the number of heads"
        self.dim = dim
        self.num_heads = num_heads
        self.depth = dim // num_heads  # features per head

        # Query / key / value projections followed by the output projection.
        self.wq = nn.Linear(dim, dim)
        self.wk = nn.Linear(dim, dim)
        self.wv = nn.Linear(dim, dim)
        self.dense = nn.Linear(dim, dim)

        self.scale = torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))

    def split_heads(self, x, batch_size):
        """Reshape ``[batch, seq, dim]`` to ``[batch, num_heads, seq, depth]``."""
        per_head = x.view(batch_size, -1, self.num_heads, self.depth)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, x):
        batch_size = x.size(0)

        # Each of q, k, v: (batch_size, num_heads, seq_len, depth)
        q, k, v = (
            self.split_heads(projection(x), batch_size)
            for projection in (self.wq, self.wk, self.wv)
        )

        # (batch_size, num_heads, seq_len, seq_len)
        logits = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        weights = F.softmax(logits, dim=-1)

        # (batch_size, num_heads, seq_len, depth) -> (batch_size, seq_len, dim)
        context = torch.matmul(weights, v).permute(0, 2, 1, 3).contiguous()
        merged = context.view(batch_size, -1, self.dim)

        return self.dense(merged)


In [16]:
class TransformerBlock(nn.Module):
    """Post-norm transformer encoder block over ``[batch, seq_len, dim]`` inputs.

    Args:
        dim: Feature size.
        num_heads: Number of attention heads.
        expand_ratio: Width multiplier of the feed-forward hidden layer.
        drop_rate: Dropout probability at the end of the feed-forward branch.
        activation: ``'swish'``, the name of an ``nn`` activation class (``'ReLU'``), or
            its lowercase functional spelling (``'relu'``, ``'gelu'``), matching the
            names ``Conv1DBlock`` accepts.

    Raises:
        AttributeError: If ``activation`` does not name a known activation layer.
    """

    def __init__(self, dim, num_heads, expand_ratio, drop_rate, activation):
        super().__init__()
        self.attention = MultiHeadSelfAttention(dim, num_heads)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.feed_forward = nn.Sequential(
            nn.Linear(dim, dim * expand_ratio),
            self._build_activation(activation),
            nn.Linear(dim * expand_ratio, dim),
            nn.Dropout(drop_rate)
        )

    @staticmethod
    def _build_activation(activation):
        """Instantiate the activation layer named by ``activation``."""
        if activation == 'swish':
            return Swish()

        # Exact class name first ('ReLU', 'GELU', ...).
        candidate = getattr(nn, activation, None)
        if isinstance(candidate, type) and issubclass(candidate, nn.Module):
            return candidate()

        # nn.Sequential needs a module, so map functional-style names onto the
        # nn class with the same spelling ignoring case ('relu' -> nn.ReLU).
        wanted = activation.lower()
        for name in dir(nn):
            layer_cls = getattr(nn, name)
            if name.lower() == wanted and isinstance(layer_cls, type) and issubclass(layer_cls, nn.Module):
                return layer_cls()

        raise AttributeError(f"torch.nn has no activation layer matching '{activation}'")

    def forward(self, x):
        # Self-attention sub-layer: residual add, then LayerNorm.
        attn_output = self.attention(x)
        x = self.norm1(x + attn_output)

        # Feed-forward sub-layer: residual add, then LayerNorm.
        ff_output = self.feed_forward(x)
        x = self.norm2(x + ff_output)

        return x


In [17]:
class ASLClassifierModel(nn.Module):
    """Sign classifier: 1x1 conv stem, conv/transformer blocks, pooled linear head.

    Args:
        num_classes: Number of output logits.
        channels: Size of dim 1 of the input; the stem is a ``Conv1d`` over that axis.
        dim: Internal feature width.

    Input / output of ``forward``:
        ``x`` is ``[batch, channels, length]``; the result is ``[batch, num_classes]``.
        ``length`` may vary between batches because the head averages over it.

    NOTE(review): the training cell builds the model with ``channels=384`` while the
    dataset example prints a ``[384, 3186]`` tensor, so the stem mixes the 384 frames
    as channels and the blocks run along the 3186-wide feature axis. Confirm that this
    axis assignment is intended.
    """

    def __init__(self, num_classes=NUMBER_OF_CLASSES, channels=96, dim=48):
        super().__init__()

        self.dim = dim
        self.seq_len = 384  # kept as an attribute for compatibility; forward() does not depend on it

        self.stem_conv = nn.Conv1d(channels, dim, kernel_size=1)
        self.stem_bn = nn.BatchNorm1d(dim)

        self.blocks = nn.Sequential(
            Conv1DBlock(dim, dim, 17, dilation_rate=1, drop_rate=0.1, expand_ratio=1, se_ratio=0.25, activation='swish'),
            TransformerBlock(dim, num_heads=2, expand_ratio=2, drop_rate=0.1, activation='swish'),
            Conv1DBlock(dim, dim, 17, dilation_rate=1, drop_rate=0.1, expand_ratio=1, se_ratio=0.25, activation='swish'),
        )

        # Head: widen each position to 2*dim, average over positions, dropout, classify.
        self.top_conv = nn.Linear(dim, dim * 2, bias=False)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.late_dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(dim * 2, num_classes)

    @staticmethod
    def _apply_block(block, x):
        """Run one block on channel-first ``x``, converting layout for transformer blocks."""
        if isinstance(block, TransformerBlock):
            # TransformerBlock works on [batch, length, dim]; swap axes in and out.
            return block(x.transpose(1, 2)).transpose(1, 2)
        return block(x)

    def forward_blocks(self, blocks, x):
        """Apply ``blocks`` in order, using activation checkpointing while training.

        Checkpointing re-runs each block's forward during backward to save memory; it is
        skipped in eval / no-grad mode where there is nothing to recompute.
        """
        from torch.utils.checkpoint import checkpoint

        recompute = self.training and torch.is_grad_enabled()
        for block in blocks:
            if recompute:
                x = checkpoint(self._apply_block, block, x, use_reentrant=False)
            else:
                x = self._apply_block(block, x)
        return x

    def forward(self, x):
        # Stem: [batch, channels, length] -> [batch, dim, length]
        x = self.stem_conv(x)
        x = self.stem_bn(x)
        x = torch.relu(x)

        x = self.forward_blocks(self.blocks, x)

        # Head operates per position: [batch, length, dim] -> [batch, length, 2*dim]
        x = self.top_conv(x.transpose(1, 2))

        # Average over positions: pool expects [batch, features, length].
        x = self.global_avg_pool(x.transpose(1, 2)).squeeze(-1)

        x = self.late_dropout(x)
        return self.classifier(x)

In [18]:
from sklearn.model_selection import KFold
from torch.utils.data import Subset, DataLoader
from torch.cuda.amp import GradScaler, autocast
import torch
import os

def get_data_loader(dataset, batch_size=4, shuffle=True, num_workers=4, pin_memory=True):
    """Wrap ``dataset`` in a ``DataLoader`` that batches with the notebook's ``collate_fn``."""
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        collate_fn=collate_fn,
    )
    return DataLoader(dataset, **loader_options)

def get_dataloaders(full_dataset, train_idx, valid_idx, batch_size):
    """Build the (train, validation) loaders for one split of ``full_dataset``.

    The training loader shuffles its subset; the validation loader keeps index order.
    """
    train_loader = get_data_loader(
        Subset(full_dataset, train_idx), batch_size=batch_size, shuffle=True
    )
    valid_loader = get_data_loader(
        Subset(full_dataset, valid_idx), batch_size=batch_size, shuffle=False
    )
    return train_loader, valid_loader

def train_one_epoch(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Batches are moved to the device the model's parameters live on, so the function
    works whether the model is on CPU or GPU.

    Args:
        model: Module to train (switched to train mode).
        dataloader: Sized iterable of ``(inputs, labels)`` tensor pairs.
        criterion: Loss callable ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    for inputs, labels in dataloader:
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # No per-batch torch.cuda.empty_cache(): it cannot release memory held by live
        # tensors and only forces the allocator to re-request blocks on the next step.

    return running_loss / len(dataloader)

def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without gradients and return the mean batch loss.

    Batches are moved to the device the model's parameters live on.

    Args:
        model: Module to evaluate (switched to eval mode).
        dataloader: Sized iterable of ``(inputs, labels)`` tensor pairs.
        criterion: Loss callable ``criterion(outputs, labels)``.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()

    return running_loss / len(dataloader)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience, fold):
    """Train ``model`` for one fold with best-checkpoint saving and early stopping.

    Training runs on whatever device the model's parameters are on. Mixed precision
    (autocast + gradient scaling) is enabled only when that device is CUDA.

    After every epoch the validation loss is computed with ``validate``. Whenever it
    improves, the weights are written to
    ``CHECKPOINT_DIR/best_model_checkpoint_fold_{fold}.pth`` as a dict with the keys
    ``'epoch'`` and ``'model_state_dict'``, the layout ``load_best_checkpoint`` reads.
    Training stops once ``patience`` consecutive epochs fail to improve, and the best
    weights are loaded back into ``model`` before returning.

    Args:
        model: Module to train.
        train_loader: Sized iterable of ``(inputs, labels)`` training batches.
        val_loader: Sized iterable of validation batches.
        criterion: Loss callable.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        fold: Fold identifier used in the checkpoint file name.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # GradScaler only accepts CUDA losses, so AMP is switched off on CPU.
    use_amp = device.type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    best_val_loss = float('inf')
    early_stopping_counter = 0

    os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # torch.save does not create directories
    best_checkpoint_path = os.path.join(CHECKPOINT_DIR, f'best_model_checkpoint_fold_{fold}.pth')

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            # With the scaler disabled these three calls reduce to backward() + step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)

        # Validation phase
        val_loss = validate(model, val_loader, criterion)

        print(f'Epoch {epoch}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}')

        # Keep only the best weights seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(
                {'epoch': epoch, 'model_state_dict': model.state_dict()},
                best_checkpoint_path,
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= patience:
            print("Early stopping triggered")
            break

    # Load the best model state for this fold
    load_best_checkpoint(model, fold)

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold

CHECKPOINT_DIR = '/content/drive/MyDrive/Colab Notebooks/DGMD E-14 Project/Datasets/asl-signs/CheckPoints/'

def save_checkpoint(model, optimizer, epoch, fold, path=CHECKPOINT_DIR):
    """Write a resumable checkpoint for ``fold`` at ``epoch``.

    The file ``checkpoint_fold_{fold}_epoch_{epoch}.pth`` is created inside ``path``
    (the directory is created if missing) and holds a dict with the keys ``'epoch'``,
    ``'model_state_dict'`` and ``'optimizer_state_dict'``.
    """
    os.makedirs(path, exist_ok=True)  # torch.save does not create missing directories
    checkpoint_path = os.path.join(path, f'checkpoint_fold_{fold}_epoch_{epoch}.pth')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, checkpoint_path)

def load_best_checkpoint(model, fold, path=CHECKPOINT_DIR):
    """Load ``best_model_checkpoint_fold_{fold}.pth`` from ``path`` into ``model``.

    Two file layouts are accepted: a dict with a ``'model_state_dict'`` entry, or a
    bare state_dict saved with ``torch.save(model.state_dict(), ...)``. If the file does
    not exist a message is printed and ``model`` is left unchanged.
    """
    checkpoint_path = os.path.join(path, f'best_model_checkpoint_fold_{fold}.pth')
    if not os.path.isfile(checkpoint_path):
        print(f"No best checkpoint found for fold {fold} at {checkpoint_path}")
        return

    # Load to CPU first; load_state_dict copies the values onto the model's own device.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)
    print(f"Loaded best checkpoint for fold {fold} from {checkpoint_path}")


# Configuration for the cross-validation run.
n_splits = 5
seed = 42
batch_size = 4  # Adjust as needed

# KFold split: shuffled with a fixed random_state, so the fold membership repeats across runs.
# NOTE(review): plain KFold ignores both `sign` and `participant_id`; confirm that
# un-stratified, un-grouped folds are what is wanted here.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Path to save the checkpoint
# NOTE(review): this variable is not read anywhere in the visible cells; the training
# code writes under CHECKPOINT_DIR instead.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/DGMD E-14 Project/ModelCheckpoints/ISLR_First_Place_Check_Point.pth"

# KFold training loop: a fresh model, loss and optimizer are created for every fold.
for fold, (train_idx, valid_idx) in enumerate(kfold.split(range(len(dataset)))):
    # The message is 1-based, while `fold` (used in checkpoint file names) is 0-based.
    print(f"Training on fold {fold+1}")

    train_loader, val_loader = get_dataloaders(dataset, train_idx, valid_idx, batch_size)

    # Initialize model, criterion, optimizer for each fold
    # channels=384 equals dim 0 of the [384, 3186] example tensor printed earlier.
    model = ASLClassifierModel(num_classes=NUMBER_OF_CLASSES, channels=384, dim=192)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop for the current fold
    train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=5, fold=fold)

    # Load best checkpoint after training is done
    # (train_model also calls load_best_checkpoint as its last step, so this is a second load.)
    load_best_checkpoint(model, fold)


Training on fold 1
