### Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


#### Install Pytorch

In [1]:
import torch

TORCH = torch.__version__.split('+')[0]
CUDA = 'cu' + torch.version.cuda.replace('.', '')

# Construct the installation command
install_command = f"pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html"

# Execute the command
!{install_command}

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu118/pyg_lib-0.3.1%2Bpt21cu118-cp310-cp310-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_scatter-2.1.2%2Bpt21cu118-cp310-cp310-linux_x86_64.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_sparse-0.6.18%2Bpt21cu118-cp310-cp310-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m122.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_cluster
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_cluster-1.6.3%2B

In [None]:
import os
import torch
import numpy as np

# Set a random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

### `ASLDatasetLoader` Class

The `ASLDatasetLoader` class is designed for loading and processing the ASL dataset. Given a directory, it reads sign language data from JSON files and constructs graph representations suitable for graph-based neural networks. Crucially, the class converts JSON data into PyTorch Geometric (PyG) `Data` objects comprising `x` (node features), `edge_index` (graph connectivity), and `y' (labels) attributes.

**Methods**:

- `_create_sign_to_label_map`: Generates a mapping from sign names to unique labels.

- `_read_file_data`: Reads data from a given JSON file.

- `_augment_data`: Implements data augmentation by applying random rotation, translation, and scaling to landmarks, which can enhance the model's robustness.

- `_create_graph_from_frame`: Constructs a PyG `Data` object from frame data, concentrating on hand and face landmarks. Edges are created between consecutive landmarks and between left and right hand landmarks. Additional features, like hand-to-face distances, are also computed.

- `get_dataset`: Assembles the dataset, optionally incorporating data augmentation. The function outputs a list of PyG `Data` objects ready for graph neural network processing.

In [None]:
import os
import torch
import json
import numpy as np
from torch.utils.data import Dataset

class ASLInMemoryDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        self.root = root
        self.transform = transform
        self.pre_transform = pre_transform
        self.processed_dir = os.path.join(self.root, 'processed_2')  # New processed directory
        self.loader = ASLDatasetLoader(self.processed_dir)

        # Check if processed data exists, if not, process and save the data
        if not os.path.exists(self.processed_dir) or not os.listdir(self.processed_dir):
            os.makedirs(self.processed_dir, exist_ok=True)
            self.data, self.labels = self.loader.load_data()
            torch.save((self.data, self.labels), os.path.join(self.processed_dir, 'processed_data_2.pt'))
        else:
            self.data, self.labels = torch.load(os.path.join(self.processed_dir, 'processed_data_2.pt'))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample, label

    def num_features(self):
        # Assumes all samples have the same shape
        return self.data[0].shape[1] * self.data[0].shape[2] if len(self.data) > 0 else 0

    def num_classes(self):
        return len(set(self.labels))

In [None]:
import os
import json
import numpy as np
import torch

class ASLDatasetLoader:
    def __init__(self, directory_path):
        self.directory_path = directory_path

    def load_data(self):
        data = []
        labels = []
        for filename in os.listdir(self.directory_path):
            file_path = os.path.join(self.directory_path, filename)
            with open(file_path, 'r') as file:
                content = json.load(file)
                for example in content["examples"]:
                    for frame in example["frames"]:
                        frame_data = self._process_frame_data(frame)
                        augmented_data = self._augment_data(frame_data)
                        data.append(augmented_data)
                        labels.append(content["sign"])

        return np.array(data), np.array(labels)

    def _process_frame_data(self, frame):
        # Assuming each frame has landmarks data in a specific format
        landmarks = np.array([[landmark['x'], landmark['y']] for landmark in frame['landmarks']])
        return landmarks

    def _augment_data(self, frame_data, rotation_range=10, translation_range=0.05, scaling_range=0.1, jittering_range=0.01, noise_scale=0.01, mirroring_prob=0.5):
        # Apply jittering
        jittering = np.random.uniform(-jittering_range, jittering_range, frame_data.shape)
        frame_data += jittering

        # Apply random rotation
        theta = np.radians(np.random.uniform(-rotation_range, rotation_range))
        rotation_matrix = np.array([
            [np.cos(theta), -np.sin(theta)],
            [np.sin(theta),  np.cos(theta)]
        ])
        frame_data = np.dot(frame_data - frame_data.mean(axis=0), rotation_matrix)

        # Apply random translation
        translation = np.random.uniform(-translation_range, translation_range, frame_data.shape[1])
        frame_data += translation

        # Apply random scaling
        scaling_factor = np.random.uniform(1 - scaling_range, 1 + scaling_range)
        frame_data = frame_data * scaling_factor

        # Apply noise injection
        noise = np.random.normal(0, noise_scale, frame_data.shape)
        frame_data += noise

        # Apply mirroring with a probability
        if np.random.rand() < mirroring_prob:
            frame_data[:, 0] = -frame_data[:, 0]  # Mirroring on x-axis

        return frame_data


### `ASLGraphClassifier` Class

The `ASLGraphClassifier`, features deeper GCN layers and additional channels to capture intricate data patterns potentially. It takes a PyG `Data` object as input, and its forward pass emits class logits.

**Methods**:

- `forward`: Details the forward pass, accepting a PyG `Data` object. Two GCN layers with subsequent batch normalization and dropout layers process the input. Post global max-pooling, two linear layers coupled with dropout ensure final classification, leading to log-softmax outputs.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Conv1DBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dropout_rate):
        super(Conv1DBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2)
        self.bn = nn.BatchNorm1d(out_channels)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.conv(x)
        x = F.relu(x)
        x = self.bn(x)
        x = self.dropout(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout_rate, expand_ratio=2):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.linear1 = nn.Linear(embed_dim, embed_dim * expand_ratio)
        self.linear2 = nn.Linear(embed_dim * expand_ratio, embed_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        identity = x
        x, _ = self.attention(x, x, x)
        x = x + identity
        x = self.norm1(x)

        identity = x
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        x = x + identity
        x = self.norm2(x)

        return x

class ASLClassifier(nn.Module):
    def __init__(self, max_len, num_channels, num_classes, dim=192):
        super(ASLClassifier, self).__init__()
        self.stem_conv = nn.Linear(max_len * num_channels, dim)
        self.stem_bn = nn.BatchNorm1d(dim)

        # Convolutional and Transformer blocks
        self.layer1 = Conv1DBlock(dim, dim, kernel_size=17, dropout_rate=0.2)
        self.transformer1 = TransformerBlock(dim, num_heads=8, dropout_rate=0.2)

        self.layer2 = Conv1DBlock(dim, dim, kernel_size=17, dropout_rate=0.2)
        self.transformer2 = TransformerBlock(dim, num_heads=8, dropout_rate=0.2)

        # if dim == 384:  # For larger model size
        #     # Add additional Conv1D and Transformer layers as needed

        self.top_conv = nn.Linear(dim, dim * 2)
        self.classifier = nn.Linear(dim * 2, num_classes)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.8)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        x = self.stem_conv(x)
        x = self.stem_bn(x)
        x = x.view(x.size(0), -1, x.size(1))  # Reshape for Conv1D

        x = self.layer1(x)
        x = self.transformer1(x)

        x = self.layer2(x)
        x = self.transformer2(x)

        # if x.size(1) == 384:  # Adjust for larger model size
        #     # Apply additional layers if needed

        x = self.top_conv(x)
        x = self.global_avg_pool(x).view(x.size(0), -1)  # Global average pooling
        x = self.dropout(x)
        x = self.classifier(x)

        return F.log_softmax(x, dim=1)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast

EPOCHS = 100
LEARNING_RATE = 0.0005
WEIGHT_DECAY = 5e-4
BATCH_SIZE = 7
GRADIENT_ACCUM_STEPS = 4  # For gradient accumulation
WORKERS = 2

def stratified_data_split(dataset, test_size=0.2):
    labels = [label for _, label in dataset]
    train_data, test_data = train_test_split(dataset, test_size=test_size, stratify=labels, random_state=42)
    return train_data, test_data

def validate(loader, model, device):
    model.eval()
    all_preds = []
    all_labels = []

    for data, labels in loader:
        data, labels = data.to(device), labels.to(device)
        with torch.no_grad():
            out = model(data)
        pred = out.argmax(dim=1)
        all_preds.append(pred.cpu())
        all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    accuracy = (all_preds == all_labels).float().mean().item()

    metrics = {
        'accuracy': accuracy,
        'precision': precision_score(all_labels.numpy(), all_preds.numpy(), average='macro', zero_division=1),
        'recall': recall_score(all_labels.numpy(), all_preds.numpy(), average='macro', zero_division=1),
        'f1': f1_score(all_labels.numpy(), all_preds.numpy(), average='macro', zero_division=1)
    }
    return metrics


def train(train_data, val_data, num_classes, input_channels, epochs=100, learning_rate=0.0005, weight_decay=5e-4, patience=5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ASLClassifier(input_channels=input_channels, num_classes=num_classes).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.7, patience=2, verbose=True)
    criterion = torch.nn.CrossEntropyLoss()

    # Check if CUDA is available for mixed precision training
    use_amp = torch.cuda.is_available()
    scaler = GradScaler() if use_amp else None

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=WORKERS)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=WORKERS)

    best_val_accuracy = 0.0
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch_idx, (data, labels) in enumerate(tqdm(train_loader, desc="Training")):
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                outputs = model(data)
                loss = criterion(outputs, labels)

            if use_amp:
                scaler.scale(loss).backward()
                if (batch_idx + 1) % GRADIENT_ACCUM_STEPS == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
            else:
                loss.backward()
                if (batch_idx + 1) % GRADIENT_ACCUM_STEPS == 0:
                    optimizer.step()
                    optimizer.zero_grad()

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(train_loader)
        train_accuracy = (np.array(all_preds) == np.array(all_labels)).mean()

        val_metrics = validate(val_loader, model, device)
        scheduler.step(val_metrics['accuracy'])

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}")

        if val_metrics['accuracy'] > best_val_accuracy:
            best_val_accuracy = val_metrics['accuracy']
            epochs_without_improvement = 0
            # Save the model if needed
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping")
            break

    return model, all_preds, all_labels, train_accuracy

In [None]:
directory_path = "/content/drive/MyDrive/Colab Notebooks/DGMD E-14 Project/Datasets/processed-40-500"
# Create an instance of the ASLInMemoryDataset
dataset = ASLInMemoryDataset(root=directory_path)

In [None]:
# Split the dataset into training and validation subsets
train_data, val_data = stratified_data_split(dataset)

ValueError: ignored

In [None]:
# Train the model using the datasets
model, all_preds, all_labels, accuracy = train(train_data, val_data, dataset.num_classes(), dataset.num_features(), epochs=EPOCHS, learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [None]:
dataset.sign_to_label()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Convert lists to numpy arrays for compatibility with sklearn
y_true = np.array(all_labels)
y_pred = np.array(all_preds)

# Calculate the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Visualize the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Ensure the class names are in the correct order for target_names
ordered_class_names = [name for name, num in sorted((dataset.sign_to_label()).items(), key=lambda item: item[1])]

# Per-Class Accuracy
class_accuracy = cm.diagonal() / cm.sum(axis=1)
for i, acc in enumerate(class_accuracy):
    class_name = ordered_class_names[i]
    print(f"Accuracy for class {i} ({class_name}): {acc*100:.2f}%")

# Detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=ordered_class_names, zero_division=1))

In [None]:
from sklearn.metrics import classification_report

def print_top_misclassified_classes(y_true, y_pred, sign_to_label, N=3, zero_division=1):
    """
    Prints the top N classes that get misclassified the most.

    Parameters:
    - y_true: Actual labels
    - y_pred: Predicted labels by the model
    - sign_to_label: Dictionary mapping class names to class numbers
    - N: Number of top misclassified classes to print
    - zero_division: Parameter for handling zero division in classification_report

    Returns:
    None
    """

    # Ensure the class names are in the correct order for target_names
    ordered_class_names = [name for name, num in sorted(sign_to_label.items(), key=lambda item: item[1])]

    # Generate and print classification report with class names
    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred, target_names=ordered_class_names, zero_division=zero_division))

    # Generate classification report as dict to find misclassified classes
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=zero_division)

    # Create a dictionary to store misclassification rates
    misclassification_rates = {}

    # Iterate through each class in the report
    for class_num, metrics in report.items():
        if class_num.isdigit():
            class_name = [key for key, value in sign_to_label.items() if value == int(class_num)][0]
            misclassification_rates[class_name] = 1 - metrics['recall']

    # Sort classes based on misclassification rate
    sorted_classes = sorted(misclassification_rates, key=misclassification_rates.get, reverse=True)

    # Print top N misclassified classes
    print(f"\nTop {N} misclassified classes:")
    for i in range(N):
        class_name = sorted_classes[i]
        print(f"{i+1}. {class_name} - Misclassification rate: {misclassification_rates[class_name]:.2f}")

In [None]:
print_top_misclassified_classes(y_true, y_pred, dataset.sign_to_label(), N=10, zero_division=1)