In [1]:
import os 
import sys
import json 
import cv2
import numpy as np
import mediapipe as mp
import pandas as pd 

In [7]:
with open('data/WLASL_v0.3.json') as f:
    data = json.load(f)

In [8]:
# importing video data and their respecive labels
df = pd.read_csv('WLASL_v0.3.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'WLASL_v0.3.csv'

In [9]:
# Create a dictionary to map video IDs to labels
video_to_label = {}

for entry in data:
    label = entry['gloss']  # The word or label
    for instance in entry['instances']:
        video_id = instance['video_id']
        video_to_label[video_id] = label

# Print the first few mappings to verify
for video_id, label in list(video_to_label.items())[:5]:
    print(f"Video ID: {video_id}, Label: {label}")

Video ID: 69241, Label: book
Video ID: 65225, Label: book
Video ID: 68011, Label: book
Video ID: 68208, Label: book
Video ID: 68012, Label: book


In [14]:
len(video_to_label) # 21083 videos and their respective labels according to the json file 

21083

In [15]:
# going to the example videos folder and checking if the video files are there or not 

# Path to the videos folder
videos_folder = 'example_videos'

# Verify that each video ID in the mapping has a corresponding video file
for video_id in video_to_label.keys():
    video_path = os.path.join(videos_folder, f"{video_id}.mp4")
    if not os.path.exists(video_path):
        print(f"Warning: Video file for ID {video_id} does not exist.")
    else:
        print(f"Video file for ID {video_id} exists and is correctly mapped to label {video_to_label[video_id]}.")



In [16]:
example_valid_video_to_label = {video_id: label for video_id, label in video_to_label.items() if os.path.exists(os.path.join(videos_folder, f"{video_id}.mp4"))}

# Base directory to store extracted frames
base_output_dir = 'example_extracted_frames'

# Create directories for each gloss
for label in set(example_valid_video_to_label.values()):
    os.makedirs(os.path.join(base_output_dir, label), exist_ok=True)

In [17]:
# Function to extract frames from a video
def extract_frames(video_path, output_dir):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
            
        # Save frame
        frame_path = os.path.join(output_dir, f'frame_{frame_count:04d}.jpg')
        cv2.imwrite(frame_path, frame)
        frame_count += 1
        
    cap.release()

# Process each video
for video_id, label in example_valid_video_to_label.items():
    video_path = os.path.join(videos_folder, f"{video_id}.mp4")
    output_dir = os.path.join(base_output_dir, label, video_id)
    
    # Create directory for this specific video's frames
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Extracting frames from video {video_id} with label {label}")
    extract_frames(video_path, output_dir)


## Loading The Dataset we created

In [28]:
from PIL import Image
import os
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from sklearn.model_selection import train_test_split

class VideoDataset(Dataset):
    """
    This dataset expects that the videos are saved as folders of JPEG frames.
    
    The directory structure is assumed to be:
      root_dir/
          gloss_1/
              video_id_1/
                  frame001.jpg
                  frame002.jpg
                  ...
              video_id_2/
                  ...
          gloss_2/
              ...
    
    Args:
      root_dir (str): Path to the root directory.
      label2idx (dict): Mapping from gloss (str) to integer label.
      num_frames (int): Number of frames to sample per video.
      transform (callable, optional): Transformations to apply on the images.
      split (str, optional): "train" or "val". If provided splits the data (80% training, 20% validation).
    """
    def __init__(self, root_dir, label2idx, num_frames=16, transform=None, split=None):
        self.root_dir = root_dir
        self.label2idx = label2idx
        self.num_frames = num_frames
        self.transform = transform
        self.videos = []  # list of dictionaries: {'frames': list_of_frame_paths, 'label': int}
        
        # Walk through the directory structure and collect video data.
        for label in os.listdir(root_dir):
            label_path = os.path.join(root_dir, label)
            if os.path.isdir(label_path):
                for video in os.listdir(label_path):
                    video_path = os.path.join(label_path, video)
                    if os.path.isdir(video_path):
                        # Gather all JPEG frame paths and sort to maintain temporal order.
                        frame_files = sorted([
                            os.path.join(video_path, f)
                            for f in os.listdir(video_path)
                            if f.lower().endswith(('.jpeg', '.jpg'))
                        ])
                        # Skip videos with fewer frames than required.
                        if len(frame_files) < self.num_frames:
                            continue
                        self.videos.append({
                            'frames': frame_files,
                            'label': self.label2idx[label]
                        })
        
        # If split is specified, partition the videos list into train and validation sets.
        if split is not None:
            train_videos, val_videos = train_test_split(self.videos, test_size=0.2, random_state=42)
            if split == "train":
                self.videos = train_videos
            elif split == "val":
                self.videos = val_videos

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        video_info = self.videos[idx]
        frames = video_info['frames']
        
        # Uniformly sample exactly num_frames from the total available frames.
        total_frames = len(frames)
        interval = total_frames // self.num_frames
        selected_frames = [frames[i * interval] for i in range(self.num_frames)]
        
        video_tensor = []
        for frame_path in selected_frames:
            image = Image.open(frame_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            video_tensor.append(image)
        
        # Stack the frames to create a tensor of shape (num_frames, C, H, W)
        video_tensor = torch.stack(video_tensor)
        label = video_info['label']
        return video_tensor, label

In [29]:
import torch
import torch.nn as nn
import torchvision.models as models

class LRCN(nn.Module):
    """
    LRCN model that first extracts spatial features from each frame using a CNN
    (here, a pretrained ResNet18) and then models the temporal information with an LSTM.
    
    Args:
      num_classes (int): Number of output classes (unique glosses).
      hidden_size (int): Hidden layer size of the LSTM.
      num_layers (int): Number of LSTM layers.
      pretrained (bool): Whether to use pre-trained weights for the CNN.
    """
    def __init__(self, num_classes, hidden_size=256, num_layers=1, pretrained=True):
        super(LRCN, self).__init__()
        # Use ResNet18 as the CNN feature extractor
        resnet = models.resnet18(pretrained=pretrained)
        # Remove the final fully connected layer.
        modules = list(resnet.children())[:-1]  
        self.cnn = nn.Sequential(*modules)
        self.feature_size = resnet.fc.in_features  # typically 512 for resnet18
        
        # LSTM to process sequential features.
        self.lstm = nn.LSTM(input_size=self.feature_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        """
        Input:
          x: Tensor of shape (batch, seq_len, C, H, W)
        Output:
          out: Tensor of shape (batch, num_classes) with raw scores.
        """
        batch_size, seq_len, C, H, W = x.size()
        # Merge batch and sequence dimensions to process all frames through the CNN at once.
        x = x.view(batch_size * seq_len, C, H, W)
        features = self.cnn(x)  # shape becomes (batch * seq_len, feature_size, 1, 1)
        features = features.view(batch_size, seq_len, self.feature_size)
        
        # Pass the sequence of features through the LSTM.
        lstm_out, (hn, cn) = self.lstm(features)
        # Use the last output of the LSTM for classification.
        out = self.fc(lstm_out[:, -1, :])
        return out 

In [38]:
import os
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from datasets.video_dataset import VideoDataset
from models.lrcn import LRCN

# -------------------------
# Hyperparameters & Settings
# -------------------------
data_root = 'example_extracted_frames'  # Root folder where frames are stored
num_frames = 16           # Number of frames to sample per video
batch_size = 8
num_epochs = 10
learning_rate = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------------
# Creating Label Mapping
# -------------------------
# Each folder in data_root represents a gloss. For repeatability, sort them.
labels = sorted([d for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))])
label2idx = {label: idx for idx, label in enumerate(labels)}
num_classes = len(label2idx)
print("Label mapping:", label2idx)

# -------------------------
# Define Image Transforms
# -------------------------
# We resize images to 224x224 (common for ResNet) and normalize with ImageNet mean/std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                         std=[0.229, 0.224, 0.225]),
])

if __name__ == '__main__':
    # -------------------------
    # Dataset and DataLoader
    # -------------------------
    dataset = VideoDataset(root_dir=data_root, label2idx=label2idx,
                           num_frames=num_frames, transform=transform)
    # For debugging on Windows, start by setting num_workers=0.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    
    # -------------------------
    # Instantiate the Model, Loss, and Optimizer
    # -------------------------
    model = LRCN(num_classes=num_classes, hidden_size=256, num_layers=1, pretrained=True).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # -------------------------
    # Training Loop
    # -------------------------
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for videos, labels in dataloader:
            videos = videos.to(device)
            labels = labels.to(device)
    
            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
            running_loss += loss.item() * videos.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        epoch_loss = running_loss / len(dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}")
    
    torch.save(model.state_dict(), "lrcn_model.pth")

Using device: cuda
Label mapping: {'abdomen': 0, 'able': 1, 'about': 2}




Epoch 1/10, Loss: 1.1054, Accuracy: 0.4545
Epoch 2/10, Loss: 0.8563, Accuracy: 0.7273
Epoch 3/10, Loss: 0.6317, Accuracy: 0.9091
Epoch 4/10, Loss: 0.5184, Accuracy: 1.0000
Epoch 5/10, Loss: 0.3957, Accuracy: 1.0000
Epoch 6/10, Loss: 0.3179, Accuracy: 1.0000
Epoch 7/10, Loss: 0.3017, Accuracy: 0.9091
Epoch 8/10, Loss: 0.3163, Accuracy: 0.9091
Epoch 9/10, Loss: 0.1726, Accuracy: 1.0000
Epoch 10/10, Loss: 0.2504, Accuracy: 0.9091


In [36]:
import os
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from torchvision import transforms
from datasets.video_dataset import VideoDataset
from models.lrcn import LRCN
from sklearn.metrics import confusion_matrix, classification_report

# -------------------------
# Settings and Hyperparameters
# -------------------------
data_root = 'example_extracted_frames'  # No train/val split in this example; you may want to create one.
num_frames = 16
batch_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------
# Label Mapping
# -------------------------
# Each folder in data_root represents a unique gloss/label.
labels = sorted([d for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))])
label2idx = {label: idx for idx, label in enumerate(labels)}
# For reporting, create a reverse mapping: integer -> label name.
idx2label = {idx: label for label, idx in label2idx.items()}
num_classes = len(label2idx)
print("Label mapping:", label2idx)

# -------------------------
# Define Transforms (same as training)
# -------------------------
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Helps match input size for ResNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                         std=[0.229, 0.224, 0.225]),
])

# -------------------------
# Create the Dataset and DataLoader for Evaluation
# -------------------------
dataset = VideoDataset(root_dir=data_root, label2idx=label2idx,
                       num_frames=num_frames, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# -------------------------
# Load the Trained Model
# -------------------------
model = LRCN(num_classes=num_classes, hidden_size=256, num_layers=1, pretrained=True).to(device)
model.load_state_dict(torch.load("lrcn_model.pth", map_location=device))
model.eval()  # IMPORTANT: Set the model to evaluation mode

criterion = nn.CrossEntropyLoss()

# -------------------------
# Evaluation Function
# -------------------------
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for videos, labels in dataloader:
            videos = videos.to(device)
            labels = labels.to(device)
            
            outputs = model(videos)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * videos.size(0)
            
            _, preds = torch.max(outputs, 1)
            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    return avg_loss, accuracy, all_preds, all_labels

# -------------------------
# Run Evaluation
# -------------------------
avg_loss, accuracy, all_preds, all_labels = evaluate(model, dataloader, criterion, device)
print(f"Evaluation Loss: {avg_loss:.4f}")
print(f"Evaluation Accuracy: {accuracy:.4f}")

# -------------------------
# Further Analysis with Confusion Matrix & Classification Report
# -------------------------
cm = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(cm)

report = classification_report(all_labels, all_preds, target_names=labels)
print("Classification Report:")
print(report)

Label mapping: {'abdomen': 0, 'able': 1, 'about': 2}


  model.load_state_dict(torch.load("lrcn_model.pth", map_location=device))


Evaluation Loss: 0.0845
Evaluation Accuracy: 1.0000
Confusion Matrix:
[[5 0 0]
 [0 5 0]
 [0 0 1]]
Classification Report:
              precision    recall  f1-score   support

     abdomen       1.00      1.00      1.00         5
        able       1.00      1.00      1.00         5
       about       1.00      1.00      1.00         1

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

