In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install mediapipe opencv-python numpy matplotlib scikit-learn tqdm


Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
import os
import cv2
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [4]:
import os
import json

dataset_root = r"D:\Data science\datasets\Sign language Videos"
videos_folder = os.path.join(dataset_root, "videos")
json_file = os.path.join(dataset_root, "WLASL_v0.3.json")

with open(json_file, "r") as f:
    data = json.load(f)

# Filter only 20 words and map to local video path
WORDS = [
    "book", "computer", "chair", "clothes", "candy",
    "drink", "go", "walk", "help", "eat",
    "deaf", "fine", "thin", "black", "big",
    "who", "no", "yes", "before", "all"
]

dataset = []
for entry in data:
    if entry["gloss"] in WORDS:
        for instance in entry["instances"]:
            # assuming video files are named by instance_id
            video_filename = f"{instance['instance_id']}.mp4"  # adjust if file extension differs
            video_path = os.path.join(videos_folder, video_filename)
            if os.path.exists(video_path):
                dataset.append({
                    "word": entry["gloss"],
                    "video_path": video_path
                })

print("Total filtered samples:", len(dataset))


Total filtered samples: 0


In [8]:
import os

videos_folder = r"D:\Data science\datasets\Sign language Videos\videos"
video_files = os.listdir(videos_folder)
print(video_files[:20])


['00335.mp4', '00336.mp4', '00338.mp4', '00339.mp4', '00341.mp4', '00376.mp4', '00377.mp4', '00381.mp4', '00382.mp4', '00384.mp4', '00414.mp4', '00415.mp4', '00416.mp4', '00421.mp4', '00426.mp4', '00430.mp4', '00431.mp4', '00433.mp4', '00435.mp4', '00583.mp4']


In [9]:
import json

json_file = r"D:\Data science\datasets\Sign language Videos\WLASL_v0.3.json"

with open(json_file, "r") as f:
    data = json.load(f)

print("Total words in JSON:", len(data))
print("First entry keys:", data[0].keys())


Total words in JSON: 2000
First entry keys: dict_keys(['gloss', 'instances'])


In [12]:
import json
import os

# Load JSON data
json_file = r"D:\Data science\datasets\Sign language Videos\WLASL_v0.3.json"
with open(json_file, "r") as f:
    data = json.load(f)

# Define the list of gloss labels you're interested in
target_glosses = ["book", "computer", "chair", "clothes", "candy", "drink", "go", "walk", "help", "eat", "deaf", "fine", "thin", "black", "big", "who", "no", "yes", "before", "all"]

# Initialize a list to store the filtered video paths
filtered_videos = []

# Iterate through the JSON data and match with local files
for entry in data:
    gloss = entry["gloss"]
    if gloss in target_glosses:
        for instance in entry["instances"]:
            video_id = instance["video_id"]
            video_filename = f"{video_id}.mp4"  # Assuming the video files are named using video_id
            video_path = os.path.join(videos_folder, video_filename)
            if os.path.exists(video_path):
                filtered_videos.append({
                    "gloss": gloss,
                    "video_path": video_path,
                    "split": instance["split"]
                })

print(f"Total filtered samples: {len(filtered_videos)}")


Total filtered samples: 223


In [13]:
from sklearn.preprocessing import LabelEncoder

labels = [item["gloss"] for item in filtered_videos]
le = LabelEncoder()
le.fit(labels)
encoded_labels = le.transform(labels)

for i, word in enumerate(le.classes_):
    print(f"{i}: {word}")


0: all
1: before
2: big
3: black
4: book
5: candy
6: chair
7: clothes
8: computer
9: deaf
10: drink
11: eat
12: fine
13: go
14: help
15: no
16: thin
17: walk
18: who
19: yes


In [14]:
from collections import Counter

# Count videos per word
word_counts = Counter([item["gloss"] for item in filtered_videos])

# Display counts
for word, count in word_counts.items():
    print(f"{word}: {count}")

# Optional: show as sorted list
print("\nSorted by word:")
for word in sorted(word_counts.keys()):
    print(f"{word}: {word_counts[word]}")


book: 6
drink: 15
computer: 14
before: 16
chair: 7
go: 15
clothes: 5
who: 14
candy: 13
deaf: 11
fine: 9
help: 14
no: 11
thin: 16
walk: 11
yes: 12
all: 8
black: 10
eat: 7
big: 9

Sorted by word:
all: 8
before: 16
big: 9
black: 10
book: 6
candy: 13
chair: 7
clothes: 5
computer: 14
deaf: 11
drink: 15
eat: 7
fine: 9
go: 15
help: 14
no: 11
thin: 16
walk: 11
who: 14
yes: 12


In [15]:
from collections import Counter

word_counts = Counter([item["gloss"] for item in filtered_videos])
print("Original counts per word:")
for word, count in word_counts.items():
    print(f"{word}: {count}")


Original counts per word:
book: 6
drink: 15
computer: 14
before: 16
chair: 7
go: 15
clothes: 5
who: 14
candy: 13
deaf: 11
fine: 9
help: 14
no: 11
thin: 16
walk: 11
yes: 12
all: 8
black: 10
eat: 7
big: 9


In [16]:
min_count = min(word_counts.values())
print("Using", min_count, "videos per word to balance")


Using 5 videos per word to balance


In [17]:
import random

balanced_videos = []
for word in word_counts.keys():
    # Get all videos for this word
    word_videos = [v for v in filtered_videos if v["gloss"] == word]
    # Randomly select min_count videos
    balanced_videos.extend(random.sample(word_videos, min_count))

print("Total balanced samples:", len(balanced_videos))


Total balanced samples: 100


In [18]:
from sklearn.preprocessing import LabelEncoder

labels = [item["gloss"] for item in balanced_videos]
le = LabelEncoder()
le.fit(labels)
encoded_labels = le.transform(labels)

for i, word in enumerate(le.classes_):
    print(f"{i}: {word}")


0: all
1: before
2: big
3: black
4: book
5: candy
6: chair
7: clothes
8: computer
9: deaf
10: drink
11: eat
12: fine
13: go
14: help
15: no
16: thin
17: walk
18: who
19: yes


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader
import mediapipe as mp
import cv2
import numpy as np

mp_hands = mp.solutions.hands

# Hand landmarks extraction
def extract_hand_landmarks(video_path, n_frames=32):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    if total_frames == 0:
        # If video cannot be read, return all zeros
        return np.zeros((n_frames, 42), dtype=np.float32)
    
    frame_indices = np.linspace(0, max(total_frames-1, 1), n_frames, dtype=int)
    
    hands_data = []
    with mp.solutions.hands.Hands(static_image_mode=False, max_num_hands=1) as hands:
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret or frame is None:
                hands_data.append([0]*42)  # failed frame
                continue
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)
            if results.multi_hand_landmarks:
                landmarks = results.multi_hand_landmarks[0]
                frame_landmarks = []
                for lm in landmarks.landmark:
                    frame_landmarks.append(lm.x)
                    frame_landmarks.append(lm.y)
                hands_data.append(frame_landmarks)
            else:
                hands_data.append([0]*42)  # no hand detected
    
    cap.release()
    
    # Final check: pad if still shorter
    while len(hands_data) < n_frames:
        hands_data.append([0]*42)
    
    hands_data = hands_data[:n_frames]  # truncate if longer
    return np.array(hands_data, dtype=np.float32)

# PyTorch Dataset
class SignDataset(Dataset):
    def __init__(self, videos, labels, n_frames=32):
        self.videos = videos
        self.labels = labels
        self.n_frames = n_frames
    def __len__(self):
        return len(self.videos)
    def __getitem__(self, idx):
        video_path = self.videos[idx]["video_path"]
        X = extract_hand_landmarks(video_path, self.n_frames)
        y = self.labels[idx]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)

dataset = SignDataset(balanced_videos, encoded_labels)


In [26]:
from torch.utils.data import random_split

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")


Train samples: 80, Validation samples: 20


In [27]:
import torch.nn as nn

class SignLSTM(nn.Module):
    def __init__(self, input_size=42, hidden_size=64, num_layers=2, num_classes=20):
        super(SignLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), 64).to(x.device)
        c0 = torch.zeros(2, x.size(0), 64).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # use last frame output
        return out

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SignLSTM(num_classes=len(le.classes_)).to(device)


In [28]:
class SignLSTM(nn.Module):
    def __init__(self, input_size=42, hidden_size=64, num_layers=2, num_classes=20):
        super(SignLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), 64).to(x.device)
        c0 = torch.zeros(2, x.size(0), 64).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # take last frame
        return out

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SignLSTM(num_classes=len(le.classes_)).to(device)


In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    
    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_val, y_val in val_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            outputs_val = model(X_val)
            _, predicted = torch.max(outputs_val, 1)
            total += y_val.size(0)
            correct += (predicted == y_val).sum().item()
    val_acc = 100 * correct / total
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_acc:.2f}%")


Epoch 1/10, Loss: 3.0102, Val Accuracy: 5.00%
Epoch 2/10, Loss: 2.9941, Val Accuracy: 5.00%
Epoch 3/10, Loss: 2.9880, Val Accuracy: 0.00%
Epoch 4/10, Loss: 2.9744, Val Accuracy: 0.00%
Epoch 5/10, Loss: 2.9619, Val Accuracy: 0.00%
Epoch 6/10, Loss: 2.9337, Val Accuracy: 0.00%
Epoch 7/10, Loss: 2.9195, Val Accuracy: 0.00%
Epoch 8/10, Loss: 2.9003, Val Accuracy: 0.00%
Epoch 9/10, Loss: 2.8512, Val Accuracy: 0.00%
Epoch 10/10, Loss: 2.7640, Val Accuracy: 0.00%


In [30]:
torch.save(model.state_dict(), "asl_lstm_model.pth")
print("Model weights saved as 'asl_lstm_model.pth'")


Model weights saved as 'asl_lstm_model.pth'


In [31]:
torch.save(model, "asl_lstm_full_model.pth")
print("Full model saved as 'asl_lstm_full_model.pth'")


Full model saved as 'asl_lstm_full_model.pth'


In [32]:
model = SignLSTM(num_classes=20)  # same architecture
model.load_state_dict(torch.load("asl_lstm_model.pth"))
model.eval()  # set to evaluation mode


SignLSTM(
  (lstm): LSTM(42, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=20, bias=True)
)