# preprocessing

In [None]:
import os
import cv2
import numpy as np
import torch
from torchvision import transforms


In [None]:
# Step 1: Frame Extraction
def extract_frames(video_path, frame_rate=1):
    """Decode a video with OpenCV and keep every ``frame_rate``-th frame.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        frame_rate: Sampling stride (not frames per second): a frame is kept
            when its zero-based index is a multiple of this value. Must be
            at least 1.

    Returns:
        A list of the frames returned by ``VideoCapture.read``, in decode
        order. The list is empty when the video cannot be opened or
        contains no frames.

    Raises:
        ValueError: If ``frame_rate`` is smaller than 1.
    """
    # Validate up front: a stride of 0 would otherwise surface as a
    # ZeroDivisionError from ``count % frame_rate`` in the middle of decoding.
    if frame_rate < 1:
        raise ValueError(f"frame_rate must be >= 1, got {frame_rate!r}")

    print(f"Extracting frames from {video_path}...")
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decode failure): stop reading.
                break
            if count % frame_rate == 0:
                frames.append(frame)
            count += 1
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()
    print(f"Extracted {len(frames)} frames from {video_path}.")
    return frames

# Step 2: Normalization & Resizing
def preprocess_frame(frame, size=(224, 224)):
    """Resize one frame and convert it to a normalised tensor.

    Args:
        frame: The image to process. A 3-channel ``numpy`` array is assumed
            to be in OpenCV's BGR channel order (as produced by
            ``cv2.VideoCapture.read``) and is converted to RGB first; any
            other input is handed to the transform pipeline unchanged.
        size: Target size forwarded to ``transforms.Resize``.

    Returns:
        The tensor produced by ``ToTensor`` followed by ``Normalize`` with
        the ImageNet per-channel mean and standard deviation.
    """
    # ToPILImage interprets a 3-channel array as RGB and the mean/std below
    # are given in RGB order, so BGR frames must be reordered before use;
    # otherwise the red and blue channels are normalised with each other's
    # statistics.
    if isinstance(frame, np.ndarray) and frame.ndim == 3 and frame.shape[2] == 3:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform(frame)

# Step 3: Augmentation (optional)
def augment_frames(frames):
    """Apply random rotation, horizontal flip and resized crop to a clip.

    When every frame is a tensor of the same shape, the frames are stacked
    and transformed in a single call so that one set of random parameters is
    shared by the whole clip (torchvision applies the same randomised
    transformation to all images of a batched tensor). Sampling new
    parameters for every frame would make consecutive frames jump around,
    which destroys the motion a temporal model is meant to learn.

    Inputs that cannot be stacked (non-tensor frames or mixed shapes) are
    transformed frame by frame, each with its own random parameters.

    Args:
        frames: Iterable of frames accepted by the torchvision transforms.

    Returns:
        A list with one augmented frame per input frame, in input order.
    """
    print("Augmenting frames...")
    transform = transforms.Compose([
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0))
    ])
    frames = list(frames)
    stackable = (
        bool(frames)
        and all(torch.is_tensor(frame) for frame in frames)
        and len({tuple(frame.shape) for frame in frames}) == 1
    )
    if stackable:
        # One transform call on the (T, ...) stack => identical random
        # parameters for every frame of the clip.
        clip = transform(torch.stack(frames))
        augmented_frames = list(clip.unbind(0))
    else:
        augmented_frames = [transform(frame) for frame in frames]
    print(f"Augmented {len(augmented_frames)} frames.")
    return augmented_frames

# Step 4: Label Assignment
def get_label_from_filename(filename):
    """Derive the class label from a ``"<id> <label>.<ext>"`` file name.

    Everything after the first run of whitespace, minus the final file
    extension, is the label. Multi-word labels (``"a5 good morning.mp4"``)
    and labels containing dots are therefore kept intact.

    Args:
        filename: Base name of the video file, e.g. ``"a0 butcher.mp4"``.

    Returns:
        The label string, e.g. ``"butcher"``.

    Raises:
        ValueError: If the name has no whitespace-separated label part.
    """
    # Strip only the last extension so dots inside the label survive.
    stem = os.path.splitext(filename)[0]
    # Split once: the first token is the clip id, the remainder is the label.
    parts = stem.split(None, 1)
    if len(parts) < 2 or not parts[1].strip():
        raise ValueError(
            f"Cannot derive a label from {filename!r}: "
            "expected a name like '<id> <label>.<ext>'"
        )
    label = parts[1].strip()
    print(f"Assigned label '{label}' to {filename}.")
    return label

# Main Preprocessing Function
def preprocess_videos(directory, augment=True):
    """Preprocess every ``.mp4`` video found directly inside ``directory``.

    Each video is decoded with ``extract_frames``, every frame is passed
    through ``preprocess_frame``, the clip is optionally augmented with
    ``augment_frames``, and its label is taken from the file name with
    ``get_label_from_filename``.

    Args:
        directory: Folder containing the video files.
        augment: When true (the default) the preprocessed frames are also
            run through ``augment_frames``. Pass ``False`` for validation or
            test data, where random augmentation is usually not wanted.

    Returns:
        A ``(video_data, labels)`` tuple of equally long lists:
        ``video_data[i]`` is the list of processed frames of the i-th video
        and ``labels[i]`` is its label string.
    """
    print(f"Preprocessing videos in directory: {directory}")
    video_data = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the saved
    # dataset reproducible from run to run and machine to machine.
    for filename in sorted(os.listdir(directory)):
        # Compare case-insensitively so files named "*.MP4" are not skipped.
        if not filename.lower().endswith(".mp4"):
            continue
        video_path = os.path.join(directory, filename)
        frames = [preprocess_frame(frame) for frame in extract_frames(video_path)]
        if augment:
            frames = augment_frames(frames)
        label = get_label_from_filename(filename)
        video_data.append(frames)
        labels.append(label)
    print("Preprocessing complete.")
    return video_data, labels



In [None]:
# Example usage: preprocess every video under the dataset folder.
video_directory = 'Data1/Words/'
video_data, labels = preprocess_videos(video_directory)

# Persist the frames first, then the labels, so later cells can reload
# them with torch.load instead of decoding the videos again.
for _payload, _target in ((video_data, 'video_data.pt'), (labels, 'labels.pt')):
    torch.save(_payload, _target)
print("Data saved.")


# output:

Preprocessing videos in directory: Data1/Words/
Extracting frames from Data1/Words/a0 butcher.mp4...
Extracted 58 frames from Data1/Words/a0 butcher.mp4.
Augmenting frames...
Augmented 58 frames.
Assigned label 'butcher' to a0 butcher.mp4.
Extracting frames from Data1/Words/a1 aadhar_card.mp4...
Extracted 94 frames from Data1/Words/a1 aadhar_card.mp4.
Augmenting frames...
Augmented 94 frames.
Assigned label 'aadhar_card' to a1 aadhar_card.mp4.
Extracting frames from Data1/Words/a3 dedication.mp4...
Extracted 61 frames from Data1/Words/a3 dedication.mp4.
Augmenting frames...
Augmented 61 frames.
Assigned label 'dedication' to a3 dedication.mp4.
Extracting frames from Data1/Words/a2 cinematography.mp4...
Extracted 72 frames from Data1/Words/a2 cinematography.mp4.
Augmenting frames...
Augmented 72 frames.
Assigned label 'cinematography' to a2 cinematography.mp4.
Preprocessing complete.
Data saved.

# 2. Model Architecture (Isolated Sign Recognition):
This model will be trained on your dictionary of isolated signs (one labelled video clip per word).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models


In [None]:
class SignRecognitionModel(nn.Module):
    """CNN + LSTM classifier for clips of isolated signs.

    Every frame is encoded by an ImageNet-pretrained ResNet-18 whose final
    fully connected layer is replaced by an identity, the per-frame feature
    vectors are fed to an LSTM, and the LSTM output at the last time step is
    mapped to class logits by a linear layer.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        hidden_size: Hidden size of the LSTM. Defaults to 256.
        num_layers: Number of stacked LSTM layers. Defaults to 2.
    """

    def __init__(self, num_classes, hidden_size=256, num_layers=2):
        super().__init__()
        # Use a pre-trained 2D-CNN (ResNet-18) as the feature extractor.
        # Newer torchvision releases deprecate ``pretrained=True`` in favour
        # of the ``weights`` argument; fall back to the old keyword when the
        # weights enum is not available.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            self.cnn = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.cnn = models.resnet18(pretrained=True)
        # Read the feature width from the layer being removed instead of
        # hard-coding it (it is 512 for ResNet-18).
        feature_dim = self.cnn.fc.in_features
        self.cnn.fc = nn.Identity()  # Remove the final fully connected layer

        # LSTM to capture temporal relationships
        self.lstm = nn.LSTM(
            input_size=feature_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Fully connected layer for classification
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of clips.

        Args:
            x: Tensor of shape ``(batch, seq_length, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If ``x`` is not 5-dimensional or has no frames.
        """
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape (batch, seq, C, H, W), "
                f"got {tuple(x.shape)}"
            )
        batch_size, seq_length, c, h, w = x.size()
        if seq_length == 0:
            raise ValueError("input clips must contain at least one frame")

        # Fold the time axis into the batch axis so all frames go through
        # the CNN in one call instead of a Python loop over time steps.
        # NOTE: in training mode BatchNorm statistics are then computed over
        # all frames of the batch at once rather than per time step.
        frame_features = self.cnn(x.reshape(batch_size * seq_length, c, h, w))
        cnn_out = frame_features.reshape(batch_size, seq_length, -1)

        lstm_out, _ = self.lstm(cnn_out)

        # Classify from the LSTM output at the last time step.
        return self.fc(lstm_out[:, -1, :])




In [None]:

# Example usage: build the classifier together with its loss and optimizer.
num_classes = 4  # Replace with the actual number of classes

# The loss does not depend on the model, so it can be created first.
criterion = nn.CrossEntropyLoss()
model = SignRecognitionModel(num_classes)

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Example training loop
def train_model(model, dataloader, criterion, optimizer, num_epochs=25):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Args:
        model: The module to optimise; it is switched to training mode at
            the start of every epoch.
        dataloader: Iterable yielding ``(inputs, labels)`` batches. Any
            iterable of batches works; a ``.dataset`` attribute is not
            required.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer bound to the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        A list with the average per-sample loss of each epoch, in order.

    Raises:
        ValueError: If an epoch yields no samples.
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_size = inputs.size(0)
            # Weight by batch size so a smaller final batch does not skew
            # the average (assumes the criterion returns a batch mean).
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        if samples_seen == 0:
            raise ValueError("dataloader produced no samples; cannot compute an epoch loss")

        # Divide by the samples actually processed rather than
        # len(dataloader.dataset): the two differ with drop_last or subset
        # samplers, and not every iterable of batches has a .dataset.
        epoch_loss = running_loss / samples_seen
        epoch_losses.append(epoch_loss)
        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}')
    return epoch_losses

# Assuming you have a DataLoader `dataloader` for your dataset
# train_model(model, dataloader, criterion, optimizer)