In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os

In [2]:
# NOTE(review): this cell only prints a fixed string; nothing later in the
# visible notebook depends on it. Looks like a leftover kernel smoke test —
# consider deleting it.
print("Khushi")

Khushi


In [3]:
import cv2

import mediapipe as mp
import numpy as np
import os



# Short aliases for the MediaPipe solution namespaces used by this notebook.
# Only attribute lookups happen here; detectors are constructed where needed.
# NOTE(review): mp_face_mesh is not referenced in the visible cells.
mp_hands, mp_drawing, mp_face_mesh = (
    mp.solutions.hands,
    mp.solutions.drawing_utils,
    mp.solutions.face_mesh,
)

In [4]:
# Function to blackout the background and keep only the hand movements
def blackout_background(video_path, output_path):
    """Re-encode a video so that only the detected hand landmarks remain visible.

    Each input frame is passed to MediaPipe Hands. The frame written to the
    output is an all-black image of the same shape on which the detected hand
    landmarks and ``HAND_CONNECTIONS`` are drawn; frames without a detection are
    written fully black.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the ``mp4v``-encoded video to write. The parent
            directory must already exist.

    Raises:
        IOError: If the input video cannot be opened or the output writer
            cannot be created.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    hands = None
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open input video: {video_path}")

        # Get video details
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would
        # change the playback speed of the output.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Source did not report a usable frame rate (0, negative or NaN).
            fps = 30.0
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')

        # Video writer to save the processed video
        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_path}")

        hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert the image to RGB as MediaPipe expects this format
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Get hand landmarks
            result = hands.process(image_rgb)

            # Create a blackout frame (all black)
            black_frame = np.zeros_like(frame)

            if result.multi_hand_landmarks:
                for hand_landmarks in result.multi_hand_landmarks:
                    # Draw the hand landmarks on the black frame
                    mp_drawing.draw_landmarks(
                        black_frame,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=2)
                    )

            # Write the black_frame with only hands visible to output video
            out.write(black_frame)
    finally:
        # Release everything that was acquired, even if a frame failed midway.
        cap.release()
        if out is not None:
            out.release()
        if hands is not None:
            hands.close()

In [7]:

# Example usage: Process the dictionary videos
# NOTE(review): both values are machine-specific absolute Windows paths; the
# notebook will not run elsewhere until they are edited. Consider deriving them
# from one configurable base directory.
dictionary_videos_path = "D:/Ace/7th sem files/1. REU/dataprep2"  # Input folder that is scanned for .mp4 files
blackout_videos_output_path = "D:/Ace/7th sem files/1. REU/dataprep2blackout"  # Folder that receives the processed videos


In [8]:

# Iterate over all dictionary videos and process each
# Make sure the destination folder exists before any writer is opened on it;
# the video writer is not expected to create missing directories.
os.makedirs(blackout_videos_output_path, exist_ok=True)

for video_file in os.listdir(dictionary_videos_path):
    if video_file.endswith(".mp4"):  # Process only .mp4 videos (adjust if necessary)
        video_path = os.path.join(dictionary_videos_path, video_file)
        # The output keeps the input's file name, so later label extraction
        # from the file name still works on the processed folder.
        output_path = os.path.join(blackout_videos_output_path, video_file)
        blackout_background(video_path, output_path)

# Report the configured folder rather than a hard-coded copy of it.
print(f"All dictionary videos processed and saved in the {blackout_videos_output_path}")


All dictionary videos processed and saved in the D:/Ace/7th sem files/1. REU/dataprep2blackout


In [9]:
# Step 1: Frame Extraction
def extract_frames(video_path, frame_rate=1):
    """Read a video and return every ``frame_rate``-th frame.

    Despite its name, ``frame_rate`` is a sampling stride: frame indices
    0, ``frame_rate``, 2 * ``frame_rate``, ... are kept (1 keeps every frame).

    Args:
        video_path: Path of the video to read.
        frame_rate: Positive stride between kept frames.

    Returns:
        A list of the kept frames exactly as returned by ``cap.read()``.
        The list is empty when no frame could be read.

    Raises:
        ValueError: If ``frame_rate`` is smaller than 1.
    """
    if frame_rate < 1:
        # Fail before touching the file instead of hitting a modulo-by-zero
        # halfway through the read loop.
        raise ValueError(f"frame_rate must be >= 1, got {frame_rate!r}")

    print(f"Extracting frames from {video_path}...")
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % frame_rate == 0:
                frames.append(frame)
            count += 1
    finally:
        # Always give the capture handle back, even if reading failed.
        cap.release()
    print(f"Extracted {len(frames)} frames from {video_path}.")
    return frames

# Step 2: Normalization & Resizing
def preprocess_frame(frame, size=(224, 224)):
    """Convert one OpenCV frame into a resized, normalised tensor.

    Args:
        frame: A 3-channel image in OpenCV's BGR channel order (as produced by
            ``extract_frames`` in this notebook).
        size: Target size handed to ``transforms.Resize``.

    Returns:
        The result of ToPILImage -> Resize -> ToTensor -> Normalize applied to
        the RGB version of ``frame``.
    """
    # The mean/std below are listed in RGB order, while OpenCV delivers BGR.
    # Swap the channels first so each statistic is applied to the right channel.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform(rgb_frame)

# Step 3: Augmentation (optional)
def augment_frames(frames):
    """Apply one random rotation / flip / resized-crop to the frames of a clip.

    When every frame is a tensor of the same shape, the frames are stacked and
    the random transform is applied to the whole stack in a single call, so all
    frames of the clip receive the same rotation, flip decision and crop.
    Drawing fresh random parameters per frame (the previous behaviour) makes
    consecutive frames of one video inconsistent with each other.

    Args:
        frames: Iterable of frames belonging to one video.

    Returns:
        A list with one augmented frame per input frame (empty for no input).
    """
    print("Augmenting frames...")
    transform = transforms.Compose([
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0))
    ])

    frames = list(frames)
    if not frames:
        augmented_frames = []
    elif all(isinstance(f, torch.Tensor) and f.shape == frames[0].shape for f in frames):
        # A batch of tensor images gets a single set of random parameters.
        clip = torch.stack(frames)
        augmented_frames = list(transform(clip).unbind(0))
    else:
        # NOTE(review): inputs that cannot be stacked (non-tensors or mixed
        # shapes) are still transformed one by one, i.e. with independent
        # random parameters per frame.
        augmented_frames = [transform(frame) for frame in frames]

    print(f"Augmented {len(augmented_frames)} frames.")
    return augmented_frames

# Step 4: Label Assignment
# def get_label_from_filename(filename):
#     label = filename.split()[1].split('.')[0]
#     print(f"Assigned label '{label}' to {filename}.")
#     return label

def get_label_from_filename(filename):
    """Derive a label from a video file name by dropping its extension.

    Only the final extension is removed, so a name that itself contains dots
    (e.g. ``"dr.strange.mp4"``) keeps everything before the last dot.

    Args:
        filename: File name such as ``"aadhar_card.mp4"``.

    Returns:
        The file name without its extension.
    """
    # splitext strips just the last suffix; split('.')[0] would cut the label
    # at the first dot.
    label = os.path.splitext(filename)[0]
    print(f"Assigned label '{label}' to {filename}.")
    return label


# Main Preprocessing Function
def preprocess_videos(directory):
    """Run the full per-video pipeline over every ``.mp4`` file in a folder.

    For each matching file the frames are extracted, preprocessed one by one,
    augmented, and a label is derived from the file name.

    Args:
        directory: Folder to scan (non-recursively) for ``.mp4`` files.

    Returns:
        ``(video_data, labels)``: two parallel lists holding, per video, the
        list of processed frames and the label, in directory-listing order.
    """
    print(f"Preprocessing videos in directory: {directory}")
    clips, clip_labels = [], []
    for entry in os.listdir(directory):
        if not entry.endswith(".mp4"):
            continue
        raw_frames = extract_frames(os.path.join(directory, entry))
        prepared = [preprocess_frame(raw) for raw in raw_frames]
        prepared = augment_frames(prepared)  # Optional
        clip_label = get_label_from_filename(entry)
        clips.append(prepared)
        clip_labels.append(clip_label)
    print("Preprocessing complete.")
    return clips, clip_labels


import torch
from torchvision import transforms


# Example usage
# Preprocess the blacked-out videos written by the earlier batch step.
video_directory = blackout_videos_output_path
# video_data: one list of processed frames per video; labels: the parallel
# list of label strings derived from the file names.
video_data, labels = preprocess_videos(video_directory)

# Save preprocessed data for later use
# Both files are written to the current working directory and are read back
# further down in the notebook.
torch.save(video_data, 'video_data.pt')
torch.save(labels, 'labels.pt')
print("Data saved.")


Preprocessing videos in directory: D:/Ace/7th sem files/1. REU/dataprep2blackout
Extracting frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\aadhar_card.mp4...
Extracted 94 frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\aadhar_card.mp4.
Augmenting frames...
Augmented 94 frames.
Assigned label 'aadhar_card' to aadhar_card.mp4.
Extracting frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\accelerate.mp4...
Extracted 49 frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\accelerate.mp4.
Augmenting frames...
Augmented 49 frames.
Assigned label 'accelerate' to accelerate.mp4.
Extracting frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\accessibility_grants 2.mp4...
Extracted 64 frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\accessibility_grants 2.mp4.
Augmenting frames...
Augmented 64 frames.
Assigned label 'accessibility_grants 2' to accessibility_grants 2.mp4.
Extracting frames from D:/Ace/7th sem files/1. REU/dataprep2blackout\accessibi

In [12]:
# labelling

# Create a label mapping
# Sort the distinct labels so the label -> index assignment is identical on
# every run. Iterating a set of strings directly yields an order that can
# change between interpreter sessions, which would silently remap class ids.
unique_labels = sorted(set(labels))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# Convert string labels to numerical labels
numerical_labels = [label_to_index[label] for label in labels]

import torch
from torch.utils.data import Dataset

class SignDataset(Dataset):
    """Dataset of fixed-length frame sequences with integer class labels.

    Args:
        video_data: Sequence of videos, each a list of frame tensors.
        labels: Integer label per video, parallel to ``video_data``.
        transform: Optional callable applied to every frame (padding frames
            included) before stacking.
        max_frames: Number of frames every returned video is padded or
            truncated to.
    """

    def __init__(self, video_data, labels, transform=None, max_frames=100):
        self.video_data = video_data
        self.labels = labels
        self.transform = transform
        self.max_frames = max_frames

    def __len__(self):
        """Return the number of videos."""
        return len(self.video_data)

    def __getitem__(self, idx):
        """Return ``(video, label)`` for the video at ``idx``.

        ``video`` is the stack of exactly ``max_frames`` frames; shorter videos
        are padded at the end with zero frames shaped like the first frame.
        ``label`` is a ``torch.long`` scalar tensor.

        Raises:
            IndexError: If the video has no frames but padding is required
                (there is no frame to take the padding shape from).
        """
        video = self.video_data[idx]
        label = self.labels[idx]

        # Pad or truncate video to max_frames.
        # Always build a new list: extending the stored list in place would
        # permanently append the padding to self.video_data[idx].
        missing = self.max_frames - len(video)
        if missing > 0:
            video = list(video) + [torch.zeros_like(video[0])] * missing
        else:
            video = list(video[:self.max_frames])

        if self.transform:
            video = [self.transform(frame) for frame in video]
        video = torch.stack(video)
        label = torch.tensor(label, dtype=torch.long)  # Ensure label is a tensor of type long
        return video, label
    
# Model architecture

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

class SignRecognitionModel(nn.Module):
    """Per-frame ResNet-18 features followed by an LSTM and a linear classifier.

    Args:
        num_classes: Number of output classes of the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Use a pre-trained 2D-CNN (ResNet-18) as the feature extractor.
        # The explicit weights enum replaces the deprecated `pretrained=True`
        # flag and selects the same ImageNet checkpoint.
        self.cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        self.cnn.fc = nn.Identity()  # Remove the final fully connected layer

        # LSTM to capture temporal relationships
        self.lstm = nn.LSTM(input_size=512, hidden_size=256, num_layers=2, batch_first=True)

        # Fully connected layer for classification
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: 5-D tensor unpacked as (batch, sequence, channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes) computed from the LSTM output
            at the last time step.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(f"expected a 5-D input (batch, seq, c, h, w), got {x.dim()}-D")
        seq_length = x.size(1)

        # Pass each frame through the CNN. No per-frame printing here: forward
        # runs for every batch, so progress output would flood the notebook.
        cnn_out = [self.cnn(x[:, t, :, :, :]) for t in range(seq_length)]

        # Stack the CNN outputs along the time axis and pass through LSTM
        cnn_out = torch.stack(cnn_out, dim=1)
        lstm_out, _ = self.lstm(cnn_out)

        # Take the output of the last LSTM cell
        lstm_out = lstm_out[:, -1, :]

        # Pass through the fully connected layer
        return self.fc(lstm_out)


# query processing

import os
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load preprocessed data and labels
# Reads the two files from the current working directory and rebinds the
# module-level names `video_data` and `labels`; `exact_match` below looks up
# `video_data` as a global.
# weights_only=True restricts what torch.load is allowed to unpickle.
video_data = torch.load('video_data.pt', weights_only=True)
labels = torch.load('labels.pt', weights_only=True)

print("Data and labels loaded.")

import os

# Function to perform exact match
def exact_match(query, labels):
    """Return the stored video whose label equals ``query`` exactly.

    The position of the first occurrence of ``query`` in ``labels`` is used to
    index the module-level ``video_data``.

    Args:
        query: Label to look for.
        labels: Labels parallel to ``video_data``.

    Returns:
        The matching ``video_data`` entry, or ``None`` if ``query`` is absent.
    """
    if query not in labels:
        return None
    position = labels.index(query)
    return video_data[position]




# Function to get BERT embeddings for a word
def get_bert_embedding(word):
    """Return the mean-pooled last hidden state of ``bert_model`` for ``word``.

    Args:
        word: Text passed to the module-level ``tokenizer``.

    Returns:
        NumPy array holding ``last_hidden_state`` averaged over dimension 1.

    NOTE(review): ``tokenizer`` and ``bert_model`` are read as globals but are
    not assigned anywhere in the visible notebook cells; they must be created
    before this function is called.
    """
    inputs = tokenizer(word, return_tensors='pt')
    # Pure inference: skip autograd bookkeeping while running the model.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Function to find the most similar word using BERT
def similar_word_search(query, labels, threshold=0.7):
    """Return the label whose embedding is most cosine-similar to ``query``.

    Args:
        query: Word to look up.
        labels: Candidate labels.
        threshold: The best similarity must be strictly greater than this
            value for a label to be returned.

    Returns:
        The best-matching label, or ``None`` when ``labels`` is empty or no
        label exceeds ``threshold``.
    """
    if len(labels) == 0:
        # Nothing to compare against; argmax over an empty list would raise.
        return None

    query_embedding = get_bert_embedding(query)
    # One row per label, so a single cosine_similarity call covers all of them.
    label_matrix = np.vstack([get_bert_embedding(label) for label in labels])
    similarities = cosine_similarity(query_embedding, label_matrix)[0]

    best_index = int(np.argmax(similarities))
    if similarities[best_index] > threshold:
        return labels[best_index]
    return None



import torch

# Load the labels from the .pt file
labels_path = "labels.pt"  # Replace with the actual path to labels.pt
# Load with weights_only=True, like the other torch.load calls in this
# notebook, so the file is not unpickled as arbitrary Python objects.
labels = torch.load(labels_path, weights_only=True)

# Print the loaded labels
print(labels)



# # Example usage
# query = "butcher"
# labels = ["chef", "cook", "carpenter", "butcher"]  # Example dictionary of words

query = "butcher"

# exact_match returns None when the label is unknown. Compare against None
# explicitly: a truthiness test would also treat a label whose stored frame
# list is empty as "not found".
video_frames = exact_match(query, labels)

if video_frames is None:
    print(f"Exact match not found for '{query}'. Searching for similar words...")
    similar_word = similar_word_search(query, labels)
    if similar_word:
        # The similar word is itself one of the labels, so look its video up
        # through the same exact-match path.
        video_frames = exact_match(similar_word, labels)
        print(f"Similar word found: '{similar_word}', corresponding video frames retrieved.")
    else:
        print("No similar word found.")
else:
    print(f"Exact match found for '{query}', corresponding video frames retrieved.")


ModuleNotFoundError: No module named 'transformers'