<a href="https://colab.research.google.com/github/Khushi-MA/trialreu1/blob/main/sept7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Khushi-MA/trialreu1.git

Cloning into 'trialreu1'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 18 (delta 0), reused 3 (delta 0), pack-reused 15 (from 1)[K
Receiving objects: 100% (18/18), 78.05 MiB | 29.62 MiB/s, done.


In [33]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# %cd trialreu1
!pwd
!ls

/content/trialreu1
Data1  labels.pt  main.ipynb  README.md  video_data.pt


In [34]:
import os
import cv2
import numpy as np
import torch
from torchvision import transforms


In [35]:
# Step 1: Frame Extraction
def extract_frames(video_path, frame_rate=1):
    print(f"Extracting frames from {video_path}...")
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_rate == 0:
            frames.append(frame)
        count += 1
    cap.release()
    print(f"Extracted {len(frames)} frames from {video_path}.")
    return frames

# Step 2: Normalization & Resizing
def preprocess_frame(frame, size=(224, 224)):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    frame = transform(frame)
    # print("Frame preprocessed.")
    return frame

# Step 3: Augmentation (optional)
def augment_frames(frames):
    print("Augmenting frames...")
    transform = transforms.Compose([
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0))
    ])
    augmented_frames = []
    for frame in frames:
        frame = transform(frame)
        augmented_frames.append(frame)
    print(f"Augmented {len(augmented_frames)} frames.")
    return augmented_frames

# Step 4: Label Assignment
def get_label_from_filename(filename):
    label = filename.split()[1].split('.')[0]
    print(f"Assigned label '{label}' to {filename}.")
    return label

# Main Preprocessing Function
def preprocess_videos(directory):
    print(f"Preprocessing videos in directory: {directory}")
    video_data = []
    labels = []
    for filename in os.listdir(directory):
        if filename.endswith(".mp4"):
            video_path = os.path.join(directory, filename)
            frames = extract_frames(video_path)
            frames = [preprocess_frame(frame) for frame in frames]
            frames = augment_frames(frames)  # Optional
            label = get_label_from_filename(filename)
            video_data.append(frames)
            labels.append(label)
    print("Preprocessing complete.")
    return video_data, labels



In [36]:
# Example usage
video_directory = 'Data1/Words/'
video_data, labels = preprocess_videos(video_directory)

# Save preprocessed data for later use
torch.save(video_data, 'video_data.pt')
torch.save(labels, 'labels.pt')
print("Data saved.")

Preprocessing videos in directory: Data1/Words/
Extracting frames from Data1/Words/a0 butcher.mp4...
Extracted 58 frames from Data1/Words/a0 butcher.mp4.
Augmenting frames...
Augmented 58 frames.
Assigned label 'butcher' to a0 butcher.mp4.
Extracting frames from Data1/Words/a1 aadhar_card.mp4...
Extracted 94 frames from Data1/Words/a1 aadhar_card.mp4.
Augmenting frames...
Augmented 94 frames.
Assigned label 'aadhar_card' to a1 aadhar_card.mp4.
Extracting frames from Data1/Words/a3 dedication.mp4...
Extracted 61 frames from Data1/Words/a3 dedication.mp4.
Augmenting frames...
Augmented 61 frames.
Assigned label 'dedication' to a3 dedication.mp4.
Extracting frames from Data1/Words/a2 cinematography.mp4...
Extracted 72 frames from Data1/Words/a2 cinematography.mp4.
Augmenting frames...
Augmented 72 frames.
Assigned label 'cinematography' to a2 cinematography.mp4.
Preprocessing complete.
Data saved.


# 2. Model Architecture (Isolated Sign Recognition):
This model will be trained on your isolated dictionary

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models


In [40]:
class SignRecognitionModel(nn.Module):
    def __init__(self, num_classes):
        super(SignRecognitionModel, self).__init__()
        # Use a pre-trained 2D-CNN (e.g., ResNet) as the feature extractor
        self.cnn = models.resnet18(pretrained=True)
        self.cnn.fc = nn.Identity()  # Remove the final fully connected layer

        # LSTM to capture temporal relationships
        self.lstm = nn.LSTM(input_size=512, hidden_size=256, num_layers=2, batch_first=True)

        # Fully connected layer for classification
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        print("Forward pass started.")
        batch_size, seq_length, c, h, w = x.size()
        cnn_out = []

        # Pass each frame through the CNN
        for t in range(seq_length):
            frame = x[:, t, :, :, :]
            cnn_out.append(self.cnn(frame))
            print(f"Processed frame {t+1}/{seq_length} through CNN.")

        # Stack the CNN outputs and pass through LSTM
        cnn_out = torch.stack(cnn_out, dim=1)
        print("Stacked CNN outputs.")
        lstm_out, _ = self.lstm(cnn_out)
        print("Passed through LSTM.")

        # Take the output of the last LSTM cell
        lstm_out = lstm_out[:, -1, :]
        print("Extracted output from the last LSTM cell.")

        # Pass through the fully connected layer
        out = self.fc(lstm_out)
        print("Passed through the fully connected layer.")
        print("Forward pass completed.")
        return out




In [41]:

# Example usage
num_classes = 4  # Replace with the actual number of classes
model = SignRecognitionModel(num_classes)

In [42]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [43]:
# Example training loop
def train_model(model, dataloader, criterion, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(dataloader.dataset)
        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}')

# Assuming you have a DataLoader `dataloader` for your dataset
# train_model(model, dataloader, criterion, optimizer)

# 3. Query Processing (Text Matching):

In [None]:
!pip install torch transformers scikit-learn

# search exact word

In [45]:
import os
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


In [46]:
import torch

# Load preprocessed data and labels
video_data = torch.load('video_data.pt')
labels = torch.load('labels.pt')

print("Data and labels loaded.")

Data and labels loaded.


  video_data = torch.load('video_data.pt')
  labels = torch.load('labels.pt')


In [60]:
import os

# Function to perform exact match
def exact_match(query, labels):
    if query in labels:
        index = labels.index(query)
        return video_data[index]
    else:
        return None


In [61]:

# Example usage
query = "butcher"
video_frames = exact_match(query, labels)
if video_frames:
    print(f"Exact match found for '{query}'.")
else:
    print("Exact match not found.")

Exact match found for 'butcher'.


In [63]:

# Example usage
query = "camerawork"
video_frames = exact_match(query, labels)
if video_frames:
    print(f"Exact match found for '{query}'.")
else:
    print("Exact match not found for '{query}'.")

Exact match not found for '{query}'.


# similar search (word)

In [48]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [49]:

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [55]:

# Function to get BERT embeddings for a word
def get_bert_embedding(word):
    inputs = tokenizer(word, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()


In [57]:

# Function to find the most similar word using BERT
def similar_word_search(query, labels):
    query_embedding = get_bert_embedding(query)
    label_embeddings = [get_bert_embedding(label) for label in labels]

    similarities = [cosine_similarity(query_embedding, label_embedding)[0][0] for label_embedding in label_embeddings]
    max_similarity_index = np.argmax(similarities)

    if similarities[max_similarity_index] > 0.7:  # Threshold for similarity
        return labels[max_similarity_index]
    else:
        return None

In [67]:

# Example usage 1
query = "camerawork"

if not video_frames:
    similar_word = similar_word_search(query, labels)
    if similar_word:
        video_frames = exact_match(similar_word, labels)
        print(f"Similar word found: '{query}', corresponding video frames retrieved.")
    else:
        print("No similar word found.")

In [65]:

# Example usage 2
if not video_frames:
    similar_word = similar_word_search(query, labels)
    if similar_word:
        video_frames = exact_match(similar_word, labels)
        print(f"Similar word found: '{ocean}', corresponding video frames retrieved.")
    else:
        print("No similar word found.")