In [None]:
# Mount Google Drive (Colab only) so that the paths under /content/drive used
# by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers torch torchvision opencv-python



In [None]:
import cv2
import os

# Function to extract one frame per sampling interval from a video
def extract_frames(video_path, output_dir, interval_seconds=30):
    """Save one frame every ``interval_seconds`` seconds of a video as JPEG files.

    Saved frames are named ``<video name>_frame_<NNNN>.jpg`` where ``NNNN`` is
    a zero-padded running index of the frames written for this video.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the JPEG files. It is not created
            here; writes into a missing directory fail and are reported.
        interval_seconds: Time between two saved frames, in seconds.

    Returns:
        None. A one-line summary is printed once the video has been processed.
    """
    # Open the video file
    cap = cv2.VideoCapture(video_path)

    try:
        # Frames per second as reported by the container (0 when unknown).
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Number of source frames between two saved frames. A step below 1
        # (unknown/zero fps, NaN, or an interval shorter than one frame) would
        # make the modulo below raise ZeroDivisionError, so fall back to
        # saving every frame in that case.
        raw_interval = fps * interval_seconds
        frame_interval = int(raw_interval) if raw_interval >= 1 else 1

        count = 0  # Index of the frame currently being read
        frame_id = 0  # Number of frames written so far (also used in the file name)

        # splitext keeps dots inside the name ("clip.v2.mp4" -> "clip.v2"), so
        # differently named videos cannot collide in the shared output directory.
        video_name = os.path.splitext(os.path.basename(video_path))[0]

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # No more frames

            if count % frame_interval == 0:
                frame_filename = os.path.join(output_dir, f"{video_name}_frame_{frame_id:04d}.jpg")
                # imwrite reports failure through its return value instead of
                # raising; only count frames that were really written.
                if cv2.imwrite(frame_filename, frame):
                    frame_id += 1
                else:
                    print(f"Warning: could not write {frame_filename}")

            count += 1
    finally:
        # Always release the capture, even if reading or writing raised.
        cap.release()

    print(f"Extracted {frame_id} frames from {video_path} (1 frame every {interval_seconds} seconds)")

# Function to extract frames from multiple videos into a single directory
def extract_frames_from_videos(video_paths, output_dir, interval_seconds=30):
    """Run ``extract_frames`` on every video, writing all frames to one directory.

    Args:
        video_paths: Iterable of video file paths. A single path given as a
            string (or path-like object) is treated as a one-element list.
        output_dir: Directory shared by all extracted frames; created if missing.
        interval_seconds: Sampling interval in seconds, forwarded unchanged to
            ``extract_frames``.

    Raises:
        FileExistsError: If ``output_dir`` exists but is not a directory.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(video_paths, (str, os.PathLike)):
        video_paths = [video_paths]

    # exist_ok avoids the check-then-create race of os.path.exists() and fails
    # loudly when the path is occupied by a regular file.
    os.makedirs(output_dir, exist_ok=True)

    for video_path in video_paths:
        print(f"Extracting frames from {video_path}...")
        extract_frames(video_path, output_dir, interval_seconds)

# List of the video file paths to sample (4 entries: 2 "sleeping", 2 "suspicious")
video_paths = [
    '/content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926155415_20240926155424.mp4',
    '/content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926160938_20240926160948.mp4',
    '/content/drive/MyDrive/dataset_modified/suspicious/eyetrack_ch1_20240926165809_20240926165823.mp4',
    '/content/drive/MyDrive/dataset_modified/suspicious/eyetrack_ch3_20240926155258_20240926155308.mp4'

]

# Define the output directory for all frames.
# NOTE: the directory name is spelled 'ouput directory' (sic); later cells use
# the same literal, so all occurrences must be changed together if it is renamed.
output_dir = '/content/ouput directory'

# Run the extraction for every listed video and save all frames in the same directory
extract_frames_from_videos(video_paths, output_dir, interval_seconds=1)  # Extract 1 frame every second

Extracting frames from /content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926155415_20240926155424.mp4...
Extracted 10 frames from /content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926155415_20240926155424.mp4 (1 frame every 1 seconds)
Extracting frames from /content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926160938_20240926160948.mp4...
Extracted 10 frames from /content/drive/MyDrive/dataset_modified/sleeping/eyetrack_ch1_20240926160938_20240926160948.mp4 (1 frame every 1 seconds)
Extracting frames from /content/drive/MyDrive/dataset_modified/suspicious/eyetrack_ch1_20240926165809_20240926165823.mp4...
Extracted 15 frames from /content/drive/MyDrive/dataset_modified/suspicious/eyetrack_ch1_20240926165809_20240926165823.mp4 (1 frame every 1 seconds)
Extracting frames from /content/drive/MyDrive/dataset_modified/suspicious/eyetrack_ch3_20240926155258_20240926155308.mp4...
Extracted 10 frames from /content/drive/MyDrive/dataset_modified/

In [None]:
pip install matplotlib

In [None]:
!pip install torch torchvision torchaudio
!pip install opencv-python
!pip install matplotlib
!pip install pyyaml
!pip install numpy



In [None]:
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5
!pip install -r requirements.txt

Cloning into 'yolov5'...
remote: Enumerating objects: 17067, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 17067 (delta 24), reused 28 (delta 12), pack-reused 17022 (from 1)[K
Receiving objects: 100% (17067/17067), 15.68 MiB | 19.25 MiB/s, done.
Resolving deltas: 100% (11714/11714), done.
/content/yolov5
Collecting thop>=0.1.1 (from -r requirements.txt (line 14))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting ultralytics>=8.2.34 (from -r requirements.txt (line 18))
  Downloading ultralytics-8.3.40-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics>=8.2.34->-r requirements.txt (line 18))
  Downloading ultralytics_thop-2.0.12-py3-none-any.whl.metadata (9.4 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Downloading ultralytics-8.3.40-py3-none-any.whl (898 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [None]:
import torch
# Load YOLOv5 model (pre-trained on COCO dataset) through torch.hub.
# NOTE(review): the notebook-global `model` bound here is not read by the cells
# that follow before they rebind the same name - confirm this cell is still needed.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # or 'yolov5m', 'yolov5l', 'yolov5x' for larger models

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


YOLOv5 🚀 2024-12-3 Python-3.10.12 torch-2.5.1+cu121 CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:00<00:00, 91.8MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [None]:

import cv2
import os
from ultralytics import YOLO  # Ensure you're using the correct YOLO library

def annotate_frames_with_yolo(frame_directory, output_directory, model_path, class_names):
    """Run a YOLO model on every image in a directory and save its detections.

    For each readable image ``<name>.<ext>`` in ``frame_directory`` two files
    are written to ``output_directory``:

    * ``<name>.txt`` - one line per detection in the form
      ``class_name xmin ymin xmax ymax confidence`` (pixel coordinates
      truncated to integers, confidence with two decimals);
    * ``annotated_<name>.<ext>`` - a copy of the image with the boxes and
      labels drawn on it.

    Files without an image extension and images OpenCV cannot decode are
    skipped instead of being passed to the model.

    Args:
        frame_directory: Directory containing the frames to annotate.
        output_directory: Directory for the annotation files and annotated
            images; created if missing.
        model_path: Path of the weights file handed to ``YOLO``.
        class_names: Sequence mapping the model's class ids to label strings.
            Ids outside this sequence are written as their numeric value.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

    # Load the fine-tuned YOLO model
    model = YOLO(model_path)

    # Create output directory for annotations
    os.makedirs(output_directory, exist_ok=True)

    # sorted() makes the processing order independent of the file system.
    for frame_file in sorted(os.listdir(frame_directory)):
        if not frame_file.lower().endswith(image_extensions):
            continue

        frame_path = os.path.join(frame_directory, frame_file)

        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(frame_path)
        if img is None:
            print(f"Skipping unreadable image: {frame_path}")
            continue

        results = model(img)  # Perform inference; results is a list

        # splitext swaps only the real extension, whatever it is.
        annotation_file = os.path.join(output_directory, os.path.splitext(frame_file)[0] + '.txt')

        for result in results:
            annotations = []

            # Save annotations to a text file
            with open(annotation_file, 'w') as f:
                for box in result.boxes:
                    cls = int(box.cls[0])  # Class ID
                    class_name = class_names[cls] if 0 <= cls < len(class_names) else str(cls)
                    xmin, ymin, xmax, ymax = map(int, box.xyxy[0].tolist())
                    confidence = float(box.conf[0])  # Plain float for formatting

                    f.write(f"{class_name} {xmin} {ymin} {xmax} {ymax} {confidence:.2f}\n")
                    annotations.append((class_name, xmin, ymin, xmax, ymax, confidence))

            # Draw the detections on a copy so the source image stays untouched.
            annotated_frame = img.copy()
            for class_name, xmin, ymin, xmax, ymax, confidence in annotations:
                cv2.rectangle(annotated_frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
                label = f"{class_name} ({confidence:.2f})"
                # Keep the label inside the image for boxes touching the top edge.
                label_y = max(ymin - 10, 10)
                cv2.putText(annotated_frame, label, (xmin, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Save the annotated frame
            cv2.imwrite(os.path.join(output_directory, f"annotated_{frame_file}"), annotated_frame)

    print(f"Annotations saved in {output_directory}")

# Directory where frames are saved
frame_directory = '/content/drive/MyDrive/image.v1i.yolov8/train/images'  # Replace with your extracted frames directory
output_directory = '/content/output_directory'  # Output directory for annotations
model_path = '/content/drive/MyDrive/project/runs/detect/yolo_activity_detection/weights/best.pt'  # Path to your fine-tuned YOLO model

# Define your class names. The position in this list is used as the class id
# lookup (class_names[cls]), so the order must match the ids the model predicts.
class_names = ['mobile','sleeping','suspicious']  # Ensure these match your dataset

# Annotate frames using YOLO
annotate_frames_with_yolo(frame_directory, output_directory, model_path, class_names)



0: 640x640 1 sleeping, 314.1ms
Speed: 12.4ms preprocess, 314.1ms inference, 21.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 272.7ms
Speed: 2.9ms preprocess, 272.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 433.0ms
Speed: 3.1ms preprocess, 433.0ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 302.0ms
Speed: 4.1ms preprocess, 302.0ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 293.9ms
Speed: 3.1ms preprocess, 293.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 295.8ms
Speed: 5.5ms preprocess, 295.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 sleeping, 248.7ms
Speed: 4.6ms preprocess, 248.7ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 174.2ms
Speed: 4.2ms preprocess, 174.2ms inference, 0.7ms p

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg16 import preprocess_input
import numpy as np
import cv2
import os

# Load the pre-trained VGG16 model (ImageNet weights) and build a second model
# that returns the output of its 'fc2' layer instead of the class predictions.
# NOTE: this rebinds the notebook-global name `model`.
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 0us/step


In [None]:
def preprocess_frame(frame):
    """Turn one OpenCV frame into a VGG16-ready batch containing a single image.

    Args:
        frame: Image array in OpenCV's BGR channel order, as returned by
            ``cv2.imread`` (channels on the last axis).

    Returns:
        The output of ``preprocess_input`` for the resized frame with a leading
        batch axis added, i.e. shape ``(1, 224, 224, channels)``.

    Raises:
        ValueError: If ``frame`` is ``None`` (what ``cv2.imread`` returns for
            a file it cannot decode).
    """
    if frame is None:
        raise ValueError("preprocess_frame received None: the image could not be read")

    # Resize frame to 224x224 as required by VGG16
    frame_resized = cv2.resize(frame, (224, 224))

    # OpenCV delivers BGR, whereas Keras' VGG16 preprocess_input expects RGB
    # input (it performs the RGB->BGR flip itself). Reverse the channel axis
    # first so the colour channels line up with the pretrained weights. The
    # copy also keeps preprocess_input from ever touching the caller's array.
    frame_rgb = np.ascontiguousarray(np.asarray(frame_resized)[..., ::-1])
    frame_preprocessed = preprocess_input(frame_rgb)

    # Add batch dimension (VGG16 expects a batch of images, even if it's just one image)
    frame_preprocessed = np.expand_dims(frame_preprocessed, axis=0)

    return frame_preprocessed

In [None]:
def extract_features_from_frames(frame_directory, output_feature_dir):
    """Compute a feature array for every image in a directory and save it as ``.npy``.

    Each readable image ``<name>.<ext>`` is passed through ``preprocess_frame``
    and the notebook-global ``model`` (its ``predict`` method); the result is
    stored as ``<name>.npy`` in ``output_feature_dir``. Files without an image
    extension and images OpenCV cannot decode are skipped.

    NOTE: ``model`` is looked up at call time and later cells of this notebook
    rebind that name, so call this function while ``model`` still refers to
    the intended feature extractor.

    Args:
        frame_directory: Directory containing the input frames.
        output_feature_dir: Directory receiving the ``.npy`` files; created if
            missing.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

    os.makedirs(output_feature_dir, exist_ok=True)

    # sorted() makes the processing order independent of the file system.
    for frame_file in sorted(os.listdir(frame_directory)):
        if not frame_file.lower().endswith(image_extensions):
            continue

        frame_path = os.path.join(frame_directory, frame_file)

        # cv2.imread signals failure by returning None rather than raising.
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Skipping unreadable image: {frame_path}")
            continue

        # Preprocess the frame and run it through the feature model
        preprocessed_frame = preprocess_frame(frame)
        features = model.predict(preprocessed_frame)

        # splitext swaps only the real extension, whatever it is.
        feature_file = os.path.join(output_feature_dir, os.path.splitext(frame_file)[0] + '.npy')
        np.save(feature_file, features)

        print(f"Features extracted and saved for {frame_file}")

# Directory containing extracted frames
frame_directory = '/content/drive/MyDrive/image.v1i.yolov8/train/images'

# Directory to save the extracted features.
# NOTE(review): this is the same literal path the frame-extraction cell uses as
# its JPEG output directory, so .npy files and frames end up side by side -
# confirm that this is intended.
output_feature_dir = '/content/ouput directory'

# Extract features from frames
extract_features_from_frames(frame_directory, output_feature_dir)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 776ms/step
Features extracted and saved for eyetrack_ch1_20240926155328_20240926155338_mp4-0005_jpg.rf.5b22bf64379ef03b8d94d1b804133966.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 531ms/step
Features extracted and saved for eyetrack_ch1_20240926163352_20240926163406_mp4-0005_jpg.rf.70e801a5a6e654ec36ab6984e182c893.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 658ms/step
Features extracted and saved for eyetrack_ch1_20240926161001_20240926161011_mp4-0005_jpg.rf.f701df5e160f094aa2faac699463465a.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 527ms/step
Features extracted and saved for eyetrack_ch1_20240926161001_20240926161011_mp4-0000_jpg.rf.b8d09ff979f2a4e2d51289cc6d460b74.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 518ms/step
Features extracted and saved for eyetrack_ch1_20240926161001_20240926161011_mp4-0002_jpg.rf.69e9d7f7cdf9395ff8ac5ef724a

In [None]:
import os
import pandas as pd
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from PIL import Image
import torch

# Load pre-trained image captioning model (Vision Transformer + GPT2) together
# with its image preprocessor and tokenizer, all from the same checkpoint.
# NOTE: this rebinds the notebook-global name `model`.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Function to generate captions from an image (frame)
def generate_caption(image_path):
    """Generate a text caption for one image file.

    Uses the notebook-global ``feature_extractor``, ``model`` and
    ``tokenizer``, which are looked up at call time.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string (special tokens removed).
    """
    # The context manager closes the file handle as soon as the pixels are
    # loaded; convert("RGB") guarantees three channels, so grayscale, palette
    # or RGBA files do not break the feature extractor.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Specify the directory containing frame images
frames_directory = '/content/drive/MyDrive/image.v1i.yolov8/train/images'
captions = []

# Loop through each image in the frames directory.
# NOTE: rows are produced in os.listdir() order; the 'filename' column is the
# only record of which caption belongs to which image.
for filename in os.listdir(frames_directory):
    if filename.endswith(('.jpg', '.jpeg', '.png')):  # Add other image formats if needed
        image_path = os.path.join(frames_directory, filename)
        caption = generate_caption(image_path)
        captions.append({'filename': filename, 'caption': caption})

# Save the captions to a CSV file.
# NOTE: the path is relative, so the file lands in the current working
# directory; a later cell reads it back as '/content/yolov5/captions_output.csv'.
captions_df = pd.DataFrame(captions)
output_csv_path = "captions_output.csv"
captions_df.to_csv(output_csv_path, index=False)

print(f"Captions have been saved to {output_csv_path}")


config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Captions have been saved to captions_output.csv


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Load pre-trained BERT model and tokenizer.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model in evaluation mode (no training)
model.eval()

# Function to extract textual features from a list of captions
def extract_textual_features(captions, ignore_padding=False):
    """Embed each caption as the mean of the model's last hidden states.

    Uses the notebook-global ``tokenizer`` and ``model``, looked up at call
    time. Every caption is tokenized on its own and padded/truncated to 128
    tokens.

    Args:
        captions: Iterable of caption strings. ``None`` and NaN entries (what
            ``pandas.read_csv`` yields for an empty cell) are embedded as the
            empty string instead of crashing the tokenizer.
        ignore_padding: When ``False`` (default, the historical behaviour) the
            mean is taken over all 128 positions, padding included. When
            ``True`` only positions whose attention mask is 1 are averaged.
            Embeddings that are compared with each other should all be
            produced with the same setting.

    Returns:
        A list with one 1-D numpy array per caption, in input order.
    """
    features = []

    with torch.no_grad():  # No need for gradients, we are only extracting features
        for caption in captions:
            # NaN is the only float that differs from itself.
            is_missing = caption is None or (isinstance(caption, float) and caption != caption)
            text = "" if is_missing else str(caption)

            # Tokenize the input caption
            inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

            # Extract the token embeddings
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state

            if ignore_padding:
                # Weighted mean: padding positions get weight 0; the clamp
                # protects against an all-zero mask.
                mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
                pooled = (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
            else:
                pooled = torch.mean(last_hidden_state, dim=1)

            features.append(pooled.squeeze().numpy())

    return features

# Path to your CSV file containing the captions.
# NOTE: absolute path inside the cloned yolov5 directory; it only matches the
# relative "captions_output.csv" written by the captioning cell if that cell
# ran with /content/yolov5 as the working directory.
csv_file_path = '/content/yolov5/captions_output.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Assuming the CSV has a column named 'caption' that contains the captions
captions = df['caption'].tolist()

# Extract textual features for all captions in the CSV file
textual_features = extract_textual_features(captions)

# Convert the features to a numpy array for further processing or saving
# (one row per CSV row, in CSV order)
textual_features = np.array(textual_features)

# Optionally, save the extracted features to a file for later use
np.save('textual_features.npy', textual_features)

# Print the shape of the extracted features
print(f"Extracted textual features shape: {textual_features.shape}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Extracted textual features shape: (203, 768)


In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import os

# Directory of extracted frames
frames_dir = '/content/ouput directory'

# Directory to save visual features
features_dir = '/content/output_visual/features'
os.makedirs(features_dir, exist_ok=True)

# Load an ImageNet-pre-trained ResNet-50. `weights=` replaces the deprecated
# `pretrained=True` flag; IMAGENET1K_V1 is the checkpoint that flag selected,
# so the extracted values stay the same.
# NOTE: this rebinds the notebook-global name `model`.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model = model.eval()  # Set model to evaluation mode

# Transform for preprocessing the image
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to match input size of model
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Iterate through all frames (sorted by file name) and extract features
for frame_file in sorted(os.listdir(frames_dir)):
    if frame_file.endswith('.jpg'):
        # Load and preprocess the image; the context manager closes the file
        # handle once the pixels have been converted.
        frame_path = os.path.join(frames_dir, frame_file)
        with Image.open(frame_path) as raw_image:
            image = raw_image.convert('RGB')
        input_tensor = preprocess(image).unsqueeze(0)  # Add batch dimension

        # Extract features
        with torch.no_grad():
            # Shape: (1, 1000). NOTE(review): this is the output of the full
            # network including its final layer, not a pooled embedding -
            # confirm that is the intended "visual feature".
            features = model(input_tensor).numpy()

        # Save features as <frame name>.npy; splitext swaps only the extension.
        feature_path = os.path.join(features_dir, os.path.splitext(frame_file)[0] + '.npy')
        np.save(feature_path, features)

print(f"Extracted features saved to {features_dir}")


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 132MB/s]


Extracted features saved to /content/output_visual/features


In [None]:

import os
import numpy as np

# Path to the directory containing the visual features
visual_features_dir = '/content/output_visual/features'

# Load the textual features (stored in a single .npy file)
textual_features = np.load('textual_features.npy')  # Shape: (num_captions, 768)

# Get a sorted list of visual feature files in the directory
visual_feature_files = sorted([f for f in os.listdir(visual_features_dir) if f.endswith('.npy')])

# Check the number of visual and textual features
num_visual_features = len(visual_feature_files)
num_textual_features = textual_features.shape[0]

print(f"Number of visual features: {num_visual_features}")
print(f"Number of textual features: {num_textual_features}")

# Truncate textual features to match the number of visual features if necessary.
# NOTE(review): the pairing below is purely positional - the i-th visual file in
# sorted name order is joined with the i-th row of textual_features. In this
# notebook the visual features are computed from '/content/ouput directory'
# while the captions come from a different image directory in os.listdir()
# order, so a row pair need not describe the same image. Verify the alignment
# (e.g. by file name) before relying on the combined features.
if num_visual_features < num_textual_features:
    print(f"Truncating textual features from {num_textual_features} to {num_visual_features}")
    textual_features = textual_features[:num_visual_features]
elif num_visual_features > num_textual_features:
    raise ValueError("More visual features than textual features. Check your data alignment!")

# Combine visual and textual features
combined_features = []

# Iterate through each visual feature file and combine with corresponding textual feature
for i, visual_feature_file in enumerate(visual_feature_files):
    # Load the visual feature for the current frame
    visual_feature_path = os.path.join(visual_features_dir, visual_feature_file)
    visual_feature = np.load(visual_feature_path)  # Example shape: (1, visual_feature_dim) or (visual_feature_dim,)

    # Ensure the visual feature is flattened to 1D
    visual_feature_flat = visual_feature.flatten()  # This ensures it's 1D

    # Get the textual feature at the same position (already 1D, shape: (768,))
    textual_feature = textual_features[i]

    # Concatenate: visual part first, textual part last
    combined_feature = np.concatenate((visual_feature_flat, textual_feature), axis=0)

    # Append the combined feature to the list
    combined_features.append(combined_feature)

# Convert the list of combined features to a numpy array
combined_features = np.array(combined_features)

# Optionally, save the combined features to a file
np.save('combined_features.npy', combined_features)

# Print the shape of the combined features
print(f"Combined features shape: {combined_features.shape}")


Number of visual features: 45
Number of textual features: 203
Truncating textual features from 203 to 45
Combined features shape: (45, 1768)


In [None]:
# Sanity check of the two feature counts.
# NOTE: `visual_feature_files` is defined by the feature-combination cell, which
# must have been run first; `textual_features` is re-loaded from disk here and
# therefore holds the full, untruncated array again.
print(f"Number of visual feature files: {len(visual_feature_files)}")
import numpy as np

textual_features = np.load('textual_features.npy')
print(f"Number of textual features: {textual_features.shape[0]}")


Number of visual feature files: 45
Number of textual features: 203


In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Load BERT model and tokenizer and switch the model to evaluation mode.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

# Function to embed a question using BERT
def embed_question(question, ignore_padding=False):
    """Embed a question as the mean of the model's last hidden states.

    Uses the notebook-global ``tokenizer`` and ``model``, looked up at call
    time. The question is padded/truncated to 128 tokens.

    Args:
        question: Question text.
        ignore_padding: When ``False`` (default, the historical behaviour) the
            mean is taken over all 128 positions, padding included. When
            ``True`` only positions whose attention mask is 1 are averaged.
            Embeddings that are compared with each other should all be
            produced with the same setting.

    Returns:
        1-D numpy array holding the pooled embedding.
    """
    with torch.no_grad():
        inputs = tokenizer(question, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        if ignore_padding:
            # Weighted mean: padding positions get weight 0; the clamp
            # protects against an all-zero mask.
            mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
            pooled = (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        else:
            pooled = torch.mean(last_hidden_state, dim=1)

        question_embedding = pooled.squeeze().numpy()
    return question_embedding

# Function to score every frame against a question embedding
def match_question_to_frames(question_embedding, combined_features):
    """Score each frame by cosine similarity between its text features and the question.

    The textual part of a frame is taken to be the trailing columns of
    ``combined_features``; its width is the length of ``question_embedding``
    (768 for the BERT embeddings used in this notebook).

    Args:
        question_embedding: 1-D numpy array of length ``d``, the embedding of
            the question.
        combined_features: ``(num_frames, feature_dim)`` numpy array whose last
            ``d`` columns hold the textual features of each frame.

    Returns:
        relevance_scores: ``(num_frames,)`` numpy array of cosine similarities.
        Frames whose textual part is all zeros (and any frame when the
        question embedding is all zeros) score 0 instead of NaN.

    Raises:
        ValueError: If ``combined_features`` is not 2-D or has fewer than ``d``
            columns.
    """
    question_embedding = np.asarray(question_embedding).reshape(-1)
    combined_features = np.asarray(combined_features)

    text_dim = question_embedding.shape[0]
    if combined_features.ndim != 2 or combined_features.shape[1] < text_dim:
        raise ValueError(
            f"combined_features must be 2-D with at least {text_dim} columns, "
            f"got shape {combined_features.shape}"
        )

    # Textual features are the last `text_dim` elements of each row
    textual_features = combined_features[:, -text_dim:]

    # Lower bound for the norms: a zero vector divided by it stays zero, which
    # avoids the 0/0 = NaN the plain division produces. A Python float keeps
    # float32 inputs in float32.
    eps = 1e-12

    # Normalize the question embedding and textual features to unit length
    question_embedding_norm = question_embedding / max(np.linalg.norm(question_embedding), eps)
    feature_norms = np.linalg.norm(textual_features, axis=1, keepdims=True)
    textual_features_norm = textual_features / np.maximum(feature_norms, eps)

    # Dot product of unit vectors = cosine similarity
    relevance_scores = np.dot(textual_features_norm, question_embedding_norm)

    return relevance_scores

# Load the combined features (from previous step)
combined_features = np.load('combined_features.npy')  # Shape: (num_frames, feature_dim)

# Define your question
question = "who is standing"

# Embed the question using BERT
question_embedding = embed_question(question)

# Score every frame against the question (similarity of the textual features)
relevance_scores = match_question_to_frames(question_embedding, combined_features)

# Sort the frames by relevance (highest scores first)
top_frame_indices = np.argsort(relevance_scores)[::-1]  # Indices of frames sorted by relevance score

# Print top 5 relevant frames; "Frame i" is the row index into combined_features
print("Top 5 relevant frames (sorted by relevance):")
for i in top_frame_indices[:5]:
    print(f"Frame {i}, Relevance Score: {relevance_scores[i]}")

# Optionally, save the relevance scores to a file
np.save('relevance_scores.npy', relevance_scores)

Top 5 relevant frames (sorted by relevance):
Frame 26, Relevance Score: 0.6899492740631104
Frame 42, Relevance Score: 0.6894392371177673
Frame 34, Relevance Score: 0.6880147457122803
Frame 40, Relevance Score: 0.6866772174835205
Frame 16, Relevance Score: 0.6834909915924072


In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5 model for text generation.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Captions used as context. NOTE: these are hard-coded sample sentences, not
# the captions generated earlier in the notebook.
captions = [
    "A person sleeping.",
    "A man showing suspicious behavior at the door.",
    "A man sitting at a desk working on a laptop.",
    "A person walking with a phone in hand.",
    "A group of people discussing something."
]
top_k = 3

# Concatenate the first top_k captions of the list to create a context
# (list order, not a relevance ranking)
context = " ".join(captions[:top_k])

# Define a sample question
question = "What is the person doing?"

# Create the input for the model (concatenating the question and the context from frames)
input_text = f"question: {question} context: {context}"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

# Generate the answer (beam search with 4 beams, at most 50 tokens)
output = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)

# Decode the generated answer
generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Question: {question}")
print(f"Generated Answer: {generated_answer}")


Question: What is the person doing?
Generated Answer: sleeping


In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5 model for text generation.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Sample captions from frames (hard-coded example sentences)
captions = [
    "A person sleeping.",
    "A man showing suspicious behavior at the door.",
    "A man sitting at a desk working on a laptop.",
    "A person walking with a phone in hand.",
    "A group of people discussing something."
]

# List of custom questions you want to ask
questions = [
    "What is the person doing?",
    "Is there anyone sitting on a chair?",
    "Is someone holding a suitcase?",
    "Who is looking at the laptop?",
    "Is anyone using a phone?"
]

# Set the number of top-k captions to use in the context
top_k = 3  # Adjust this value as needed

# Concatenate the first top_k captions of the list to create a context.
# Only these captions are visible to the model; questions about content that
# appears in the remaining captions cannot be answered from this context.
context = " ".join(captions[:top_k])

# Loop through each custom question and generate an answer
for question in questions:
    # Create the input for the model by combining the question and context
    input_text = f"question: {question} context: {context}"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate the answer (beam search with 4 beams, at most 50 tokens)
    output = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)

    # Decode the generated answer
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print the question and the generated answer
    print(f"Question: {question}")
    print(f"Answer: {generated_answer}\n")


Question: What is the person doing?
Answer: sleeping

Question: Is there anyone sitting on a chair?
Answer: A person sleeping

Question: Is someone holding a suitcase?
Answer: sleeping

Question: Who is looking at the laptop?
Answer: A man

Question: Is anyone using a phone?
Answer: sleeping



In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5 model for text generation.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Sample captions from frames (hard-coded example sentences, one string per caption)
captions = [
    "A person sleeping.",
    "A man showing suspicious behavior at the door.",
    "A man sitting at a desk working on a laptop.",
    "A person walking with a phone in hand.",
    "A group of people discussing something.",  # trailing comma is required: without it Python concatenates this string with the next one
    "A room with desk and a chair.",
    "A person standing in a doorway with a suitcase.",
    "A man in a suit looking at the laptop.",
    "A man is sitting on a chair and working on a computer.",
    "A man is talking on a phone.",
]

# List of questions to be answered.
# NOTE(review): "suspecious" in the second question is misspelled; the text is
# sent to the model exactly as written.
questions = [
    "What is the person doing?",
    "Is there any suspecious activity"
]

# Set the number of top-k captions to use in the context
top_k = 3  # Adjust this value as needed

# Concatenate the first top_k captions of the list to create a context
# (list order, not a relevance ranking)
context = " ".join(captions[:top_k])

# Loop through each question and generate an answer
for question in questions:
    # Create the input for the model by combining the question and context
    input_text = f"question: {question} context: {context}"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate the answer (beam search with 4 beams, at most 50 tokens)
    output = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)

    # Decode the generated answer
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print the question and the generated answer
    print(f"Question: {question}")
    print(f"Answer: {generated_answer}\n")


Question: What is the person doing?
Answer: sleeping

Question: Is there any suspecious activity
Answer: A man showing suspicious behavior



In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5 model for text generation.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Sample captions from frames in the lab scenario (hard-coded example sentences)
captions = [
    "A person opening a rack in the lab.",
    "A person sleeping on a lab bench.",
    "A person holding a mobile in the lab.",
    "A person using a mobile phone in the lab.",
    "A person working on a computer in the lab.",
    "A person standing near a rack in the lab.",
]

# List of custom questions based on lab activities
questions = [
    "Is the person opening the rack?",
    "Is the person sleeping?",
    "What is the person holding in their hands?",
    "Is the person using a mobile phone?",
    "Is the person sitting at a desk?",
    "Is the person working on a computer?",
    "Is the person talking to someone?",
    "Is the person standing near a rack?",
    "Is the person reading a book?"
]

# Set the number of top-k captions to use in the context.
# With top_k = 3 only the first three captions above reach the model.
top_k = 3  # Adjust this value as needed

# Loop through each custom question and generate an answer
for question in questions:
    # Concatenate the first top_k captions to create a context
    # (the same string for every question; it does not depend on the question)
    context = " ".join(captions[:top_k])

    # Create the input for the model by combining the question and context
    input_text = f"question: {question} context: {context}"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate the answer (beam search with 4 beams, at most 50 tokens)
    output = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)

    # Decode the generated answer
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print the question and the generated answer
    print(f"Question: {question}")
    print(f"Answer: {generated_answer}\n")
