In [4]:
import torch
import clip
import cv2
import numpy as np
from PIL import Image
from scipy.spatial.distance import cosine

# Load the CLIP model and preprocess function
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Define action descriptions (These can be changed or extended)
actions = [
    "A person running",
    "A person jumping",
    "A person sitting",
    "A person walking",
    "A person standing"
]

# Encode action descriptions into text embeddings
text_inputs = torch.cat([clip.tokenize(action) for action in actions]).to(device)
with torch.no_grad():
    text_embeddings = model.encode_text(text_inputs)

# Normalize the text embeddings to unit vectors
text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)

# Initialize video capture from webcam (0 is the default camera on most systems)
cap = cv2.VideoCapture(0)  # Use '0' for the built-in webcam

if not cap.isOpened():
    print("Error: Unable to access the camera.")
    exit()

frame_skip = 3  # Process every 3rd frame

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Failed to capture frame.")
        break
    
    # Skip frames to reduce processing load
    if cap.get(cv2.CAP_PROP_POS_FRAMES) % frame_skip != 0:
        continue

    # Resize the frame to match CLIP input size (224x224)
    frame_resized = cv2.resize(frame, (224, 224))

    # Preprocess the frame
    pil_image = Image.fromarray(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
    image_input = preprocess(pil_image).unsqueeze(0).to(device)

    # Get the image embedding from CLIP
    with torch.no_grad():
        image_embedding = model.encode_image(image_input)

    # Normalize the image embedding
    image_embedding /= image_embedding.norm(dim=-1, keepdim=True)

    # Flatten the image embedding to 1D
    image_embedding = image_embedding.cpu().numpy().flatten()

    # Convert text embeddings to numpy and flatten them
    text_embeddings_np = text_embeddings.cpu().numpy()

    # Calculate cosine similarity between image embedding and text embeddings
    similarities = []
    for text_embedding in text_embeddings_np:
        similarity = 1 - cosine(image_embedding, text_embedding.flatten())
        similarities.append(similarity)

    # Find the index of the most similar action description
    most_similar_idx = np.argmax(similarities)
    predicted_action = actions[most_similar_idx]

    # Display the action on the frame
    cv2.putText(frame, f"Predicted Action: {predicted_action}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Show the frame
    cv2.imshow('Real-Time Action Recognition', frame)

    # Exit on pressing 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [7]:
import torch
import clip
import cv2
import numpy as np
from PIL import Image
from scipy.spatial.distance import cosine

# Load the CLIP model and preprocessing function
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Define action descriptions
actions = [
    "A person running",
    "A person jumping",
    "A person sitting",
    "A person walking",
    "A person standing"
]

# Encode action descriptions into text embeddings
text_inputs = torch.cat([clip.tokenize(action) for action in actions]).to(device)
with torch.no_grad():
    text_embeddings = model.encode_text(text_inputs)

# Normalize the text embeddings to unit vectors
text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)

# Initialize video capture
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Unable to access the camera.")
    exit()

while True:
    ret, frame = cap.read()

    # Check if a frame was successfully captured
    if not ret:
        print("Error: Failed to capture frame.")
        break
    
    print(f"Captured frame: {frame.shape}")  # Debug: print the shape of the captured frame

    # Resize the frame to match CLIP input size
    frame_resized = cv2.resize(frame, (224, 224))

    # Preprocess the frame
    pil_image = Image.fromarray(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
    image_input = preprocess(pil_image).unsqueeze(0).to(device)

    try:
        # Get the image embedding from CLIP
        with torch.no_grad():
            image_embedding = model.encode_image(image_input)

        # Normalize the image embedding
        image_embedding /= image_embedding.norm(dim=-1, keepdim=True)

        # Flatten the image embedding to 1D
        image_embedding = image_embedding.cpu().numpy().flatten()

        # Convert text embeddings to numpy and flatten them
        text_embeddings_np = text_embeddings.cpu().numpy()

        # Calculate cosine similarity between image embedding and text embeddings
        similarities = []
        for text_embedding in text_embeddings_np:
            similarity = 1 - cosine(image_embedding, text_embedding.flatten())
            similarities.append(similarity)

        # Find the index of the most similar action description
        most_similar_idx = np.argmax(similarities)
        predicted_action = actions[most_similar_idx]

        print(f"Predicted Action: {predicted_action}")  # Debug: print the predicted action

    except Exception as e:
        print(f"Error processing the frame: {e}")
        continue  # Skip the frame if there's an error and move to the next one

    # Display the action on the frame
    font = cv2.FONT_HERSHEY_SIMPLEX
    frame_with_text = cv2.putText(frame, f"Predicted Action: {predicted_action}", (10, 30),
                                  font, 1, (0, 255, 0), 2, cv2.LINE_AA)

    # Show the frame
    cv2.imshow('Real-Time Action Recognition', frame_with_text)

    # Exit on pressing 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person standing
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person standing
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 640, 3)
Predicted Action: A person sitting
Captured frame: (480, 6