In [1]:
import os
import cv2
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer, util

# Disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize the processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize the semantic similarity model
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def frame_to_text(frame):
    """Convert a single frame to text using the model."""
    image = Image.fromarray(frame)
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=50)  # Specify max_new_tokens
    return processor.decode(out[0], skip_special_tokens=True)

def get_unique_meanings(texts, threshold=0.8):
    """Filter out texts that have similar meanings based on semantic similarity."""
    unique_texts = []
    for text in texts:
        text_embedding = semantic_model.encode(text, convert_to_tensor=True)
        is_unique = True
        for unique_text in unique_texts:
            unique_text_embedding = semantic_model.encode(unique_text, convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(text_embedding, unique_text_embedding).item()
            if similarity > threshold:
                is_unique = False
                break
        if is_unique:
            unique_texts.append(text)
    return unique_texts

def video_to_text(video_path, repeat_threshold=5):
    """Process video frames and generate meaningful text descriptions based on repeated actions."""
    # Open the video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    current_action = None
    action_count = 0
    frame_texts = []
    
    while True:
        # Read each frame from the video
        ret, frame = cap.read()
        if not ret:
            break
        
        # Convert frame to text
        text = frame_to_text(frame)
        
        if text == current_action:
            action_count += 1
        else:
            if action_count >= repeat_threshold and current_action is not None:
                frame_texts.append(f"There was'{current_action}'.")
            current_action = text
            action_count = 1
    
    # Append the last action if it meets the threshold
    if action_count >= repeat_threshold and current_action is not None:
        frame_texts.append(f"The action '{current_action}'.")
    
    cap.release()
    
    # Filter out similar meanings
    unique_frame_texts = get_unique_meanings(frame_texts)
    
    # Combine all frame texts into meaningful sentences
    video_description = " ".join(unique_frame_texts)
    return video_description

# Example usage
video_path = "/Users/ronny/Downloads/AI_Final/testvideo.mp4"
description = video_to_text(video_path)
print(description)


There was'a man is standing next to a car'. There was'a man is walking across the street with a car'. There was'a horse race is being watched by a camera'. There was'a man is playing with a small dog'. There was'a man in a white shirt and black pants'. There was'a man in a white shirt and black pants is playing with a frc'. There was'a woman in a white dress is playing golf'. There was'a man in a white shirt and pants playing a game of cricket'.


In [2]:
from gtts import gTTS
import os

def text_to_speech(text):
    tts = gTTS(text=text, lang='en')
    tts.save("output.mp3")
    os.system("afplay output.mp3")  
    
text_to_speech(description)


In [3]:
import os
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from googletrans import Translator
from gtts import gTTS

# Disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize the processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize the semantic similarity model
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def frame_to_text(frame):
    """Convert a single frame to text using the model."""
    image = Image.fromarray(frame)
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=50)
    return processor.decode(out[0], skip_special_tokens=True)

def get_unique_meanings(texts, threshold=0.8):
    """Filter out texts that have similar meanings based on semantic similarity."""
    unique_texts = []
    for text in texts:
        text_embedding = semantic_model.encode(text, convert_to_tensor=True)
        is_unique = True
        for unique_text in unique_texts:
            unique_text_embedding = semantic_model.encode(unique_text, convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(text_embedding, unique_text_embedding).item()
            if similarity > threshold:
                is_unique = False
                break
        if is_unique:
            unique_texts.append(text)
    return unique_texts

def video_to_text(video_path, repeat_threshold=5):
    """Process video frames and generate meaningful text descriptions based on repeated actions."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    current_action = None
    action_count = 0
    frame_texts = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        text = frame_to_text(frame)

        if text == current_action:
            action_count += 1
        else:
            if action_count >= repeat_threshold and current_action is not None:
                frame_texts.append(f"There was '{current_action}'.")
            current_action = text
            action_count = 1

    if action_count >= repeat_threshold and current_action is not None:
        frame_texts.append(f"The action '{current_action}'.")

    cap.release()

    unique_frame_texts = get_unique_meanings(frame_texts)
    video_description = " ".join(unique_frame_texts)
    return video_description

def translate_text(text, target_lang='ha'):
    translator = Translator()
    translation = translator.translate(text, dest=target_lang)
    return translation.text

def text_to_speech(text, lang='en'):
    tts = gTTS(text=text, lang=lang)
    tts.save("output.mp3")
    os.system("afplay output.mp3")

# Example usage
video_path = "/Users/ronny/Downloads/AI_Final/testvideo.mp4"
description = video_to_text(video_path)
print("Original Description:", description)

# Translate the description to Twi
translated_description = translate_text(description, target_lang='ha')
print("Translated Description:", translated_description)

# Convert the translated text to speech
text_to_speech(translated_description, lang='ha')


Original Description: There was 'a man is standing next to a car'. There was 'a man is walking across the street with a car'. There was 'a horse race is being watched by a camera'. There was 'a man is playing with a small dog'. There was 'a man in a white shirt and black pants'. There was 'a man in a white shirt and black pants is playing with a frc'. There was 'a woman in a white dress is playing golf'. There was 'a man in a white shirt and pants playing a game of cricket'.
Translated Description: Akwai 'wani mutum yana tsaye kusa da mota'.Akwai 'wani mutum yana tafiya a saman titi tare da mota'.An kalli 'tseren doki ta hanyar kamara'.Akwai 'wani mutum yana wasa da karamin kare'.Akwai 'wani mutum a cikin farin rigar da wando baki'.Akwai 'wani mutum a cikin farin riguna da wando baƙi suna wasa tare da FRC'.Akwai 'mace a cikin fararen fata tana wasa golf'.Akwai 'wani mutum a cikin farin rig rigar da wando suna wasa da wasan wasan kurket'.
