In [None]:
import os
import textwrap
import time
from collections import Counter

import cv2
import pygame
import torch
import whisper
from gtts import gTTS
from moviepy.editor import VideoFileClip
from transformers import (DistilBertTokenizer, DistilBertForQuestionAnswering,
                          MarianMTModel, MarianTokenizer, pipeline)

# Function to extract audio from a video file
def extract_audio_from_video(video_path, output_audio_path):
    """Extract the audio track of a video file and save it as a .wav file.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Path the extracted audio is written to.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        # 16-bit PCM keeps the audio uncompressed (lossless).
        video.audio.write_audiofile(output_audio_path, codec='pcm_s16le')
    finally:
        # Always release the reader resources held by the clip, even on failure.
        video.close()

# Function to transcribe audio using Whisper
def transcribe_audio(audio_path):
    """Run Whisper speech recognition on an audio file and return the text."""
    # The 'base' checkpoint is used as a compromise between speed and accuracy.
    asr_model = whisper.load_model("base")
    return asr_model.transcribe(audio_path)['text']

# Function to summarize the transcription using a summarization pipeline
def summarize_transcription(transcription):
    """Summarize the transcription using a BART summarization model.

    The text is split into chunks of at most 1024 characters, each chunk is
    summarized separately, and the partial summaries are joined with spaces.

    Args:
        transcription: Text to summarize.

    Returns:
        The combined summary, or an empty string when there is no text to
        summarize.
    """
    # Nothing to summarize: skip loading the (large) model altogether.
    if not transcription or not transcription.strip():
        return ""

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    # Chunk on whitespace boundaries so that words are not cut in half between
    # two chunks; words longer than the limit are still hard-split.
    max_chunk_chars = 1024
    chunks = textwrap.wrap(transcription, width=max_chunk_chars)

    summaries = []
    for chunk in chunks:
        # truncation=True is a safety net in case a chunk still tokenizes to
        # more tokens than the model accepts.
        result = summarizer(chunk, max_length=50, min_length=20,
                            do_sample=False, truncation=True)
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)

# Function to initialize the DistilBERT model and tokenizer for question answering
def init_qa_model():
    """Load the DistilBERT question-answering tokenizer and model.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        'distilbert-base-uncased-distilled-squad' checkpoint.
    """
    checkpoint = 'distilbert-base-uncased-distilled-squad'
    qa_tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    qa_model = DistilBertForQuestionAnswering.from_pretrained(checkpoint)
    return qa_tokenizer, qa_model

# Function to answer questions based on context using DistilBERT
def answer_question(question, context, tokenizer, model):
    """Answer a question based on the provided context.

    Args:
        question: The question text.
        context: The text in which the answer span is searched.
        tokenizer: Tokenizer matching ``model``.
        model: Extractive question-answering model exposing ``start_logits``
            and ``end_logits`` on its output.

    Returns:
        The extracted answer, or an empty string when the question or context
        is empty or no valid span is predicted.
    """
    # Without a question or a context there is nothing to extract.
    if not question or not question.strip() or not context or not context.strip():
        return ""

    # Truncate only the context so that long transcriptions cannot exceed the
    # model's maximum sequence length; the question is kept intact.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation="only_second",
        return_tensors="pt",
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1  # exclusive bound
    if end_index <= start_index:
        # The predicted end precedes the start: no usable span.
        return ""

    answer_tokens = input_ids[0][start_index:end_index]
    # Drop markers such as [CLS]/[SEP] if the span happens to touch them.
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer.strip()

# Function to load translation model and tokenizer
def load_translation_model(source_lang, target_lang):
    """Load the MarianMT model and tokenizer for a language pair.

    Args:
        source_lang: Language code of the input text.
        target_lang: Language code to translate into.

    Returns:
        A ``(model, tokenizer)`` tuple for the Helsinki-NLP opus-mt checkpoint
        of the requested pair.
    """
    checkpoint = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    mt_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    mt_model = MarianMTModel.from_pretrained(checkpoint)
    return mt_model, mt_tokenizer

# Function to translate text using MarianMT
def translate_text(text, model, tokenizer):
    """Translate text using the MarianMT model.

    Args:
        text: Text to translate.
        model: Translation model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The translated text with surrounding whitespace removed, or an empty
        string when ``text`` is empty or blank.
    """
    # An empty answer has no translation; do not ask the model to generate
    # output from nothing.
    if not text or not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text.strip()

# Function for text-to-speech
def text_to_speech(text, language_code='en'):
    """Convert text to speech and play it.

    The speech is synthesized into a temporary mp3 file, played through the
    pygame mixer until playback finishes, and the file is then removed.

    Args:
        text: Text to speak. Empty or blank text is ignored.
        language_code: Language code passed to gTTS (default ``'en'``).
    """
    if not text or not text.strip():
        return

    audio_file = f"answer_{int(time.time())}.mp3"
    try:
        gTTS(text, lang=language_code).save(audio_file)
        pygame.mixer.init()
        pygame.mixer.music.load(audio_file)
        pygame.mixer.music.play()

        # Poll about ten times per second until playback has finished.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
        pygame.mixer.music.stop()
    finally:
        # Shut the mixer down first so the file is no longer in use, then
        # remove the temporary file even if synthesis or playback failed.
        pygame.mixer.quit()
        if os.path.exists(audio_file):
            os.remove(audio_file)

# Function to load the YOLOv5 model
def load_yolov5_model():
    """Fetch the YOLOv5 object detection model through torch.hub.

    Returns:
        The 'yolov5s' (small) model, chosen for speed.
    """
    return torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Function to perform object detection on video frames and collect detected objects
def detect_objects_in_video(video_path, model):
    """Detect objects in the video and return the detected objects per frame.

    Only every 5th frame is run through the model. Each processed frame is
    shown in a preview window with the detections drawn on it; pressing 'q'
    stops the processing early.

    Args:
        video_path: Path of the video file to analyse.
        model: YOLOv5 model; calling it on an image must return a results
            object exposing ``names``, ``xyxy`` and ``render()``.

    Returns:
        A list with one entry per processed frame, each entry being the list
        of object names detected in that frame.
    """
    detected_objects = []
    frame_skip = 5  # Process every 5th frame to improve speed
    frames_read = 0
    cap = cv2.VideoCapture(video_path)

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Count frames ourselves instead of relying on the float,
            # backend-dependent CAP_PROP_POS_FRAMES property.
            frames_read += 1
            if frames_read % frame_skip != 0:
                continue

            # OpenCV decodes frames as BGR, whereas the model expects RGB.
            results = model(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_objects = [results.names[int(cls)]
                             for *_box, _conf, cls in results.xyxy[0]]
            detected_objects.append(frame_objects)

            # Draw the detections and convert back to BGR for display.
            annotated = cv2.cvtColor(results.render()[0], cv2.COLOR_RGB2BGR)
            cv2.imshow('YOLOv5 Detection', annotated)

            if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to exit
                break
    finally:
        # Release the capture and close the preview window even on errors.
        cap.release()
        cv2.destroyAllWindows()

    return detected_objects

# Function to create a meaningful scene description
def create_scene_description(detected_objects, transcription):
    """Create a description of the scene from detected objects and narration.

    Args:
        detected_objects: List of per-frame lists of detected object names.
        transcription: Transcribed narration of the video (may be empty).

    Returns:
        A description naming every object detected more than once (in order of
        first appearance), followed by the narration. Parts with no content
        are omitted, so the result may be an empty string.
    """
    object_count = Counter(
        obj for frame_objects in detected_objects for obj in frame_objects
    )
    # Objects seen only once are treated as noise and left out.
    recurring_objects = [obj for obj, count in object_count.items() if count > 1]

    sentences = []
    if recurring_objects:
        sentences.append(f"In this video, you see {' and '.join(recurring_objects)}.")

    # Strip the narration so a leading space does not produce a double space,
    # and skip the sentence entirely when there is no narration.
    narration = transcription.strip() if transcription else ""
    if narration:
        sentences.append(f"The narration says: {narration}")

    return " ".join(sentences)

# Main function to run the entire process
def main(video_path):
    """Run the video processing pipeline and the interactive Q&A loop.

    The audio is extracted and transcribed, the transcription is summarized,
    objects are detected in the video, and a spoken scene description is
    produced. The user can then ask questions about the transcription and
    optionally have each answer translated before it is spoken.

    Args:
        video_path: Path of the video file to process.
    """
    # Step 1: Extract audio from the video
    audio_path = "extracted_audio.wav"
    extract_audio_from_video(video_path, audio_path)
    print("Audio extracted successfully.")

    # Step 2: Transcribe audio to text
    transcription = transcribe_audio(audio_path)
    print("Transcription:", transcription)

    # Step 3: Summarize transcription
    transcription_summary = summarize_transcription(transcription)
    print("Transcription Summary:", transcription_summary)

    # Step 4: Load the YOLOv5 model and detect objects in the video
    yolov5_model = load_yolov5_model()
    detected_objects = detect_objects_in_video(video_path, yolov5_model)

    # Step 5: Create scene descriptions
    scene_description = create_scene_description(detected_objects, transcription)
    print("Scene Description:", scene_description)

    # Convert the scene description to speech
    text_to_speech(scene_description)

    # Initialize the question-answering model
    tokenizer, model = init_qa_model()

    # Translation models are expensive to load: keep one per target language
    # instead of reloading it for every question.
    translation_models = {}

    # Step 6: Loop for answering questions
    while True:
        question = input("Ask a question (or type 'exit' to quit): ").strip()
        if question.lower() == 'exit':
            break
        if not question:
            continue
        answer = answer_question(question, transcription, tokenizer, model)
        print("Answer:", answer)

        # Step 7: Translate answer if desired
        translate = input("Do you want to translate the answer? (yes/no): ").strip().lower()
        if translate != 'yes':
            text_to_speech(answer)
            continue

        target_lang = input("Enter target language code (e.g., 'fr' for French, 'es' for Spanish): ").strip().lower()
        if target_lang not in translation_models:
            try:
                translation_models[target_lang] = load_translation_model('en', target_lang)
            except OSError as err:
                # Unknown language pair or download failure: keep the session
                # alive and fall back to the untranslated answer.
                print(f"Could not load a translation model for '{target_lang}': {err}")
                text_to_speech(answer)
                continue

        translation_model, translation_tokenizer = translation_models[target_lang]
        translated_answer = translate_text(answer, translation_model, translation_tokenizer)
        print(f"Translated Answer ({target_lang}):", translated_answer)
        text_to_speech(translated_answer, target_lang)

# Run the main function with your video file path
if __name__ == "__main__":
    # Path of the video to process; replace with your own video file path.
    video_file_path = "44.mp4"
    main(video_file_path)


MoviePy - Writing audio in extracted_audio.wav


                                                                                                                       

MoviePy - Done.
Audio extracted successfully.
Transcription:  This is a Formula 1 car. This is especially a Mercedes-Benz petrolniss F1 car. This is Louis Hamilton's car. Its number is 34. He is a 7-time Volvo champion and he is awesome.


Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Transcription Summary: This is a Formula 1 car. This is especially a Mercedes-Benz petrolniss F1 car. Louis Hamilton is a 7-time Volvo champion.


Using cache found in C:\Users\nkeer/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-12 Python-3.12.4 torch-2.4.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Scene Description: In this video, you see motorcycle and car. The narration says:  This is a Formula 1 car. This is especially a Mercedes-Benz petrolniss F1 car. This is Louis Hamilton's car. Its number is 34. He is a 7-time Volvo champion and he is awesome.
