Using Whisper in a commercial SaaS product requires careful consideration of licensing and potential limitations. Here's what you should know:

Licensing Status:


Whisper is released under the MIT License
This is a permissive license that allows commercial use
You can use, modify, and distribute it in your SaaS product


Potential Concerns:


While the license is permissive, the model depends on PyTorch which has its own license and terms
Resource costs may scale significantly with usage
Running Whisper locally means you're responsible for model updates and maintenance
No official support from OpenAI for production deployments


Alternatives to Consider:


Cloud Speech-to-Text services:

AWS Transcribe
Google Cloud Speech-to-Text
Azure Speech Services


These services provide:

SLAs for uptime
Official support
Managed infrastructure
Regular updates
Better scalability


Recommendations:


For early stage/small scale: Whisper is fine
For scaling up consider:

Implementing a hybrid approach (Whisper + cloud backup)
Gradually transitioning to cloud services
Running internal benchmarks to compare accuracy and costs

The main risk isn't legal but rather technical and operational. As you scale, managing your own speech recognition infrastructure might become more complex and costly than using managed services.
Would you like me to elaborate on any of these points or discuss specific cloud alternatives in more detail?

In [None]:
import os
import shutil
import numpy as np
import torch
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pydub.playback import play

# Step 1: Extract Audio from Video
def preprocess_video(video_path, output_dir="processed_video"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    audio_path = os.path.join(output_dir, "audio.wav")

    video = VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path, codec="pcm_s16le")
    return audio_path

# Step 2: Transcribe Audio
def transcribe_audio(audio_path):
    from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC

    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

    audio = AudioSegment.from_file(audio_path).set_frame_rate(16000).set_channels(1)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
    inputs = tokenizer(samples, return_tensors="pt", padding="longest")

    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    return transcription

# Step 3: Create Knowledge Base
def create_knowledge_base(transcription):
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

    def embed_text(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            embeddings = model.encoder(**inputs).last_hidden_state.mean(dim=1).squeeze()
        return embeddings.cpu().numpy()

    segments = transcription.split(". ")
    embeddings = [embed_text(segment) for segment in segments]

    dimension = embeddings[0].shape[0]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(np.array(embeddings, dtype=np.float32))

    return faiss_index, segments

# Step 4: Chatbot Response
def chatbot_response(query, faiss_index, segments, tokenizer, model):
    def embed_text(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            embeddings = model.encoder(**inputs).last_hidden_state.mean(dim=1).squeeze()
        return embeddings.cpu().numpy()

    query_embedding = np.array([embed_text(query)], dtype=np.float32)
    distances, indices = faiss_index.search(query_embedding, k=1)
    relevant_segment = segments[indices[0][0]]

    input_text = f"Question: {query} Context: {relevant_segment}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    outputs = model.generate(**inputs, max_length=50, num_beams=3, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Testing Pipeline
def test_pipeline(video_path, questions):
    output_dir = "processed_video"

    print("Preprocessing video...")
    audio_path = preprocess_video(video_path, output_dir)

    print("Transcribing audio...")
    transcription = transcribe_audio(audio_path)
    print(f"Transcription: {transcription[:200]}...")  # Debugging snippet

    print("Creating knowledge base...")
    faiss_index, segments = create_knowledge_base(transcription)
    print(f"Segments: {segments[:3]}")  # Debugging snippet

    print("Loading conversational model...")
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

    print("Testing chatbot...")
    for question in questions:
        response = chatbot_response(question, faiss_index, segments, tokenizer, model)
        print(f"Q: {question}")
        print(f"A: {response}")

    shutil.rmtree(output_dir)

# Main Function
if __name__ == "__main__":
    sample_video = "/content/videoplayback.mp4"  # Replace with your video file
    sample_questions = [
        "What is the video about?",
        "Can you summarize the content?",
        "What are the main points discussed?",
        "What is she having for lunch?",
        "Where is she going for lunch?"
    ]

    test_pipeline(sample_video, sample_questions)


  if event.key is 'enter':



Preprocessing video...
MoviePy - Writing audio in processed_video/audio.wav




MoviePy - Done.
Transcribing audio...


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.



pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcription: I JUST GOT OUT OF A TWO HOUR LECTURE SO I'M HEADED TO MY SWARDY HOUSE TO GET SOME LUNCH AS WE KNOW I HAVE BEEN SICK SO I NEED TO MAKE SURE I'M EATING WITH MY MEDICINE I THINK TODAY IS MEATBALL SUVES B...
Creating knowledge base...
Segments: ["I JUST GOT OUT OF A TWO HOUR LECTURE SO I'M HEADED TO MY SWARDY HOUSE TO GET SOME LUNCH AS WE KNOW I HAVE BEEN SICK SO I NEED TO MAKE SURE I'M EATING WITH MY MEDICINE I THINK TODAY IS MEATBALL SUVES BUT IM MUSGOINTO GET NEATBALLS GOES OF MY THEAVER PARTS LET'S GO TOGETHER  ME MEDY DEDECORATIONS ARE GORGEOUS MY UNCLE IS IN MY RIGHT TO KNOW IT I HAVE SO MANY SOTS TO PICKER  CAN I JOIN YOU GUY THINK   MA LIITS WRIM LI LOOKING TAN MAIN JUST KINNYAN KEEP ALL ALLI UTTEY NEED TO GO GET A SOWED I  WE GOT NECOLS MOFFLES RINE AND WATER MIND TIS I MEDIAT ABOUT I YE  MINDIS TINKIN E ELA I THINK THAT'S RIGHT I JUST APPLIED FOR GRADUATION ALL DONE JA DEA I HAVE TO GO DO HOMEARCH SHAN AN I ARE WALKING BACK HOME NOW SO I'LL SEE AGUIS LATER BAGIES"]

In [None]:
!pip install --upgrade transformers



# **Working**

In [None]:
import os
import numpy as np
import torch
import json
import warnings
from typing import List, Dict, Union, Optional
from transformers import pipeline
import whisper
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from moviepy.editor import VideoFileClip
import logging
from datetime import datetime
import soundfile as sf
import librosa

class FixedVideoChatbot:
    def __init__(self, cache_dir: str = "video_cache", log_file: str = "chatbot.log"):
        self.setup_logging(log_file)
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.logger.info(f"Using device: {self.device}")

            self.transcription_model = whisper.load_model("base", device=self.device)
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.qa_pipeline = pipeline(
                "question-answering",
                model="deepset/roberta-base-squad2",
                device=0 if torch.cuda.is_available() else -1
            )
            self.logger.info("Models loaded successfully")
        except Exception as e:
            self.logger.error(f"Error loading models: {str(e)}")
            raise

        self.cached_data = {}

    def setup_logging(self, log_file: str):
        self.logger = logging.getLogger('VideoChatbot')
        self.logger.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

    def extract_audio(self, video_path: str) -> np.ndarray:
        """Extract audio using librosa instead of moviepy."""
        try:
            # Extract audio using librosa
            self.logger.info(f"Extracting audio from {video_path}")
            audio_array, sr = librosa.load(video_path, sr=16000, mono=True)

            # Ensure audio is the correct format for whisper
            if not isinstance(audio_array, np.ndarray):
                raise ValueError("Audio extraction failed")

            if len(audio_array.shape) != 1:
                audio_array = audio_array.mean(axis=-1)

            return audio_array.astype(np.float32)

        except Exception as e:
            self.logger.error(f"Error in audio extraction: {str(e)}")
            try:
                # Fallback method using moviepy
                self.logger.info("Attempting fallback audio extraction")
                with VideoFileClip(video_path) as video:
                    if video.audio is None:
                        raise ValueError("Video has no audio track")
                    audio_array = video.audio.to_soundarray(fps=16000)
                    if len(audio_array.shape) > 1:
                        audio_array = audio_array.mean(axis=1)
                    return audio_array.astype(np.float32)
            except Exception as fallback_error:
                self.logger.error(f"Fallback audio extraction failed: {str(fallback_error)}")
                raise

    def process_chunks(self, audio_array: np.ndarray, chunk_duration: int = 30) -> List[Dict]:
        """Process audio in chunks to handle memory constraints."""
        chunk_size = chunk_duration * 16000  # assuming 16kHz sample rate
        chunks = []

        for i in range(0, len(audio_array), chunk_size):
            chunk = audio_array[i:i + chunk_size]
            if len(chunk) < 100:  # Skip very small chunks
                continue

            # Ensure chunk is 1D array
            if len(chunk.shape) > 1:
                chunk = chunk.mean(axis=-1)

            result = self.transcription_model.transcribe(chunk)

            # Adjust timestamps
            for segment in result["segments"]:
                segment["start"] += i / 16000  # Convert samples to seconds
                segment["end"] += i / 16000
                chunks.append(segment)

        return chunks

    def process_video(self, video_path: str) -> Dict:
        """Process video with improved error handling."""
        try:
            video_id = os.path.basename(video_path)
            cache_file = os.path.join(self.cache_dir, f"{video_id}_transcription.json")

            # Check cache
            if os.path.exists(cache_file):
                with open(cache_file, 'r') as f:
                    return json.load(f)

            # Extract and process audio
            audio_array = self.extract_audio(video_path)

            # Process in chunks
            transcription_segments = self.process_chunks(audio_array)

            # Combine results
            transcription_data = {
                "full_text": " ".join(seg["text"] for seg in transcription_segments),
                "segments": transcription_segments
            }

            # Cache results
            with open(cache_file, 'w') as f:
                json.dump(transcription_data, f)

            return transcription_data

        except Exception as e:
            self.logger.error(f"Error processing video {video_path}: {str(e)}")
            raise

    def find_relevant_context(self, query: str, transcription_data: Dict) -> str:
        """Find relevant context with improved error handling."""
        try:
            if not transcription_data["segments"]:
                return ""

            # Convert query to embedding
            query_embedding = self.embedding_model.encode([query])

            # Get segment texts and embeddings
            segment_texts = [s["text"] for s in transcription_data["segments"]]
            if not segment_texts:
                return ""

            segment_embeddings = self.embedding_model.encode(segment_texts)

            # Ensure proper shapes
            if len(query_embedding.shape) == 1:
                query_embedding = query_embedding.reshape(1, -1)
            if len(segment_embeddings.shape) == 1:
                segment_embeddings = segment_embeddings.reshape(1, -1)

            # Calculate similarities
            similarities = cosine_similarity(query_embedding, segment_embeddings)[0]

            # Get top segments
            top_k = min(3, len(similarities))
            top_indices = np.argsort(similarities)[-top_k:]

            # Combine context
            context = " ".join(segment_texts[i] for i in top_indices)
            return context

        except Exception as e:
            self.logger.error(f"Error finding context: {str(e)}")
            return ""

    def get_response(self, query: str, video_paths: Union[str, List[str]]) -> Dict:
        """Generate response with improved error handling."""
        try:
            if isinstance(video_paths, str):
                video_paths = [video_paths]

            all_contexts = []
            for video_path in video_paths:
                if video_path not in self.cached_data:
                    self.cached_data[video_path] = self.process_video(video_path)

                context = self.find_relevant_context(query, self.cached_data[video_path])
                if context:
                    all_contexts.append(context)

            if not all_contexts:
                return {
                    "answer": "No relevant information found in the videos.",
                    "confidence": 0.0,
                    "context": ""
                }

            combined_context = " ".join(all_contexts)

            # Generate answer
            answer = self.qa_pipeline(
                question=query,
                context=combined_context,
                max_answer_length=100
            )

            return {
                "answer": answer["answer"],
                "confidence": answer["score"],
                "context": combined_context
            }

        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            return {
                "error": str(e),
                "answer": "Sorry, I encountered an error processing your question.",
                "confidence": 0.0
            }

def main():
    # Example usage
    chatbot = FixedVideoChatbot()

    video_path = "/content/videoplayback.mp4"  # Your video path

    questions = [
        "What is the main topic discussed in the video?",
        "What are the key points mentioned?",
        "Can you summarize the conclusions?",
        "what she is having in lunch?",
        "where she is going for lunch?"
    ]

    for question in questions:
        try:
            response = chatbot.get_response(question, video_path)
            print(f"\nQuestion: {question}")
            print(f"Answer: {response['answer']}")
            print(f"Confidence: {response['confidence']:.2f}")
        except Exception as e:
            print(f"Error processing question: {str(e)}")

if __name__ == "__main__":
    main()

INFO:VideoChatbot:Using device: cpu
  checkpoint = torch.load(fp, map_location=device)

Device set to use cpu
INFO:VideoChatbot:Models loaded successfully



Question: What is the main topic discussed in the video?
Answer: media
Confidence: 0.09

Question: What are the key points mentioned?
Answer: media stuff
Confidence: 0.02

Question: Can you summarize the conclusions?
Answer: Bye guys.  I have to go do homework.
Confidence: 0.00

Question: what she is having in lunch?
Answer: meatballs, baffle fries, and watermelon
Confidence: 0.33

Question: where she is going for lunch?
Answer: my swarty house
Confidence: 0.13


In [None]:
import os
import numpy as np
import torch
import json
import warnings
from typing import List, Dict, Union, Optional
from transformers import pipeline
import whisper
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from moviepy.editor import VideoFileClip
import logging
from datetime import datetime
import soundfile as sf
import librosa

class FixedVideoChatbot:
    def __init__(self, cache_dir: str = "video_cache", log_file: str = "chatbot.log"):
        self.setup_logging(log_file)
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.logger.info(f"Using device: {self.device}")

            self.transcription_model = whisper.load_model("base", device=self.device)
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.qa_pipeline = pipeline(
                "question-answering",
                model="deepset/roberta-base-squad2",
                device=0 if torch.cuda.is_available() else -1
            )
            self.logger.info("Models loaded successfully")
        except Exception as e:
            self.logger.error(f"Error loading models: {str(e)}")
            raise

        self.cached_data = {}

    def setup_logging(self, log_file: str):
        self.logger = logging.getLogger('VideoChatbot')
        self.logger.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

    def extract_audio(self, video_path: str) -> np.ndarray:
        """Extract audio using librosa instead of moviepy."""
        try:
            # Extract audio using librosa
            self.logger.info(f"Extracting audio from {video_path}")
            audio_array, sr = librosa.load(video_path, sr=16000, mono=True)

            # Ensure audio is the correct format for whisper
            if not isinstance(audio_array, np.ndarray):
                raise ValueError("Audio extraction failed")

            if len(audio_array.shape) != 1:
                audio_array = audio_array.mean(axis=-1)

            return audio_array.astype(np.float32)

        except Exception as e:
            self.logger.error(f"Error in audio extraction: {str(e)}")
            try:
                # Fallback method using moviepy
                self.logger.info("Attempting fallback audio extraction")
                with VideoFileClip(video_path) as video:
                    if video.audio is None:
                        raise ValueError("Video has no audio track")
                    audio_array = video.audio.to_soundarray(fps=16000)
                    if len(audio_array.shape) > 1:
                        audio_array = audio_array.mean(axis=1)
                    return audio_array.astype(np.float32)
            except Exception as fallback_error:
                self.logger.error(f"Fallback audio extraction failed: {str(fallback_error)}")
                raise

    def process_chunks(self, audio_array: np.ndarray, chunk_duration: int = 30) -> List[Dict]:
        """Process audio in chunks to handle memory constraints."""
        chunk_size = chunk_duration * 16000  # assuming 16kHz sample rate
        chunks = []

        for i in range(0, len(audio_array), chunk_size):
            chunk = audio_array[i:i + chunk_size]
            if len(chunk) < 100:  # Skip very small chunks
                continue

            # Ensure chunk is 1D array
            if len(chunk.shape) > 1:
                chunk = chunk.mean(axis=-1)

            result = self.transcription_model.transcribe(chunk)

            # Adjust timestamps
            for segment in result["segments"]:
                segment["start"] += i / 16000  # Convert samples to seconds
                segment["end"] += i / 16000
                chunks.append(segment)

        return chunks

    def process_video(self, video_path: str) -> Dict:
        """Process video with improved error handling."""
        try:
            video_id = os.path.basename(video_path)
            cache_file = os.path.join(self.cache_dir, f"{video_id}_transcription.json")

            # Check cache
            if os.path.exists(cache_file):
                with open(cache_file, 'r') as f:
                    return json.load(f)

            # Extract and process audio
            audio_array = self.extract_audio(video_path)

            # Process in chunks
            transcription_segments = self.process_chunks(audio_array)

            # Combine results
            transcription_data = {
                "full_text": " ".join(seg["text"] for seg in transcription_segments),
                "segments": transcription_segments
            }

            # Cache results
            with open(cache_file, 'w') as f:
                json.dump(transcription_data, f)

            return transcription_data

        except Exception as e:
            self.logger.error(f"Error processing video {video_path}: {str(e)}")
            raise

    def find_relevant_context(self, query: str, transcription_data: Dict) -> str:
        """Find relevant context with improved error handling."""
        try:
            if not transcription_data["segments"]:
                return ""

            # Convert query to embedding
            query_embedding = self.embedding_model.encode([query])

            # Get segment texts and embeddings
            segment_texts = [s["text"] for s in transcription_data["segments"]]
            if not segment_texts:
                return ""

            segment_embeddings = self.embedding_model.encode(segment_texts)

            # Ensure proper shapes
            if len(query_embedding.shape) == 1:
                query_embedding = query_embedding.reshape(1, -1)
            if len(segment_embeddings.shape) == 1:
                segment_embeddings = segment_embeddings.reshape(1, -1)

            # Calculate similarities
            similarities = cosine_similarity(query_embedding, segment_embeddings)[0]

            # Get top segments
            top_k = min(3, len(similarities))
            top_indices = np.argsort(similarities)[-top_k:]

            # Combine context
            context = " ".join(segment_texts[i] for i in top_indices)
            return context

        except Exception as e:
            self.logger.error(f"Error finding context: {str(e)}")
            return ""

    def get_response(self, query: str, video_paths: Union[str, List[str]]) -> Dict:
        """Generate response with improved error handling."""
        try:
            if isinstance(video_paths, str):
                video_paths = [video_paths]

            all_contexts = []
            for video_path in video_paths:
                if video_path not in self.cached_data:
                    self.cached_data[video_path] = self.process_video(video_path)

                context = self.find_relevant_context(query, self.cached_data[video_path])
                if context:
                    all_contexts.append(context)

            if not all_contexts:
                return {
                    "answer": "No relevant information found in the videos.",
                    "confidence": 0.0,
                    "context": ""
                }

            combined_context = " ".join(all_contexts)

            # Generate answer
            answer = self.qa_pipeline(
                question=query,
                context=combined_context,
                max_answer_length=100
            )

            return {
                "answer": answer["answer"],
                "confidence": answer["score"],
                "context": combined_context
            }

        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            return {
                "error": str(e),
                "answer": "Sorry, I encountered an error processing your question.",
                "confidence": 0.0
            }

def main():
    # Example usage
    chatbot = FixedVideoChatbot()

    video_path = "/content/test.mp4"  # Your video path

    questions = [
        "What is the main topic discussed in the video?",
        "What are the key points mentioned?",
        "who is she?",
        "what she is cooking?",
        "why she is cooking?"
    ]

    for question in questions:
        try:
            response = chatbot.get_response(question, video_path)
            print(f"\nQuestion: {question}")
            print(f"Answer: {response['answer']}")
            print(f"Confidence: {response['confidence']:.2f}")
        except Exception as e:
            print(f"Error processing question: {str(e)}")

if __name__ == "__main__":
    main()

INFO:VideoChatbot:Using device: cpu
  checkpoint = torch.load(fp, map_location=device)

Device set to use cpu
INFO:VideoChatbot:Models loaded successfully
INFO:VideoChatbot:Extracting audio from /content/test.mp4
  audio_array, sr = librosa.load(video_path, sr=16000, mono=True)

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)





Question: What is the main topic discussed in the video?
Answer: green flags
Confidence: 0.00

Question: What are the key points mentioned?
Answer: respectful, funny, positive, generous, emotionally healthy, and hot
Confidence: 0.25

Question: who is she?
Answer: single life
Confidence: 0.04

Question: what she is cooking?
Answer: Ubucho about with four different toppings for a picnic
Confidence: 0.03

Question: why she is cooking?
Answer: He made me want to cook for him
Confidence: 0.00


In [2]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-mw_ypv6q
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-mw_ypv6q
  Resolved https://github.com/openai/whisper.git to commit 90db0de1896c23cbfaf0c58bc2d30665f709f170
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20240930)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

Tried image/video but not working

In [4]:
import os
import numpy as np
import torch
import json
from typing import List, Dict, Union, Tuple
from transformers import pipeline
import whisper
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import librosa
import logging
import cv2
from PIL import Image

class EnhancedVideoChatbot:
    def __init__(self, cache_dir: str = "video_cache", log_file: str = "chatbot.log"):
        self.setup_logging(log_file)
        self.cache_dir = cache_dir
        self.frame_cache_dir = os.path.join(cache_dir, "frames")
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(self.frame_cache_dir, exist_ok=True)

        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.logger.info(f"Using device: {self.device}")

            # Initialize models
            self.transcription_model = whisper.load_model("base", device=self.device)
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.qa_pipeline = pipeline(
                "question-answering",
                model="deepset/roberta-base-squad2",
                device=0 if torch.cuda.is_available() else -1
            )

            self.logger.info("Models loaded successfully")
        except Exception as e:
            self.logger.error(f"Error loading models: {str(e)}")
            raise

        self.cached_data = {}
        self.video_metadata = {}

    def setup_logging(self, log_file: str):
        self.logger = logging.getLogger('VideoChatbot')
        self.logger.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

    def extract_frames(self, video_path: str, segment_timestamps: List[Dict]) -> Dict[float, str]:
        """Extract and save frames at relevant timestamps."""
        try:
            video_id = os.path.basename(video_path)
            cap = cv2.VideoCapture(video_path)
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_dict = {}

            for segment in segment_timestamps:
                start_time = segment["start"]
                frame_number = int(start_time * fps)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
                ret, frame = cap.read()

                if ret:
                    frame_path = os.path.join(
                        self.frame_cache_dir,
                        f"{video_id}_frame_{start_time:.2f}.jpg"
                    )
                    cv2.imwrite(frame_path, frame)
                    frame_dict[start_time] = frame_path

            cap.release()
            return frame_dict

        except Exception as e:
            self.logger.error(f"Error extracting frames: {str(e)}")
            return {}

    def process_video(self, video_path: str) -> Dict:
        try:
            video_id = os.path.basename(video_path)
            cache_file = os.path.join(self.cache_dir, f"{video_id}_transcription.json")

            if os.path.exists(cache_file):
                with open(cache_file, 'r') as f:
                    processed_data = json.load(f)
                    if video_id not in self.video_metadata:
                        self.video_metadata[video_id] = {
                            "path": video_path,
                            "frames": processed_data.get("frames", {})
                        }
                    return processed_data

            # Extract audio and transcribe
            audio_array = self.extract_audio(video_path)
            transcription = {"text": "", "segments": []}

            # Process in chunks
            max_duration = 25 * 16000  # 25 seconds chunks
            for i in range(0, len(audio_array), max_duration):
                chunk = audio_array[i:min(i + max_duration, len(audio_array))]
                result = self.transcription_model.transcribe(chunk)

                for segment in result["segments"]:
                    segment["start"] += i / 16000
                    segment["end"] += i / 16000
                    transcription["segments"].append(segment)
                transcription["text"] += " " + result["text"]

            # Merge segments and extract frames
            merged_segments = []
            current_segment = {"text": "", "start": 0, "end": 0}

            for segment in transcription["segments"]:
                if len(current_segment["text"].split()) < 15:
                    current_segment["text"] += " " + segment["text"]
                    current_segment["end"] = segment["end"]
                else:
                    if current_segment["text"]:
                        merged_segments.append(current_segment.copy())
                    current_segment = segment

            if current_segment["text"]:
                merged_segments.append(current_segment)

            # Extract frames for each segment
            frame_dict = self.extract_frames(video_path, merged_segments)

            transcription_data = {
                "full_text": transcription["text"].strip(),
                "segments": merged_segments,
                "frames": frame_dict
            }

            with open(cache_file, 'w') as f:
                json.dump(transcription_data, f)

            self.video_metadata[video_id] = {
                "path": video_path,
                "frames": frame_dict
            }

            return transcription_data

        except Exception as e:
            self.logger.error(f"Error processing video {video_path}: {str(e)}")
            raise

    def extract_audio(self, video_path: str) -> np.ndarray:
        try:
            audio_array, sr = librosa.load(video_path, sr=16000, mono=True)
            return audio_array.astype(np.float32)
        except Exception as e:
            self.logger.error(f"Error in audio extraction: {str(e)}")
            raise

    def get_relevant_segments(self, query: str, transcription_data: Dict) -> List[Dict]:
        try:
            if not transcription_data["segments"]:
                return []

            query_embedding = self.embedding_model.encode([query], convert_to_tensor=True)
            segment_texts = [s["text"] for s in transcription_data["segments"]]
            segment_embeddings = self.embedding_model.encode(segment_texts, convert_to_tensor=True)

            similarities = cosine_similarity(
                query_embedding.cpu().numpy(),
                segment_embeddings.cpu().numpy()
            )[0]

            # Get top segments with their similarity scores
            top_k = min(3, len(similarities))
            top_indices = np.argsort(similarities)[-top_k:]

            relevant_segments = []
            for idx in top_indices:
                segment = transcription_data["segments"][idx].copy()
                segment["similarity"] = float(similarities[idx])
                relevant_segments.append(segment)

            return sorted(relevant_segments, key=lambda x: x["similarity"], reverse=True)

        except Exception as e:
            self.logger.error(f"Error finding relevant segments: {str(e)}")
            return []

    def get_response(self, query: str, video_paths: Union[str, List[str]]) -> Dict:
        try:
            if isinstance(video_paths, str):
                video_paths = [video_paths]

            all_responses = []

            for video_path in video_paths:
                video_id = os.path.basename(video_path)

                if video_path not in self.cached_data:
                    self.cached_data[video_path] = self.process_video(video_path)

                # Get relevant segments with timestamps
                relevant_segments = self.get_relevant_segments(
                    query,
                    self.cached_data[video_path]
                )

                if not relevant_segments:
                    continue

                # Get frames for relevant segments
                frames = self.cached_data[video_path].get("frames", {})

                for segment in relevant_segments:
                    if segment["similarity"] < 0.3:  # Similarity threshold
                        continue

                    start_time = segment["start"]
                    frame_path = None

                    # Find the closest frame
                    if frames:
                        closest_time = min(frames.keys(), key=lambda x: abs(float(x) - start_time))
                        frame_path = frames[closest_time]

                    response = {
                        "video_id": video_id,
                        "video_path": video_path,
                        "segment_text": segment["text"],
                        "start_time": start_time,
                        "end_time": segment["end"],
                        "confidence": segment["similarity"],
                        "frame_path": frame_path
                    }

                    all_responses.append(response)

            if not all_responses:
                return {
                    "answer": "No relevant information found in the videos.",
                    "confidence": 0.0,
                    "segments": []
                }

            # Sort responses by confidence
            all_responses.sort(key=lambda x: x["confidence"], reverse=True)

            # Get the best answer using QA pipeline
            best_context = " ".join([r["segment_text"] for r in all_responses[:2]])
            qa_result = self.qa_pipeline(
                question=query,
                context=best_context,
                max_answer_length=100,
                handle_impossible_answer=True
            )

            return {
                "answer": qa_result["answer"],
                "confidence": qa_result["score"],
                "segments": all_responses  # Include all relevant segments with metadata
            }

        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            return {
                "error": str(e),
                "answer": "Sorry, I encountered an error processing your question.",
                "confidence": 0.0,
                "segments": []
            }

def main():
    chatbot = EnhancedVideoChatbot()
    video_paths = ["/content/test.mp4", "/content/videoplayback.mp4"]  # Example with multiple videos

    questions = [
        "What is the main topic discussed in the video?",
        "What are the key points mentioned?",
        "who is she?",
        "what she is making?",
        "why she is cooking?",
        "how swarty house looks like?"
    ]

    print("Processing videos and generating responses...")
    for question in questions:
        try:
            response = chatbot.get_response(question, video_paths)
            print(f"\nQuestion: {question}")
            print(f"Answer: {response['answer']}")
            print(f"Confidence: {response['confidence']:.2f}")

            print("\nRelevant video segments:")
            for segment in response["segments"]:
                print(f"\nVideo: {segment['video_id']}")
                print(f"Timestamp: {segment['start_time']:.2f}s - {segment['end_time']:.2f}s")
                print(f"Text: {segment['segment_text']}")
                if segment['frame_path']:
                    print(f"Frame captured at: {segment['frame_path']}")
                print(f"Confidence: {segment['confidence']:.2f}")

        except Exception as e:
            print(f"Error processing question: {str(e)}")

if __name__ == "__main__":
    main()

INFO:VideoChatbot:Using device: cpu
Device set to use cpu
INFO:VideoChatbot:Models loaded successfully


Processing videos and generating responses...


  audio_array, sr = librosa.load(video_path, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio_array, sr = librosa.load(video_path, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



Question: What is the main topic discussed in the video?
Answer: No relevant information found in the videos.
Confidence: 0.00

Relevant video segments:

Question: What are the key points mentioned?
Answer: No relevant information found in the videos.
Confidence: 0.00

Relevant video segments:

Question: who is she?
Answer: No relevant information found in the videos.
Confidence: 0.00

Relevant video segments:

Question: what she is making?
Answer: No relevant information found in the videos.
Confidence: 0.00

Relevant video segments:

Question: why she is cooking?
Answer: packing us new butchle about with four different toppings for a picnic
Confidence: 0.07

Relevant video segments:

Video: test.mp4
Timestamp: 0.00s - 3.76s
Text:   Hey, remember I got the ick from the guy who asked me to cook on her date?
Frame captured at: video_cache/frames/test.mp4_frame_0.00.jpg
Confidence: 0.47

Video: test.mp4
Timestamp: 7.16s - 14.40s
Text:  He made me want to cook for him.  And when I say co

In [14]:
import os
import numpy as np
import torch
import json
from typing import List, Dict
from transformers import pipeline
import whisper
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import cv2
from moviepy.editor import VideoFileClip

class VideoQA:
    def __init__(self, output_dir: str = "outputs"):
        # Create output directories
        self.output_dir = output_dir
        self.clips_dir = os.path.join(output_dir, "clips")
        self.frames_dir = os.path.join(output_dir, "frames")
        os.makedirs(self.clips_dir, exist_ok=True)
        os.makedirs(self.frames_dir, exist_ok=True)

        # Initialize models
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.transcriber = whisper.load_model("base", device=self.device)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2", device=0 if self.device=="cuda" else -1)

        self.video_data = {}

    def process_video(self, video_path: str) -> None:
        """Process video and store transcription with timestamps"""
        video_id = os.path.basename(video_path)

        if video_id in self.video_data:
            return

        # Transcribe video
        result = self.transcriber.transcribe(video_path)

        # Store segments with timestamps
        self.video_data[video_id] = {
            "path": video_path,
            "segments": result["segments"],
            "full_text": result["text"]
        }

    def extract_clip(self, video_path: str, start_time: float, end_time: float) -> Dict:
        """Extract clip and frame from video"""
        video_id = os.path.basename(video_path)

        # Add small padding to clip
        start_time = max(0, start_time - 1)
        duration = end_time - start_time + 2

        # Extract clip
        clip_path = os.path.join(self.clips_dir, f"{video_id}_{start_time:.1f}_{end_time:.1f}.mp4")
        if not os.path.exists(clip_path):
            with VideoFileClip(video_path) as video:
                clip = video.subclip(start_time, end_time)
                clip.write_videofile(clip_path, codec='libx264', audio_codec='aac')

        # Extract middle frame
        mid_time = (start_time + end_time) / 2
        frame_path = os.path.join(self.frames_dir, f"{video_id}_{mid_time:.1f}.jpg")
        if not os.path.exists(frame_path):
            cap = cv2.VideoCapture(video_path)
            cap.set(cv2.CAP_PROP_POS_MSEC, mid_time * 1000)
            ret, frame = cap.read()
            if ret:
                cv2.imwrite(frame_path, frame)
            cap.release()

        return {
            "clip_path": clip_path,
            "frame_path": frame_path,
            "start": start_time,
            "end": end_time
        }

    def find_answer(self, question: str, video_paths: List[str]) -> Dict:
        """Find answer in videos with relevant clips/frames"""
        all_answers = []

        for video_path in video_paths:
            video_id = os.path.basename(video_path)

            # Process video if needed
            self.process_video(video_path)
            segments = self.video_data[video_id]["segments"]

            # Get question embedding
            q_embedding = self.embedder.encode(question, convert_to_tensor=True)

            # Find relevant segments
            relevant_segments = []
            for seg in segments:
                text_embedding = self.embedder.encode(seg["text"], convert_to_tensor=True)
                similarity = cosine_similarity(
                    q_embedding.cpu().numpy().reshape(1, -1),
                    text_embedding.cpu().numpy().reshape(1, -1)
                )[0][0]

                if similarity > 0.3:  # Similarity threshold
                    seg["similarity"] = similarity
                    relevant_segments.append(seg)

            if not relevant_segments:
                continue

            # Sort by similarity
            relevant_segments.sort(key=lambda x: x["similarity"], reverse=True)

            # Get best segments
            context = " ".join(seg["text"] for seg in relevant_segments[:2])

            # Get answer
            answer = self.qa_model(question=question, context=context)

            if answer["score"] > 0.1:  # Confidence threshold
                # Extract video clip for best matching segment
                best_segment = relevant_segments[0]
                media = self.extract_clip(
                    video_path,
                    best_segment["start"],
                    best_segment["end"]
                )

                all_answers.append({
                    "video_id": video_id,
                    "answer": answer["answer"],
                    "confidence": answer["score"],
                    "clip_path": media["clip_path"],
                    "frame_path": media["frame_path"],
                    "start_time": media["start"],
                    "end_time": media["end"],
                    "context": best_segment["text"]
                })

        if not all_answers:
            return {"error": "No relevant answer found in videos"}

        # Sort by confidence
        all_answers.sort(key=lambda x: x["confidence"], reverse=True)

        return {
            "answers": all_answers,
            "best_answer": {
                "text": " | ".join(f"From {a['video_id']}: {a['answer']}" for a in all_answers),
                "confidence": max(a["confidence"] for a in all_answers)
            }
        }

def main():
    # Example usage
    qa_system = VideoQA()
    videos = ["/content/test.mp4", "/content/videoplayback.mp4"]

    questions = [
        "What is the main topic discussed in the video?",
        "What are the key points mentioned?",
        "who is she?",
        "what she is making?",
        "why she is cooking?",
        "how swarty house looks like?"
    ]
    # Ensure consistent indentation for the loop
    for question in questions:
        result = qa_system.find_answer(question, videos)

        print(f"Question: {question}")
        #print(f"\nBest Answer: {result['best_answer']['text']}")
        print(f"Confidence: {result['answer']['confidence']:.2f}")

        print("\nDetailed Answers:")
        for ans in result["answers"]:
            print(f"\nFrom Video: {ans['video_id']}")
            print(f"Answer: {ans['answer']}")
            print(f"Timestamp: {ans['start_time']:.1f}s to {ans['end_time']:.1f}s")
            print(f"Clip saved at: {ans['clip_path']}")
            print(f"Frame saved at: {ans['frame_path']}")
            print(f"Context: {ans['context']}")

if __name__ == "__main__":
    main()

  checkpoint = torch.load(fp, map_location=device)

Device set to use cpu




Question: What is the main topic discussed in the video?


KeyError: 'answer'