In [1]:
!pip install ffmpeg-python openai-whisper sentence-transformers chromadb tiktoken requests


Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
     ---------------------------------------- 0.0/803.2 kB ? eta -:--:--
     ---------------------------------------- 0.0/803.2 kB ? eta -:--:--
     ------------- -------------------------- 262.1/803.2 kB ? eta -:--:--
     -------------------------------------- 803.2/803.2 kB 1.7 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml): started
  Building wheel for openai-whisper (pyproject.toml): finished with status 'done'
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import ffmpeg
import os

def extract_audio(video_path, output_audio_path="audio.wav"):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        output_audio_path: Destination file; defaults to "audio.wav".

    Returns:
        ``output_audio_path``, unchanged, so the call can be chained.

    The output is requested as WAV with the ``pcm_s16le`` codec, one audio
    channel (``ac=1``) and a 16000 Hz sample rate (``ar='16000'``).
    """
    wav_options = {"format": "wav", "acodec": "pcm_s16le", "ac": 1, "ar": "16000"}

    source = ffmpeg.input(video_path)
    target = source.output(output_audio_path, **wav_options)
    # Replace an existing output file instead of stopping to ask.
    target = target.overwrite_output()
    # quiet=True keeps ffmpeg's console output out of the notebook.
    target.run(quiet=True)

    return output_audio_path


In [3]:
# Input video whose audio will be transcribed and indexed.
# NOTE(review): absolute, machine-specific Windows path — point this at your own file before running.
video_path=r"C:\Users\harsh\Videos\Scam 2003\1.mkv"

In [5]:
# Extract the audio to extract_audio's default output ("audio.wav"); `result` holds the returned path.
result=extract_audio(video_path)

In [6]:
result

'audio.wav'

In [7]:
import whisper

# The Whisper model gets its own name: a later cell rebinds the generic global `model`
# to a SentenceTransformer, which would otherwise break transcribe_audio on re-use.
whisper_model = whisper.load_model("base")  # Use "medium" or "large" for better quality
model = whisper_model  # backward-compatible alias for anything that still reads `model`

def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``whisper_model.transcribe``.

    Returns:
        The ``'segments'`` entry of Whisper's result: a list of segments with
        start, end, and text.
    """
    result = whisper_model.transcribe(audio_path, verbose=False)
    return result['segments']  # List of segments with start, end, and text


100%|████████████████████████████████████████| 139M/139M [03:05<00:00, 784kiB/s]


In [9]:
# `result` is expected to still hold the WAV path returned by extract_audio at this point.
audio_path=result

In [10]:
# Transcribe the extracted audio; transcribe_audio returns Whisper's list of segments.
# NOTE(review): this rebinds `result` (previously the audio path) to that list, so
# re-running the `audio_path=result` cell afterwards would assign the list instead of a path.
result=transcribe_audio(audio_path)

Detected language: Hindi


 98%|███████████████████████████████████████████████████████████████████▋ | 296711/302711 [10:38<00:12, 464.68frames/s]


In [11]:
result

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 3.48,
  'text': ' ए क्द�ル अपे गी™ spice',
  'tokens': [50364,
   8485,
   237,
   31970,
   27099,
   3941,
   99,
   38518,
   8485,
   227,
   3941,
   103,
   21981,
   8485,
   245,
   31881,
   34933,
   19436,
   50538],
  'temperature': 1.0,
  'avg_logprob': -2.958089321474486,
  'compression_ratio': 1.1526717557251909,
  'no_speech_prob': 0.21818822622299194},
 {'id': 1,
  'seek': 0,
  'start': 20.0,
  'end': 21.48,
  'text': ' Neo Þip Claes',
  'tokens': [51364, 24458, 690, 252, 647, 12947, 279, 51438],
  'temperature': 1.0,
  'avg_logprob': -2.958089321474486,
  'compression_ratio': 1.1526717557251909,
  'no_speech_prob': 0.21818822622299194},
 {'id': 2,
  'seek': 0,
  'start': 23.0,
  'end': 24.88,
  'text': ' आप बकती़ कहुणे',
  'tokens': [51514,
   8485,
   228,
   3941,
   103,
   8485,
   105,
   41858,
   36158,
   31881,
   3941,
   120,
   31970,
   44500,
   8703,
   223,
   3941,
   96,
   21981,
   51608],
  'temperat

In [12]:
def chunk_transcript_with_timestamps(segments, max_chunk_words=100):
    """Group transcript segments into word-bounded chunks that keep their time span.

    Segments are consumed in order and never split: a chunk is closed as soon
    as it holds at least ``max_chunk_words`` words, so a chunk may exceed the
    limit by part of its last segment.

    Args:
        segments: Iterable of dicts with ``'text'``, ``'start'`` and ``'end'`` keys.
        max_chunk_words: Word count at which the current chunk is closed.
            Values below 1 yield one chunk per non-empty segment.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts. ``start`` is the start of
        the first segment that contributed words to the chunk and ``end`` is
        the end of the last one. Returns ``[]`` when no segment contains words.
    """
    chunks = []
    current_words = []
    current_start = None
    current_end = None

    for seg in segments:
        words = seg['text'].split()
        if not words:
            # Whitespace-only segments add no text; skipping them keeps them from
            # stretching a chunk's time span or producing empty-text chunks.
            continue

        if not current_words:
            current_start = seg['start']

        current_words.extend(words)
        current_end = seg['end']

        if len(current_words) >= max_chunk_words:
            chunks.append({
                "text": " ".join(current_words),
                "start": current_start,
                "end": current_end
            })
            current_words = []

    # Flush the trailing, not-yet-full chunk.
    if current_words:
        chunks.append({
            "text": " ".join(current_words),
            "start": current_start,
            "end": current_end
        })

    return chunks


In [16]:
from sentence_transformers import SentenceTransformer
import chromadb

# Sentence-embedding model read (as the global `model`) by embed_chunks_and_store
# and retrieve_relevant_chunks.
# NOTE(review): the Whisper cell above also assigns the global name `model`;
# whichever cell ran last decides what `model` refers to.
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks_and_store(chunks, collection_name="lecture"):
    """Embed transcript chunks and store them in a Chroma collection.

    Args:
        chunks: List of dicts with ``"text"``, ``"start"`` and ``"end"`` keys.
        collection_name: Name of the Chroma collection to (re)populate.

    Returns:
        The Chroma collection. Chunk ``i`` is stored under id ``chunk_<i>`` with
        its text as the document and ``{"start", "end"}`` as metadata.

    The function is safe to call repeatedly in one kernel: the collection is
    reused if it already exists and its previous contents are replaced.
    """
    client = chromadb.Client()
    # create_collection raises when the name already exists (e.g. on re-running
    # the cell), so fetch-or-create instead.
    collection = client.get_or_create_collection(collection_name)

    # Remove rows left by an earlier run so stale or duplicate chunk ids cannot linger.
    stale_ids = collection.get()["ids"]
    if stale_ids:
        collection.delete(ids=stale_ids)

    if not chunks:
        # Nothing to embed; avoid calling add() with empty lists.
        return collection

    texts = [chunk["text"] for chunk in chunks]
    # One batched encode call instead of one model call per chunk.
    embeddings = model.encode(texts)

    collection.add(
        documents=texts,
        ids=[f"chunk_{i}" for i in range(len(texts))],
        embeddings=[emb.tolist() for emb in embeddings],
        metadatas=[
            {"start": chunk["start"], "end": chunk["end"]}
            for chunk in chunks
        ]
    )
    return collection


In [17]:
def retrieve_relevant_chunks(query, collection, top_k=3):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Natural-language question to embed with the global ``model``.
        collection: Chroma collection populated with chunk documents whose
            metadata carries ``'start'`` and ``'end'``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts in the order returned by
        the collection query. The list is empty when the collection holds no
        documents or ``top_k`` is not positive.
    """
    # Never ask for more results than are stored, and never for fewer than one:
    # Chroma rejects non-positive n_results and, depending on the version,
    # warns or raises when n_results exceeds the collection size.
    n_results = min(top_k, collection.count())
    if n_results <= 0:
        return []

    query_emb = model.encode([query])[0]
    result = collection.query(
        query_embeddings=[query_emb.tolist()],
        n_results=n_results
    )
    # query() returns one result list per query embedding; only one was sent.
    docs = result['documents'][0]
    meta = result['metadatas'][0]
    return [{"text": t, "start": m['start'], "end": m['end']} for t, m in zip(docs, meta)]


In [18]:
import requests
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Groq API key, read from the environment; os.getenv returns None when GROQ_API_KEY is unset.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

def query_groq_llama(query, context_chunks, model_name="llama3-70b-8192", temperature=0.3, timeout=60):
    """Ask a Groq-hosted chat model to answer ``query`` from retrieved chunks.

    Args:
        query: The user's question.
        context_chunks: List of dicts with ``'text'``, ``'start'`` and ``'end'``;
            the texts form the prompt context and the times are listed as
            ``[start → end]`` references.
        model_name: Groq model identifier sent in the request.
        temperature: Sampling temperature sent in the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content string of the first choice in the API response.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    # Fail early with a clear message instead of sending "Bearer None".
    if not GROQ_API_KEY:
        raise RuntimeError(
            "GROQ_API_KEY is not set; define it in the environment or a .env file."
        )

    context = "\n\n".join([c['text'] for c in context_chunks])
    timestamps = "\n".join([f"- [{c['start']:.2f} → {c['end']:.2f}]" for c in context_chunks])

    messages = [
        {"role": "system", "content": "You are a helpful AI lecture assistant."},
        {"role": "user", "content": f"""Lecture Context:
{context}

Timestamps:
{timestamps}

Question: {query}
Answer with references to timestamps.
"""}
    ]

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": model_name,
            "messages": messages,
            "temperature": temperature
        },
        # Without a timeout a stalled connection would hang the notebook cell indefinitely.
        timeout=timeout
    )
    # Surface API failures (bad key, unknown model, rate limit) as HTTP errors
    # rather than as a KeyError on the missing 'choices' field.
    response.raise_for_status()

    return response.json()['choices'][0]['message']['content']


In [19]:
# `result` is expected to hold the segment list returned by transcribe_audio;
# bind it to a descriptive name for the chunking step.
segments=result

In [24]:
segments[0]

{'id': 0,
 'seek': 0,
 'start': 0.0,
 'end': 3.48,
 'text': ' ए क्द�ル अपे गी™ spice',
 'tokens': [50364,
  8485,
  237,
  31970,
  27099,
  3941,
  99,
  38518,
  8485,
  227,
  3941,
  103,
  21981,
  8485,
  245,
  31881,
  34933,
  19436,
  50538],
 'temperature': 1.0,
 'avg_logprob': -2.958089321474486,
 'compression_ratio': 1.1526717557251909,
 'no_speech_prob': 0.21818822622299194}

In [22]:
# Chunk with timestamps: group the Whisper segments into chunks of roughly 100 words
# (the function's default), each keeping the start/end time it spans.
chunks = chunk_transcript_with_timestamps(segments)

# Embed & store in vector DB: encode every chunk and add it to the "lecture" Chroma collection.
collection = embed_chunks_and_store(chunks)

In [25]:




# Query: retrieve the stored chunks most similar to the question (top_k defaults to 3).
question = "What is the language in this video?"
top_chunks = retrieve_relevant_chunks(question, collection)

# Get final response with Groq LLaMA, using the retrieved chunks and their timestamps as context.
response = query_groq_llama(question, top_chunks)
print(response)


What a fascinating lecture!

After analyzing the provided text, I can confidently say that the language spoken in this video is a mix of Hindi and Urdu, with some English words and phrases thrown in.

Here are some references to timestamps that support my answer:

* [778.90 → 832.08]: The speaker uses Hindi phrases like "देवेस्वां दलिं हीं" and "यह शे यैं ऻी ब". The use of Devanagari script and Hindi vocabulary suggests that the primary language being spoken is Hindi.
* [2563.74 → 2655.74]: The speaker mentions "Urdu Poitri" and "Ali Kati University", which indicates that Urdu is also being spoken or referenced in the conversation.
* [916.98 → 993.78]: The speaker uses a mix of Hindi and Urdu phrases, such as "क्यो तो अஙरेust3 ये तो तो नर् याjour तo टा" and "मuo तर english Mutti तो घnityीथ खय़ि औरगऴा स". This code-switching between Hindi and Urdu is common in many Indian and Pakistani languages.

Additionally, the speaker occasionally uses English words and phrases, such as "teamwork" 