In [20]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS, Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama.chat_models import ChatOllama
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
from langchain_core.prompts import PromptTemplate
from urllib.parse import urlparse, parse_qs, urlunparse
from pytube import Playlist
import time
from langdetect import detect, DetectorFactory
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain.schema.output_parser import StrOutputParser
import re
import os
import json

In [21]:
llm = ChatOllama(model="llama3.1:latest", base_url="http://localhost:11434", reasoning=False, streaming=False, request_timeout=600.0)

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

In [None]:
# List of top 10 languages
COMMON_LANGUAGES = ['en', 'hi', 'es', 'zh-Hans', 'ar', 'fr', 'ru', 'pt', 'bn', 'de']

def clean_video_url(video_url):
    """Remove playlist parameter and return clean YouTube URL + video ID."""
    parsed_url = urlparse(video_url)
    query_params = parse_qs(parsed_url.query)

    if 'v' not in query_params:
        raise ValueError(f"Invalid YouTube video URL: {video_url}")

    clean_query = f"v={query_params['v'][0]}"
    cleaned_url = urlunparse(parsed_url._replace(query=clean_query))
    return cleaned_url, query_params['v'][0]


def save_transcript(video_id, lang, transcript_data, output_dir="transcripts"):
    """
    Save transcript data to both text (.txt) and JSON (.json) files.
    The JSON file preserves timestamps for later search or analysis.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save plain text version (for readability)
    text_path = os.path.join(output_dir, f"{video_id}_{lang}.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        for snippet in transcript_data:
            f.write(f"[{snippet['start']:.2f}s] {snippet['text']}\n")
    print(f"üíæ Saved transcript text: {text_path}")

    # Save structured JSON version
    json_path = os.path.join(output_dir, f"{video_id}_{lang}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(transcript_data, f, ensure_ascii=False, indent=2)
    print(f"üíæ Saved structured transcript JSON: {json_path}")


def fetch_multilingual_transcripts(video_url, retries=3, delay=5):
    """
    Fetch transcripts for multiple languages.
    Returns a dictionary {language_code: transcript_data_list}.
    Also saves each transcript to text and JSON.
    """
    clean_url, video_id = clean_video_url(video_url)
    transcripts_data = {}

    for lang in COMMON_LANGUAGES:
        for attempt in range(1, retries + 1):
            try:
                api = YouTubeTranscriptApi()
                transcript_list = api.fetch(video_id, languages=[lang])

                # Convert to a list of structured dictionaries
                structured_transcript = [
                    {
                        "start": snippet.start,
                        "duration": snippet.duration,
                        "text": snippet.text.strip()
                    }
                    for snippet in transcript_list
                ]

                transcripts_data[lang] = structured_transcript
                save_transcript(video_id, lang, structured_transcript)
                break  # success, move to next language

            except TranscriptsDisabled:
                print(f"‚ùå Captions are disabled for video: {clean_url}")
                return None
            except NoTranscriptFound:
                # Language not available
                break
            except Exception as e:
                print(f"‚ö†Ô∏è Error fetching {clean_url} [{lang}], attempt {attempt}/{retries}: {e}")
                time.sleep(delay)

    if transcripts_data:
        print(f"‚úÖ All available transcripts fetched for video: {video_id}")
        return transcripts_data
    else:
        print(f"‚ùå No transcripts available for video: {video_id}")
        return None


# --- Main Program ---

all_transcripts = {}

choice = input("Do you want to fetch a single video or a playlist? (video/playlist): ").strip().lower()

if choice == "video":
    video_url = input("Enter the YouTube video URL: ").strip()
    transcripts = fetch_multilingual_transcripts(video_url)
    if transcripts:
        all_transcripts[video_url] = transcripts
        print("\n=== Combined transcripts (simplified text view) ===")
        for lang, data in transcripts.items():
            print(f"\n--- {lang} ---")
            for snippet in data[:10]:  # print first 10 lines only
                print(f"[{snippet['start']:.2f}s] {snippet['text']}")

elif choice == "playlist":
    playlist_url = input("Enter the YouTube playlist URL: ").strip()
    playlist = Playlist(playlist_url)
    for video_url in playlist.video_urls:
        transcripts = fetch_multilingual_transcripts(video_url)
        if transcripts:
            all_transcripts[video_url] = transcripts
        time.sleep(5)

else:
    print("Invalid choice. Enter 'video' or 'playlist'.")

üíæ Saved transcript text: transcripts\bdpyQm5l78o_en.txt
üíæ Saved structured transcript JSON: transcripts\bdpyQm5l78o_en.json
‚úÖ All available transcripts fetched for video: bdpyQm5l78o

=== Combined transcripts (simplified text view) ===

--- en ---
[0.08s] it is important to not sulk now you
[2.76s] could call me a monk who did not sell
[4.72s] her Ferrari even when you go in an
[6.88s] airplane when they're giving that you
[8.96s] know emergency uh talk they say when
[12.52s] there will be a oxygen mask you put on
[14.60s] yourself first and on your child later
[17.32s] just say I get to do this right I get to
[19.60s] go to work I get to see my friends how
[21.96s] much is it changing your life yeah no


In [None]:
docs_with_meta = []

for video_url, lang_dict in all_transcripts.items():
    # Extract video ID
    parsed_url = urlparse(video_url)
    video_id = parse_qs(parsed_url.query).get("v", [""])[0]

    for lang, transcript_list in lang_dict.items():
        for snippet in transcript_list:
            metadata = {
                "video_id": video_id,
                "language": lang,
                "start_time": snippet["start"],
                "end_time": snippet["start"] + snippet["duration"]
            }
            docs_with_meta.append({
                "text": snippet["text"].strip(),
                "metadata": metadata
            })

‚úÖ Total transcript snippets prepared: 1104


In [32]:
splitter = RecursiveCharacterTextSplitter(
     chunk_size = 500, 
     chunk_overlap = 100, 
     separators=["\n\n", "\n", ".", "‡•§", "ÿü", "!", "„ÄÇ", "Ôºå"] )

In [33]:
chunks = []

for doc in docs_with_meta:
    split_docs = splitter.create_documents(
        [doc["text"]],
        metadatas=[doc["metadata"]]
    )
    chunks.extend(split_docs)

In [35]:
filtered_chunks = [
    c for c in chunks
    if c.page_content.strip() and not c.page_content.startswith("---")
]

In [None]:
unique_chunks = []
seen_texts = set()

for c in filtered_chunks:
    text = c.page_content.strip()
    if text and text not in seen_texts:
        unique_chunks.append(c)
        seen_texts.add(text)

‚úÖ Unique cleaned chunks ready: 1103


In [36]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2", 
    model_kwargs={"trust_remote_code": True}
)


In [38]:
texts_for_chroma = [c.page_content for c in unique_chunks]
metadatas_for_chroma = [c.metadata for c in unique_chunks]

In [None]:
vector_store = Chroma.from_texts(
    texts=texts_for_chroma,
    embedding=embeddings,
    metadatas=metadatas_for_chroma
)

‚úÖ Chroma vector store created successfully.


In [47]:
retriever = vector_store.as_retriever(search_type= 'similarity', search_kwargs={'k':10})

In [62]:
query = input('Enter your query here : ')
retrieved_docs = retriever.invoke(query)

In [63]:
# Example: rerank top 5 retrieved documents
scores = cross_encoder.predict([[query, d.page_content] for d in chunks[:5]])

# 9Ô∏è‚É£ Sort documents by CrossEncoder scores descending
reranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]

# 10Ô∏è‚É£ Show reranked results
for i, doc in enumerate(reranked_docs, 1):
    print(f"\n--- Rank {i} ---")
    print(f"Score: {scores[i-1]:.4f}")
    print(doc.page_content[:500])  # first 500 chars



--- Rank 1 ---
Score: -11.1153
saana is a doctor philanthropist

--- Rank 2 ---
Score: -11.2455
why did I want to be a doctor yes um

--- Rank 3 ---
Score: -11.1647
up to my father who's an amazing doctor

--- Rank 4 ---
Score: -11.1469
even in the video they show that the

--- Rank 5 ---
Score: -11.1549
handsome and still he was a doctor not a


In [64]:
prompt = PromptTemplate(
    input_variables=['context_text', 'query'],
    template='context_text:{context_text} \n question:{query}'
)


In [65]:
def format_docs(retrieved_docs):
    context_text = '\n\n'.join(doc.page_content for doc in reranked_docs)
    return context_text

context_text = format_docs(retrieved_docs)

In [66]:
parellel_chain = RunnableParallel({
    'context_text' : retriever | RunnableLambda(format_docs),
    'query' : RunnablePassthrough()
})

In [54]:
chain_new = parellel_chain | prompt | llm | StrOutputParser()
chain_new

{
  context_text: VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000209DA328410>, search_kwargs={'k': 10})
                | RunnableLambda(format_docs),
  query: RunnablePassthrough()
}
| PromptTemplate(input_variables=['context_text', 'query'], input_types={}, partial_variables={}, template='context_text:{context_text} \n question:{query}')
| ChatOllama(model='llama3.1:latest', reasoning=False, base_url='http://localhost:11434')
| StrOutputParser()

In [None]:
chain_new.invoke(query)

'The term "doctor philanthropist" was not specifically mentioned as being discussed in the video. However, it is mentioned that saana\'s father is an "amazing doctor", which implies that his profession and possibly some of his qualities or actions (implied by "still he was a doctor") might have had an impact on saana\'s decision to become a doctor.'