# Transcription Preprocessing & Storage

## Download and Extract Transcription

In [166]:
import os, re
import tempfile
import subprocess
import whisper
from unidecode import unidecode
from youtube_transcript_api import YouTubeTranscriptApi

from dotenv import load_dotenv
load_dotenv()


def get_video_title(youtube_url):
    """Return the video's title as printed by ``yt-dlp --get-title``.

    Args:
        youtube_url: URL handed to yt-dlp unchanged.

    Returns:
        The stripped title, or the literal ``"transcription"`` when no title
        can be obtained: yt-dlp exits non-zero, the executable cannot be
        launched (e.g. it is not installed / not on PATH), or the printed
        title is empty.
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--get-title", youtube_url],
            capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError / PermissionError raised when the
        # yt-dlp executable itself cannot be started.
        return "transcription"
    # An empty title would otherwise produce a nameless "_transcript.txt" file.
    return result.stdout.strip() or "transcription"

def sanitize_filename(title):
    """Turn a video title into a lowercase, underscore-separated slug.

    The title is transliterated with ``unidecode``, every character that is
    not a word character, whitespace or a hyphen is dropped, and the result is
    lower-cased, stripped, and has its spaces replaced by underscores.
    """
    transliterated = unidecode(title)
    # Keep only word characters, whitespace and hyphens.
    kept = re.sub(r'[^\w\s\-]', '', transliterated)
    return kept.lower().strip().replace(' ', '_')

def read_transcription(title):
    """Load the saved transcript text for ``title``.

    Reads ``./data_trans/{title}_transcript.txt`` as UTF-8 and returns its
    contents. If anything goes wrong the error is printed and the function
    falls through, returning ``None``.
    """
    try:
        transcript_path = f"./data_trans/{title}_transcript.txt"
        with open(transcript_path, "r", encoding="utf-8") as handle:
            return handle.read()
    except Exception as e:
        print(f"There was an error: {e}")

def _extract_video_id(youtube_url):
    """Return the video ID from the common YouTube URL shapes, or raise ValueError."""
    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import parse_qs, urlparse

    if "://" not in youtube_url:
        # urlparse only recognises the host when a scheme is present.
        youtube_url = "https://" + youtube_url
    parsed = urlparse(youtube_url)

    # .../watch?v=<id>&t=42s  -> only the value of "v", not the trailing params
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]
    # youtu.be/<id>
    if host.endswith("youtu.be") and path_parts:
        return path_parts[0]
    # .../shorts/<id>, .../embed/<id>, .../live/<id>, .../v/<id>
    if len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live", "v"):
        return path_parts[1]

    raise ValueError(f"Could not find a video id in: {youtube_url}")


def extract_yt_direct(youtube_url):
    """Fetch the transcript YouTube already hosts for a video and clean it up.

    Args:
        youtube_url: Watch, short-link (youtu.be), shorts or embed URL.

    Returns:
        The transcript as one string with everything except word characters,
        whitespace and hyphens removed and whitespace runs collapsed to a
        single space. On any failure (no video ID in the URL, no transcript
        available, API error) the sentinel string ``"error"`` is returned so
        the caller can fall back to audio transcription.
    """
    try:
        video_id = _extract_video_id(youtube_url)
        trans = YouTubeTranscriptApi.get_transcript(video_id)

        trans_fin = " ".join(chunk.get("text") or "" for chunk in trans)
        # Non-breaking spaces become ordinary spaces. (The previous
        # replace("xa0", "") removed the literal letters "xa0" instead.)
        trans_fin = trans_fin.replace("\xa0", " ")
        trans_fin_san = re.sub(r'[^\w\s\-]', '', trans_fin)
        trans_fin_san = re.sub(r'\s+', ' ', trans_fin_san)
        return trans_fin_san
    except Exception as e:
        # Best-effort by design: report why, then signal the fallback path.
        print(f"Direct transcript unavailable: {e}")
        return "error"

def extract_transcription(url=None):
    """Return ``(transcript_text, title)`` for a YouTube video, caching the text on disk.

    The transcript is stored at ``./data_trans/{title}_transcript.txt``. If
    that file is missing, unreadable or empty, the transcript is obtained
    first from YouTube directly (``extract_yt_direct``) and, when that fails,
    by downloading the audio with yt-dlp and transcribing it with Whisper.

    Args:
        url: Video URL. When ``None`` the URL is requested on stdin.

    Returns:
        A tuple ``(transcription, title)`` where ``title`` is the sanitized
        video title truncated to 38 characters and ``transcription`` is
        whatever ``read_transcription(title)`` returns.

    Raises:
        subprocess.CalledProcessError: if the yt-dlp audio download fails.

    The ffmpeg directory can be overridden with the ``FFMPEG_LOCATION``
    environment variable; the previous hard-coded location stays the default.
    """
    if url is not None:
        youtube_url = url
    else:
        youtube_url = str(input("Enter a youtube url: "))

    # Get video title
    title = sanitize_filename(get_video_title(youtube_url))
    title = title[:38]  # ensure suitability with pinecone index

    transcript_path = f"./data_trans/{title}_transcript.txt"

    # Reuse the cached transcript unless it is missing, unreadable (None) or empty.
    cached = read_transcription(title) if os.path.exists(transcript_path) else None
    if not cached:
        # The output directory is not guaranteed to exist on a fresh checkout.
        os.makedirs("./data_trans", exist_ok=True)

        # Case 1: Can extract transcript directly online
        transcription = extract_yt_direct(youtube_url=youtube_url)
        # Compare with the sentinel exactly: a substring test would discard any
        # genuine transcript that merely contains the word "error".
        if transcription != "error" and len(transcription) != 0:
            with open(transcript_path, "w", encoding="utf-8") as f:
                f.write(transcription)

        # Case 2: Extract audio & transcribe
        else:
            ffmpeg_dir = os.getenv(
                "FFMPEG_LOCATION",
                r"C:\Users\ACER\Downloads\ffmpeg-master-latest-win64-gpl\bin",
            )

            with tempfile.TemporaryDirectory() as temp_dir:
                audio_file_path = os.path.join(temp_dir, "audio.mp3")

                # Download only audio using yt-dlp
                subprocess.run([
                    "yt-dlp",
                    "--extract-audio",
                    "--audio-format", "mp3",
                    "--output", audio_file_path,
                    "--ffmpeg-location", ffmpeg_dir,
                    youtube_url
                ], check=True)

                print("Downloaded file path:", audio_file_path)
                print("Exists?", os.path.isfile(audio_file_path))

                # Whisper shells out to ffmpeg, so it must be on PATH. Add the
                # directory only once instead of growing PATH on every call.
                path_entries = os.environ.get("PATH", "").split(os.pathsep)
                if ffmpeg_dir not in path_entries:
                    os.environ["PATH"] = os.pathsep.join(path_entries + [ffmpeg_dir])
                print("PATH:", os.environ["PATH"])

                # Load Whisper model
                # Remember to install the CUDA build of torch that matches your GPU:
                # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
                whisper_model = whisper.load_model("base", device="cuda")
                print("Model device:", whisper_model.device)

                # Transcribe
                cur_transcription = whisper_model.transcribe(audio_file_path, fp16=True)["text"].strip()

                with open(transcript_path, "w", encoding="utf-8") as f:
                    f.write(cur_transcription)

    return read_transcription(title), title

In [167]:
# Run the download/transcription step; with no URL argument the function asks for one on stdin.
transcription,title = extract_transcription()

## Split Documents

In [168]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_document(title, chunk_size=800, chunk_overlap=50):
    """Load the saved transcript for ``title`` and split it into overlapping chunks.

    Args:
        title: Sanitized video title; the transcript is read from
            ``./data_trans/{title}_transcript.txt``.
        chunk_size: Maximum characters per chunk (default 800, as before).
        chunk_overlap: Characters shared between neighbouring chunks
            (default 50, as before).

    Returns:
        The list of documents produced by ``RecursiveCharacterTextSplitter``.
    """
    path_to_doc = f"./data_trans/{title}_transcript.txt"
    # The transcript is written as UTF-8, so read it back as UTF-8 rather than
    # with the platform default encoding (which is not UTF-8 on Windows).
    loader = TextLoader(path_to_doc, encoding="utf-8")
    text_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(text_documents)
    return documents

## Set up Pinecone DB & Store Vectors

In [169]:
from pinecone import Pinecone

# Pinecone client shared by the functions below; the API key is read from the
# PINECONE_API_KEY environment variable.
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY")
)

from langchain_openai.embeddings import OpenAIEmbeddings
# Embedding model shared by index creation and retrieval.
# NOTE(review): built with library defaults; store_vector_db creates indexes with
# dimension=1536, so the default model presumably emits 1536-d vectors — confirm.
embeddings = OpenAIEmbeddings()

def store_vector_db(title,documents):
    """Return a ``PineconeVectorStore`` for ``title``, creating and filling the index if needed.

    The index is named ``{title with "_" replaced by "-"}-index``.

    Args:
        title: Sanitized video title.
        documents: Chunked documents to embed; only used when the index does
            not exist yet.

    Returns:
        A vector store bound to the index. When the index already exists it is
        reopened as-is and ``documents`` is not uploaded again.
    """
    index_name = f"{title.replace('_', '-')}-index"

    # Exact name comparison. Searching the stringified listing would also match
    # when this name is merely a substring of another index's name or host.
    if index_name not in pc.list_indexes().names():
        pc.create_index(index_name, dimension=1536, metric="cosine",
                        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
                        deletion_protection="disabled")
        return PineconeVectorStore.from_documents(
            documents=documents, embedding=embeddings, index_name=index_name
        )

    index = pc.Index(host=pc.describe_index(index_name).host)
    return PineconeVectorStore(index=index, embedding=embeddings)

def reset_index(title):
    """Delete every vector stored in the Pinecone index that belongs to ``title``.

    The index itself is kept; only its records are removed.
    NOTE(review): no namespace is passed, so this presumably targets the
    default namespace — confirm if namespaces are introduced.
    """
    # Same "_" -> "-" normalisation as store_vector_db / fetch_list_pinecone,
    # otherwise titles containing underscores resolve to a different index name.
    index_name = f"{title.replace('_', '-')}-index"
    index = pc.Index(host=pc.describe_index(index_name).host)
    # Index objects expose delete(); there is no delete_index() method on them.
    index.delete(delete_all=True)

12/16/2024 12:43:51 INFO Discovering subpackages in _NamespacePath(['d:\\Study\\UNIVERSITY\\OTHER COURSES\\random coding\\Portfolio Projects\\000. GITHUB\\YOUTUBE-RAG-MODEL\\.venv\\Lib\\site-packages\\pinecone_plugins'])
12/16/2024 12:43:51 INFO Looking for plugins in pinecone_plugins.inference
12/16/2024 12:43:51 INFO Installing plugin inference into Pinecone


In [170]:
# Chunk the saved transcript, then create-and-fill (or reopen) its Pinecone index.
documents = split_document(title)

store_vector_db(title,documents)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2193cb02780>

# Agent 1: Context-Based Querying Assistant

## Full Model Chain Construction

In [180]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from deep_translator import GoogleTranslator
from langdetect import detect


# Chat model used by both agents; the key is read from OPENAI_API_KEY.
model = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model_name="gpt-4o-mini"
)

# Reduces the chat model's message output to a plain string.
parser = StrOutputParser()
# source="auto": retrieve_context sends every query not detected as English
# through this translator, so the source language must not be pinned to
# Vietnamese.
translator = GoogleTranslator(source="auto", target="en")

# -----------------------------------------------------------------

def retrieve_context(input):
    """Return the transcript passages relevant to a user question.

    The question is translated to English when it is detected as another
    language, then used for a similarity search (score threshold 0.2) against
    the vector store for the module-level ``title`` / ``documents``.

    Args:
        input: The raw user question.

    Returns:
        The matching chunks' text joined by single spaces (empty string when
        nothing matches).
    """
    # Local import; langdetect is already a dependency of this file.
    from langdetect import LangDetectException

    try:
        detected = detect(input)
    except LangDetectException:
        # Raised when the text has nothing to detect on (empty, digits or
        # punctuation only). Fall back to using the query untouched.
        detected = None

    if detected == "en":
        query = input
        print("Original text already in English")
    elif detected is None:
        query = input
        print("Could not detect the query language; using it as-is")
    else:
        query = translator.translate(input)
        print(f"Translated query: {query}")

    retriever = store_vector_db(title, documents).as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={'score_threshold': 0.2},
    )
    # invoke() is the Runnable entry point; get_relevant_documents() is deprecated.
    context = retriever.invoke(query)
    compiled_docu = " ".join([doc.page_content for doc in context])
    print("Retrieved Context:", compiled_docu, "\n-----\n")
    return compiled_docu

def query_from_context_main():
    """Run an interactive question-answering loop over the indexed transcript.

    Each question is answered by the chain
    ``{context: retrieve_context, question: passthrough} | prompt | model | parser``.
    Typing ``exit`` (any case, surrounding whitespace ignored) ends the loop,
    as does entering anything other than a bare Enter at the "Continue?"
    prompt. Blank questions are skipped and the user is asked again.
    """
    template = """
    Think step by step before answering the question based on the context below. If you can't find the answer in the context below, say that you don't know.

    **Context:** {context}

    **Question:** {question}
    """

    prompt = ChatPromptTemplate.from_template(template)
    retriever_step = RunnableLambda(retrieve_context)

    # The same input string feeds both branches: one fetches the context,
    # the other passes the question through unchanged.
    retriever = RunnableParallel(context=retriever_step,
                                 question=RunnablePassthrough())
    chain = retriever | prompt | model | parser

    while True:
        # Strip first so that "exit " or " Exit" still quits instead of being
        # sent to the retriever as a question.
        query = str(input("Ask me a question (type 'exit' to quit): ")).strip()
        if query.lower() == 'exit':
            print("Goodbye!")
            break
        if not query:
            # Nothing to ask; avoid a pointless retrieval/LLM round trip.
            continue

        result = chain.invoke(query)
        print(result)
        print("\n","---"*20)

        subquery = str(input("Continue? (click Enter): "))
        if subquery.strip() != '':
            break

In [181]:
# Start the interactive question/answer loop (reads questions from stdin).
query_from_context_main()

Original text already in English


12/16/2024 01:06:30 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Retrieved Context: Jensen For the very first time in human history we are producing manufacturing intelligence like production Raw material comes in A lot of genius goes into that box And what comes out is intelligence thats refined Lukas Youre listening to Gradient Dissent a show about machine learning in the real world and Im your host Lukas Biewald Today on Gradient Dissent I interviewed a guest that Ive been looking forward to interviewing for quite a long time This is Jensen Huang who is the CEO and founder of NVIDIA If youve trained a machine learning model youve probably trained it on NVIDIA hardware We get into machine learning and we talk about his views on what the future holds This is a super fun interview and I really hope you enjoy it Lukas All right Well thanks so much for doing this We a little demo They called it toy Jensen at the last GTC keynote Basically its a robot But its a virtual robot otherwise known as an avatar It has computer vision it has speech AI and under

12/16/2024 01:06:32 INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Jensen Huang is the CEO and founder of NVIDIA.
Goodbye!
Translated query: exit


12/16/2024 01:06:51 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Retrieved Context: surrounded by problems not good news I happen to enjoy that I enjoy solving problems So I completely separate the financial success of the company from the importance of the work and doing impactful work Ive historically always done that whether the company is doing well or badly When we were doing badly particularly during the time when we bet the farm on accelerated computing we wanted every single chip to have the same architecture that I mentioned earlier the pressure on our financial performance was immense But I was equally as enthusiastic then and believed as much in the future as I do today Lukas Thats incredible You dont feel the outside pressure at all or are you able to separate yourself from it Jensen No as a public company youre going to feel a lot of outside pressure Some going to feel a lot of outside pressure Some investors are really artful in expressing their displeasure and criticism and some investors are understandably less patient But its our jo

12/16/2024 01:06:54 INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


I don't know.


: 

: 

# Agent 2: (Advanced) Summarization Assistant

In [132]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from textsum.summarize import Summarizer

# Model identifier passed to textsum's Summarizer; the resulting `summarizer`
# is what summarize_text() calls for each block of text.
model_name = "pszemraj/led-large-book-summary"
summarizer = Summarizer(
    model_name_or_path=model_name
)

def fetch_list_pinecone(title) -> pd.DataFrame:
    """Download every stored vector of the index belonging to ``title``.

    Args:
        title: Sanitized video title; the index name is derived from it the
            same way ``store_vector_db`` does (``_`` -> ``-``, ``-index`` suffix).

    Returns:
        A DataFrame with one row per vector and the columns ``id``, ``text``
        (the chunk text stored in the vector's metadata) and ``values`` (the
        embedding). The frame is empty, with the same columns, when the index
        holds no vectors.
    """
    # Define index
    title_refined = title.replace("_","-")
    index_name = f"{title_refined}-index"

    index = pc.Index(name = index_name, host = pc.describe_index(index_name).host)

    # index.list() yields the IDs page by page. Fetch each page separately: a
    # single fetch carrying every ID can exceed Pinecone's per-request limits
    # on larger indexes, and fetching an empty ID list is avoided altogether.
    list_vec = []
    for id_page in index.list():
        page_ids = list(id_page)
        if not page_ids:
            continue

        # Fetch vectors including metadata for this page
        fetch_list = index.fetch(ids=page_ids)
        for vector_id, content in fetch_list["vectors"].items():
            text = content["metadata"]["text"]
            values = content["values"]
            list_vec.append([vector_id, text, values])

    # Return df of id, text, vector values
    df_vec_extracted = pd.DataFrame(list_vec, columns=["id", "text", "values"])

    return df_vec_extracted

def preprocess_run_kmeans(df_vec_extracted, n = 8) -> pd.DataFrame:
    """Cluster chunk embeddings with k-means and concatenate each cluster's text.

    Args:
        df_vec_extracted: Frame with the columns ``id``, ``text`` and
            ``values`` (one embedding list per row).
        n: Requested number of clusters. It is capped at the number of rows,
            because k-means cannot form more clusters than there are samples.

    Returns:
        A DataFrame with the columns ``cluster`` and ``text``, one row per
        cluster, where ``text`` is that cluster's chunks joined by spaces.
        An empty input yields an empty frame with the same two columns.
    """
    # Use a clean 0..N-1 index so labels line up with rows regardless of how
    # the caller's frame was indexed.
    df_vec = df_vec_extracted.reset_index(drop=True)

    if df_vec.empty:
        return pd.DataFrame(columns=["cluster", "text"])

    # One row per chunk, one column per embedding dimension.
    embedding_matrix = pd.DataFrame(df_vec["values"].tolist()).to_numpy()

    n_clusters = max(1, min(n, len(df_vec)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embedding_matrix)

    df_vec_text_clustered = df_vec[["id", "text"]].assign(cluster=labels)

    # Return final clustered results in df form
    clustered_texts = df_vec_text_clustered.groupby("cluster")["text"].apply(" ".join).reset_index()
    return clustered_texts

def summarize_text(input):
    """Summarize ``input`` with the module-level ``summarizer`` and return the result."""
    return summarizer.summarize_string(input)

def pre_summarize_chunk(title):
    """Build the condensed context for the final summary of ``title``.

    Fetches the stored vectors, groups them into 8 k-means clusters,
    summarizes each cluster's text, and returns the cluster summaries joined
    by ``"; "``.
    """
    vectors_df = fetch_list_pinecone(title)
    clusters = preprocess_run_kmeans(vectors_df, n = 8)
    # One summary per cluster row.
    clusters["summarized_text"] = clusters["text"].apply(summarize_text)
    return "; ".join(clusters["summarized_text"].values)

def summarize_main(title=title):
    """Produce, print and return the final summary for ``title``.

    The per-cluster summaries from ``pre_summarize_chunk`` are passed as the
    context of a summarization prompt that runs through ``model`` and
    ``parser``.
    """
    # Cluster-level summaries, already joined into a single context string.
    condensed_context = pre_summarize_chunk(title)

    prompt_text = """
    **Summarize the main points** and organize the information into a coherent summary based on the following context:

    {context}

    Ensure the summary is concise, clear, and covers all key details.
    """
    final_prompt = ChatPromptTemplate.from_template(prompt_text)

    # prompt -> chat model -> plain string
    pipeline = final_prompt | model | parser
    summary = pipeline.invoke(condensed_context)
    print(summary)
    return summary

12/16/2024 12:03:01 INFO Loaded model pszemraj/led-large-book-summary to cuda


In [133]:
# Summarize the indexed transcript and keep the result; summarize_main also prints it.
output = summarize_main(title);

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Summaries:   0%|          | 0/1 [00:00<?, ?it/s]

12/16/2024 12:04:46 INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The Author of the Epilogue is addressing community questions, starting with the strategic use of NVIDIA to accelerate machine learning model development. This initiative originated when three research teams sought NVIDIA's assistance for neural network models to compete in the ImageNet competition, coinciding with Alex Net’s record-breaking achievements in computer vision. This collaboration contributed to the emergence of deep learning.
In response to whether he plays video games, the Author admits he does not play but actively engages with gaming trends due to NVIDIA's partnerships with various gaming companies. He expresses skepticism about the metaverse being experienced through current head-mounted display technology.
NVIDIA’s approach to computing involves creating a universal GPU platform tailored for diverse industries, emphasizing the need for algorithm redesign based on specific applications. The company focuses on solving problems across various domains, utilizing multimodal

In [135]:
# Echo the stored summary string as the cell result.
output

"The Author of the Epilogue is addressing community questions, starting with the strategic use of NVIDIA to accelerate machine learning model development. This initiative originated when three research teams sought NVIDIA's assistance for neural network models to compete in the ImageNet competition, coinciding with Alex Net’s record-breaking achievements in computer vision. This collaboration contributed to the emergence of deep learning.\nIn response to whether he plays video games, the Author admits he does not play but actively engages with gaming trends due to NVIDIA's partnerships with various gaming companies. He expresses skepticism about the metaverse being experienced through current head-mounted display technology.\nNVIDIA’s approach to computing involves creating a universal GPU platform tailored for diverse industries, emphasizing the need for algorithm redesign based on specific applications. The company focuses on solving problems across various domains, utilizing multimo