In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
import youtube_transcript_api
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from youtube_transcript_api import YouTubeTranscriptApi
import uuid
from langchain_chroma import Chroma


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the transcript of one hard-coded video through a fresh API client.
subtitles = ""  # placeholder string; no visible cell reads it
transcript = YouTubeTranscriptApi()
script = transcript.fetch(video_id='-8NURSdnTcg')



In [4]:
# Preview only the first few snippets; displaying the whole transcript floods the cell output.
script.snippets[:10]

[FetchedTranscriptSnippet(text='here is the lore of every main boss in', start=0.24, duration=4.32),
 FetchedTranscriptSnippet(text='Shadow of the earth tree and like the', start=2.679, duration=4.441),
 FetchedTranscriptSnippet(text='base game it is all interconnected I', start=4.56, duration=5.48),
 FetchedTranscriptSnippet(text='mean ba injured placito saxs Moog is', start=7.12, duration=4.88),
 FetchedTranscriptSnippet(text='somehow a victim and the mother of', start=10.04, duration=4.24),
 FetchedTranscriptSnippet(text='fingers reveals a Twist that actually', start=12.0, duration=3.96),
 FetchedTranscriptSnippet(text='was said to us when the game first came', start=14.28, duration=3.88),
 FetchedTranscriptSnippet(text='out except we need to go back at the', start=15.96, duration=4.0),
 FetchedTranscriptSnippet(text="start and let's talk about the main", start=18.16, duration=4.32),
 FetchedTranscriptSnippet(text='villain Mesmer is the son of America and', start=19.96, duration=5.3

In [None]:
# %pip installs into the environment of the running kernel; the version is pinned to the one recorded in this notebook's output.
%pip install grandalf==0.8

Collecting grandalf
  Using cached grandalf-0.8-py3-none-any.whl.metadata (1.7 kB)
Collecting pyparsing (from grandalf)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Using cached grandalf-0.8-py3-none-any.whl (41 kB)
Downloading pyparsing-3.2.5-py3-none-any.whl (113 kB)
Installing collected packages: pyparsing, grandalf

   ---------------------------------------- 0/2 [pyparsing]
   ---------------------------------------- 2/2 [grandalf]

Successfully installed grandalf-0.8 pyparsing-3.2.5


In [12]:
from langchain_anthropic import ChatAnthropic
import os
from ai_core.prompts import conditional_prompt,summarizing_prompt
from dotenv import load_dotenv
from langchain import tools
from langgraph.graph import START,END,StateGraph
from langchain_core.documents import Document
from database.vectorstore import Retriever,Store
from ai_core.utils import PreProcessing
from typing import TypedDict,Optional,List
from pydantic import BaseModel
load_dotenv()



# Read the key once and fail with a clear message when it is missing:
# assigning None into os.environ would otherwise raise an opaque TypeError.
_anthropic_key = os.getenv('ANTHROPIC_API_KEY')
if not _anthropic_key:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it or add it to the .env file read by load_dotenv()"
    )
os.environ['ANTHROPIC_API_KEY'] = _anthropic_key

"""If the question is "Broad/Topic":
Perform a vector search only on your summarized DB. This will quickly find the summaries that are conceptually related to the user's question.
From the metadata of the matching summary documents, extract the list of original chunk IDs (raw_chunks_id).
Use those IDs to fetch the full, original documents directly from your raw DB.
These original, detailed documents become the context you send to the final LLM."""


"""If the question is "Specific/Factual":
Ignore the summary DB. You need the raw details.
Perform a Hybrid Search (Vector Search + Keyword Search) directly on your raw DB.
The keyword search is essential here. It will find the exact variable names, commands, or proper nouns that a pure vector search might miss.
These retrieved raw documents become the context.
"""

class Youtube(TypedDict):
    """State dictionary passed between the nodes of the graph built in this cell."""

    video_id: str              # id handed to PreProcessing by ingesting_video
    question: str              # user question used for routing and retrieval
    documents: List[Document]  # filled by the search nodes, read by generate_response


# class Docs(BaseModel):
#     raw_docs:PreProcessing
#     summ_docs:PreProcessing
#     video_id:str


def ingesting_video(state:Youtube):
    """Transcribe the video named in the state and store its raw and summarized documents.

    Steps, in order: build a ``PreProcessing`` for ``state['video_id']``, fetch the
    transcript, organise it into raw documents, summarize those, then hand both
    sets to a fresh ``Store``.

    Args:
        state: graph state; only ``state['video_id']`` is read.

    Returns:
        None. The state is not modified.

    Raises:
        KeyError: if ``'video_id'`` is absent from the state.
        AssertionError: if ``state['video_id']`` is empty.

    NOTE(review): this relies on ``PreProcessing.summarizing_transcript``; the
    ``PreProcessing`` class defined later in this notebook has no such method, so
    confirm the ``ai_core.utils`` version is the one in scope when this runs.
    """

    # Fixed: the key was misspelled 'vidoe_id', which raised KeyError on every call.
    assert state['video_id'], "state['video_id'] must be a non-empty video id"

    preprocess=PreProcessing(state['video_id'])
    preprocess.transcribing_video()
    raw_docs=preprocess.organising_transcript()
    summarized_docs=preprocess.summarizing_transcript(raw_docs)

    # Raw documents are stored before their summaries, as in the original flow.
    db_store = Store()
    db_store.ingesting_raw_docs(raw_docs)
    db_store.ingesting_summarized_docs(summarized_docs)



def condition(state:Youtube):
    """Ask the model to classify the question and return the matching route label.

    The question is sent through ``conditional_prompt() | ChatAnthropic`` under the
    key ``'text'``. The reply is normalised (content blocks joined, whitespace
    stripped, lower-cased) before it is inspected.

    Args:
        state: graph state; only ``state['question']`` is read.

    Returns:
        ``'raw'`` when the reply contains ``specific_question``;
        ``'summary'`` when it contains ``summary_request``;
        ``'raw'`` for any other reply, so the caller always receives one of the
        two labels instead of ``None``.

    Raises:
        KeyError: if ``'question'`` is absent from the state.
        AssertionError: if ``state['question']`` is empty.
    """
    assert state['question']

    model=ChatAnthropic(model='claude-sonnet-4-20250514')

    prompt=conditional_prompt()

    chain=prompt | model

    response=chain.invoke({'text':state['question']})

    # The reply content may be a plain string or a list of content blocks;
    # flatten the latter to text so both shapes are handled the same way.
    content=response.content
    if not isinstance(content,str):
        content=''.join(
            (part.get('text') or '') if isinstance(part,dict) else str(part)
            for part in content
        )

    # Tolerate surrounding whitespace, casing and extra words around the label.
    label=content.strip().lower()

    if 'specific_question' in label:
        return 'raw'

    if 'summary_request' in label:
        return 'summary'

    # Unrecognised reply: fall back to the raw-document route rather than
    # returning None, which is not a key of the route mapping.
    return 'raw'
    

    
    

# @tools
def vector_search_for_summ_db(state:Youtube):
    """Query the summarized retriever with the question and add the hits to the state.

    Builds ``Retriever().summarized_retriever(k=3)``, invokes it with
    ``state['question']`` and extends ``state['documents']`` with the returned
    items, so the list stays flat.

    Args:
        state: graph state; ``question`` is read and ``documents`` is extended in place.

    Returns:
        The same ``state`` object, with the retrieved documents added.

    NOTE(review): the lookup of original chunk IDs (``raw_chunks_id``) from the
    summary metadata, described in the design notes, is not performed here.
    """

    summ_retriever=Retriever().summarized_retriever(k=3)

    response=summ_retriever.invoke(state['question'])

    # extend, not append: appending would nest the whole result list as a single
    # element of `documents`. setdefault covers a state created without the key.
    state.setdefault('documents',[]).extend(response)
    return state


# @tools
def vector_search_for_raw_db(state:Youtube):
    """Query the raw retriever with the question and add the hits to the state.

    Builds ``Retriever().raw_retriever(k=3)``, invokes it with
    ``state['question']`` and extends ``state['documents']`` with the returned
    items, so the list stays flat.

    Args:
        state: graph state; ``question`` is read and ``documents`` is extended in place.

    Returns:
        The same ``state`` object, with the retrieved documents added.
    """

    raw_retriever=Retriever().raw_retriever(k=3)

    response=raw_retriever.invoke(state['question'])

    # extend, not append: appending would nest the whole result list as a single
    # element of `documents`. setdefault covers a state created without the key.
    state.setdefault('documents',[]).extend(response)
    return state


def generate_response(state:Youtube):
    """Feed the collected documents to the summarizing prompt and return the reply text.

    Args:
        state: graph state; only ``state['documents']`` is read and it is passed
            to the chain under the key ``'docs'``.

    Returns:
        The ``content`` attribute of the model's reply.

    NOTE(review): this function is registered as a graph node but returns the
    reply content rather than a state-update mapping — confirm the graph
    accepts that when it is actually invoked.
    """
    llm=ChatAnthropic(model='claude-sonnet-4-20250514')
    pipeline=summarizing_prompt() | llm

    reply=pipeline.invoke({'docs':state['documents']})
    return reply.content






# Assemble the workflow: ingest the video, route to one retriever, then answer.
graph = StateGraph(Youtube)

# Register each node under the name the edges below refer to.
for _node_name, _node_fn in (
    ('ingestion', ingesting_video),
    ('raw_db', vector_search_for_raw_db),
    ('summ_db', vector_search_for_summ_db),
    ('generate_response', generate_response),
):
    graph.add_node(_node_name, _node_fn)

graph.add_edge(START, 'ingestion')

# After ingestion, `condition` picks a label; the mapping turns it into a node name.
graph.add_conditional_edges(
    'ingestion',
    condition,
    {'raw': 'raw_db', 'summary': 'summ_db'},
)

# Both retrieval branches converge on the answer node, which ends the run.
for _retrieval_node in ('raw_db', 'summ_db'):
    graph.add_edge(_retrieval_node, 'generate_response')

graph.add_edge('generate_response', END)

app = graph.compile()




    
    
    # query=input('Enter your query')

    
    # result=app.invoke({
    #     'video_id':'-8NURSdnTcg',
    #     'documents':[],
    #     'question':query
    # })

    # print(result)

In [13]:

# Print the drawing: as a bare last expression the string is shown as a repr with escaped newlines.
print(app.get_graph().draw_ascii())


ImportError: Install grandalf to draw graphs: `pip install grandalf`.

In [9]:
# %pip installs into the environment of the running kernel; the version is pinned to the one recorded in this notebook's output.
%pip install grandalf==0.8



In [11]:
# %pip inspects the environment of the running kernel, which is the one the ImportError came from.
%pip show grandalf

Name: grandalf
Version: 0.8
Summary: Graph and drawing algorithms framework
Home-page: https://github.com/bdcht/grandalf
Author: Axel Tillequin
Author-email: bdcht3@gmail.com
License: GPLv2 | EPLv1
Location: C:\Users\rawat\anaconda3\envs\yt_summarizer\Lib\site-packages
Requires: pyparsing
Required-by: 


In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
from langchain_anthropic import ChatAnthropic
import youtube_transcript_api
from ai_core.prompts import dividing_prompt
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from youtube_transcript_api import YouTubeTranscriptApi
import os
from dotenv import load_dotenv
load_dotenv()

# Read the key once and fail with a clear message when it is missing:
# assigning None into os.environ would otherwise raise an opaque TypeError.
_anthropic_key = os.getenv('ANTHROPIC_API_KEY')
if not _anthropic_key:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it or add it to the .env file read by load_dotenv()"
    )
os.environ['ANTHROPIC_API_KEY'] = _anthropic_key
import uuid
from langchain_chroma import Chroma

"""PREPROCESSING AND CONVERTING TO RAW AND SUMM DOCS"""

"""ADDING TIMESTAMP LATER ON"""


# Module-level summarisation chain: an Ollama 'llama3' chat model fed by a one-variable prompt.
model = ChatOllama(model='llama3')

# {doc} is the template's only placeholder: the text to be summarised.
prompt = ChatPromptTemplate.from_template(
    template='''You are a summarizer agent who will summarize the given text AND ALSO JUST GIVE SUMMARY DONT TRY TO MAKE CONVERSATION LIKE 'HERE IS THE SUMMARY TYPE' THING AND ALSO MAKE SUMMARY SUCH THAT MAIN THINGS ARE NOT LOST 
    {doc}
    '''
)

# Pipe the rendered prompt into the model.
chain = prompt | model


class PreProcessing:
    """Fetches the transcript of one YouTube video and has an LLM organise it."""

    def __init__(self,video_id):
        """Store the video id; nothing is fetched until ``transcribing_video`` runs."""
        self.video_id=video_id
        # "" until a transcript has been fetched, and again after a failed fetch.
        self.transcript=""


    def transcribing_video(self):
        """Fetch the transcript for ``self.video_id`` and keep it on ``self.transcript``.

        This is best-effort: a failed fetch is reported with ``print`` and leaves
        ``self.transcript`` as ``""``. The completion message is printed only when
        the fetch succeeded.
        """
        api=YouTubeTranscriptApi()
        try:
            script=api.fetch(video_id=self.video_id)
        except Exception as e:
            print(f'Transcript failure for the video_id {self.video_id} Reason may be video_id is wrong The real error is {e}')
            # Do not fall through: reporting success after a failure is misleading.
            self.transcript=""
            return

        self.transcript=script
        print('Transcription is done')

    def organising_transcript(self):
        """Send the fetched transcript through ``dividing_prompt() | ChatAnthropic``.

        The transcript is converted with ``str()`` and passed under the key
        ``'transcript'``.

        Returns:
            The ``content`` attribute of the model's reply.

        Raises:
            ValueError: if no transcript is available (nothing fetched yet, the
                fetch failed, or the transcript is empty), so that no model call
                is made with empty input.
        """
        if not self.transcript:
            raise ValueError(
                f'No transcript available for video_id {self.video_id}; '
                'call transcribing_video() first and check that it succeeded'
            )

        model=ChatAnthropic(model='claude-sonnet-4-20250514',max_tokens_to_sample=4096)

        prompt=dividing_prompt()

        chain=prompt | model

        response=chain.invoke({'transcript':str(self.transcript)})

        return response.content



In [36]:
# Fetch the sample video's transcript, then have the LLM organise it.
preprocess = PreProcessing('-8NURSdnTcg')
preprocess.transcribing_video()
response = preprocess.organising_transcript()


Transcription is done


In [38]:
# Display the text returned by preprocess.organising_transcript().
response


'[\n  {\n    "title": "Introduction and Mesmer the Impaler",\n    "summary": "The video begins by introducing the interconnected lore of Shadow of the Erdtree\'s main bosses. The focus starts with Mesmer, son of Marika and elder brother to possibly Melina, who was cursed from birth with the Abyssal Serpent that ate away at his kindling. Marika sealed the curse by plucking out his right eye and placing a seal of Grace, then sent him on a crusade to the shadow realm to purge the hornsent people who had murdered her bloodline. Mesmer was accompanied by shunned allies including Commander Gaius and his army, as well as knights from renowned families and Rellana. After killing countless hornsent and impaling them on spears, Mesmer was abandoned by his mother, leading him to desperately pluck out his own eye and release the Abyssal Serpent in his final moments while calling for his mother.",\n    "start_time": "00:00:00",\n    "end_time": "00:01:57"\n  },\n  {\n    "title": "The Dancing Lion"

In [39]:
import json  # no visible cell imports json, so a fresh kernel would raise NameError here

# Parse the model's JSON text into Python objects.
s=json.loads(response)

In [44]:
# Inspect the entry at index 10 of the parsed result of json.loads(response).
s[10]

{'title': 'Miquella and the Final Revelation',
 'summary': "Miquella is an Empyrean with the ability to charm others and manipulate them into doing his bidding. His goal was to be reborn as a god, for which he abandoned parts of his body throughout the shadow realm, including his other self and loving part - St. Trina, who desires his death, believing godhood would be a prison. To ascend at the Divine Gateway where Marika also ascended, Miquella needed both a Lord and a vessel. He charmed Mohg to enter the shadow realm and used his body as a vessel. For a Lord's soul, he had wished since childhood for Radahn to be his consort, but when Radahn refused, Miquella sent Malenia to kill him (revealed to say 'Miquella awaits thee, O promised consort'). After the player kills both Mohg and Radahn, Miquella places Radahn's soul into Mohg's body, creating his consort to usher in an age of compassion, though the player ultimately fulfills St. Trina's wish by killing them both.",
 'start_time': '0

In [None]:
# NOTE(review): `a` is not assigned in any visible cell; on a fresh kernel this raises NameError — confirm where it came from or remove the cell.
a

"[\n  {\n    \"title\": \"Introduction to the Universe and Its Origins\",\n    \"summary\": \"The narrator introduces the fundamental question of how the universe began, encompassing everything from the tiniest particles to the largest galaxies. This chapter establishes the scope of the topic, acknowledging that while multiple scientific theories and creation myths have attempted to explain the universe's mysterious genesis, the Big Bang theory has emerged as the most widely accepted scientific explanation for the origin of everything including space, time, matter, and life itself.\",\n    \"start_time\": \"00:00:01\",\n    \"end_time\": \"00:00:38\"\n  },\n  {\n    \"title\": \"The Big Bang Theory and Initial Explosion\",\n    \"summary\": \"This chapter explains the core concepts of the Big Bang theory, describing how the universe began as an infinitely dense, hot point only a few millimeters wide, similar to a supercharged black hole. The narrative details how this tiny singularity 