# Get Captions

In [28]:
# Playlist whose captions will be collected. The next cell accepts either a
# full YouTube playlist URL or a bare playlist ID here.
playlist_input = "https://www.youtube.com/playlist?list=PLmbUlSQHQlvhEqMXztLb2ZnLDvg20J2nD"

In [29]:
import re

# A playlist ID always travels in the `list=` query parameter. The previous
# pattern was a video-URL regex, so inputs such as
# `youtu.be/<video>?list=<playlist>` or `youtube.com/embed/<video>?list=...`
# produced the 11-character video ID instead of the playlist ID.
playlist_url_regex = r'[?&]list=([A-Za-z0-9_-]+)'

playlist_input_clean = playlist_input.strip()
match = re.search(playlist_url_regex, playlist_input_clean)

if match:
    playlist_id = match.group(1)
elif re.fullmatch(r'[A-Za-z0-9_-]+', playlist_input_clean):
    # No `list=` parameter and no URL punctuation: treat the input as a bare ID.
    playlist_id = playlist_input_clean
else:
    # Fail here with a clear message instead of sending a whole URL to the API
    # as if it were a playlist ID.
    raise ValueError(
        f"Could not find a playlist ID in {playlist_input!r}; "
        "expected a URL containing 'list=<id>' or a bare playlist ID."
    )

print(f"Extracted Playlist ID: {playlist_id}")

Extracted Playlist ID: PLmbUlSQHQlvhEqMXztLb2ZnLDvg20J2nD


In [30]:
import sys
# Run pip through the interpreter backing this kernel (sys.executable) so the
# package is installed for the environment the notebook actually uses.
!{sys.executable} -m pip install --upgrade google-api-python-client
# NOTE(review): this message is printed unconditionally; the cell does not
# check whether the pip command above succeeded.
print("Installed google-api-python-client.")

Installed google-api-python-client.


In [31]:
from googleapiclient.discovery import build
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# The key is read from the environment so it never has to be written in the
# notebook. Fail immediately when it is missing: otherwise `build()` is handed
# `None` and the problem only surfaces later, on the first API request.
api_key = os.getenv("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Define it in the environment or in a .env file."
    )
YOUTUBE_API_KEY = api_key

# Client for the YouTube Data API v3, authenticated with the developer key.
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
print("YouTube API service initialized.")

YouTube API service initialized.


In [36]:
# Upper bound on how many video IDs the playlist-listing cell collects.
max_videos = 100
print(f"Max videos to process set to: {max_videos}")

Max videos to process set to: 100


In [37]:
# Walk the playlist page by page, collecting video IDs until either the
# playlist runs out of pages or the `max_videos` cap is reached.
video_ids = []
next_page_token = None

while True:
    page = (
        youtube.playlistItems()
        .list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,  # Max results per page
            pageToken=next_page_token,
        )
        .execute()
    )

    for entry in page['items']:
        video_ids.append(entry['snippet']['resourceId']['videoId'])
        if len(video_ids) >= max_videos:
            break

    next_page_token = page.get('nextPageToken')
    cap_reached = len(video_ids) >= max_videos
    if cap_reached or not next_page_token:
        break

# Drop anything beyond the cap so the list never exceeds max_videos.
del video_ids[max_videos:]

print(f"Found {len(video_ids)} video IDs.")
print("Video IDs:", video_ids)

Found 100 video IDs.
Video IDs: ['k5juTqk-OoA', 'VQo1Eut6jhA', 'hvHNbeTsneY', 'OgYJmEB7lQk', 'Zs7qVzVu8sA', '4raib3FSIdE', 'EHXRd4oNEqM', 'KA46LrBO7TM', '47KrW28W2YU', 'HEYfSKI-kWw', 'rK90XkzEvd0', 'dpwfv92XUIM', 'pbJIaUr-Sfg', 'c5C9wn6Kd-E', '1sS7XPc4PnQ', '14uO75YlYkA', 'k27pd5LLxEs', 'y3_4F0_PM_0', 'vaMPbvvFLXI', '5A2J7jSyTAo', 'mSD3QG-5Xzg', 'rnSRXbK7H4g', 'nBfKZpku4Vo', '5gcVK_XvxSQ', 'fngLDf_K0Lw', 'b5FAVx8h-MU', '3M0ziiToATw', 'Y0-NcZBwYxo', 'OlQ1ILcyXfk', 'fpQ1pmBX5Qk', 'Ed8NqOOzNkc', 'Wcrpxsc1sT0', 'CGC-y3ySlhA', 'DSeg7Hx_GLc', 'u5UbGUuvBnI', 'vtT-QtiuwxY', 'mla3BXFGXXk', 'JPpJhsUGn6w', 'JPKJiJJDBqU', 'fgF5u05wTYM', 'AlBJwRqfD8A', 'FsrIbZfjc38', 'dU4B3HrgBxM', 'TEJxyq-I2AM', 'MJbUW84XCho', '2jfISLM_ZDY', '7C2M9qDaMIU', '6lA_9iMf7Wk', '-YKYepbihEI', 'jkD0EzqUj4o', 'ciH6t0VMw-c', 'raFnAJ1g_tU', 'tTsR_1iMYXI', 's3RV5Yjze3Q', '7wS_pfMz90M', 'FTrMuF6A_bc', 'vSaYg7BF6Hg', 'bND2kvgqQ3E', 'XCidDCrvI1E', 'dInifOpWZqg', 'mi-RiCAEdRs', 'u4NwDAi5ab0', 'Ry_Fh9W7O6k', 'T-1eGfIE2FU', 'zaMR4v

**Reasoning**:
The next step is to create a directory named `captions` to store the downloaded caption files.



In [38]:
import os

# Folder that receives the downloaded caption files.
captions_dir = "captions"
# exist_ok=True lets this cell be re-run when the folder is already there.
os.makedirs(captions_dir, exist_ok=True)
print(f"Directory '{captions_dir}' created or already exists.")

Directory 'captions' created or already exists.


In [None]:
import yt_dlp
import os


def _caption_file_exists(directory, video_id):
    """Return True when `directory` holds a file written for `video_id`.

    Files are named with the template '%(title)s-%(id)s.%(ext)s', so every
    file that belongs to a video contains '-<video_id>.' in its name.
    """
    marker = f"-{video_id}."
    return any(marker in name for name in os.listdir(directory))


# The options do not depend on the video, so they are defined once.
ydl_opts = {
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitleslangs': ['pt'],
    # NOTE(review): the processing step only reads files ending in ".srt";
    # confirm that the files produced here really carry that extension.
    'subtitlesformat': 'srt',
    'skip_download': True,
    'outtmpl': os.path.join(captions_dir, '%(title)s-%(id)s.%(ext)s'),
    'quiet': True,  # Set to True to suppress output
    # NOTE(review): 'warnings' does not look like a documented YoutubeDL option
    # (the documented switch is 'no_warnings') - confirm and drop if unused.
    'warnings': True
}

downloaded = skipped = failed = 0

for video_id in video_ids:
    # The previous check compared the literal output template against the
    # directory listing, which is never present, so nothing was ever skipped.
    if _caption_file_exists(captions_dir, video_id):
        skipped += 1
        continue

    video_url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        # A single download() call is enough; the earlier extract_info() pass
        # fetched the same metadata twice and its result was never used.
        with yt_dlp.YoutubeDL(dict(ydl_opts)) as ydl:
            ydl.download([video_url])
    except Exception as e:
        failed += 1
        print(f"‚úó Could not download captions for {video_id}. Error: {e}")
        continue

    # Report success only when a caption file actually appeared on disk.
    if _caption_file_exists(captions_dir, video_id):
        downloaded += 1
        print(f"‚úì Successfully downloaded captions for: {video_id}")
    else:
        failed += 1
        print(f"No caption file was produced for {video_id}.")

print(f"\nCaption download completed for {len(video_ids)} videos.")
print(f"Downloaded: {downloaded} | already present: {skipped} | without captions or failed: {failed}")

# Process Captions

In [14]:
import os
import pysrt
from typing import Annotated, Literal, TypedDict
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langchain_core.tools import tool
from langgraph.graph import StateGraph, START, END, MessagesState
from langgraph.prebuilt import ToolNode, tools_condition

### --- PARTE 1: EXTRAÇÃO AUTOMÁTICA DE IDs DOS ARQUIVOS BAIXADOS ---

In [15]:

import os
import re
import pysrt
from langchain_core.documents import Document # Importa√ß√£o atualizada
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

def extrair_id_do_nome_arquivo(nome_arquivo):
    """
    Extract the 11-character video ID from a caption filename.

    The ID must come right after a hyphen and right before the trailing
    ``.srt`` suffix, optionally with a two-letter lowercase language code in
    between (``.pt.srt``). Returns ``None`` when the name does not end that way.
    """
    padrao_id = re.compile(r'-([a-zA-Z0-9_-]{11})\.(?:[a-z]{2}\.)?srt$')
    encontrado = padrao_id.search(nome_arquivo)
    return encontrado.group(1) if encontrado else None

def processar_captions_baixadas(captions_dir):
    """
    Build a Chroma vector store from the ``.srt`` caption files in a folder.

    Every ``.srt`` file whose name carries a video ID is split into chunks of
    10 subtitle entries. Each non-empty chunk becomes a ``Document`` whose
    metadata stores a link to the video at the chunk's start time, the source
    filename and the video ID. The documents are embedded with a multilingual
    sentence-transformers model and persisted under ``./db_clone``.

    Chunks get stable ids (``"<filename>:<first entry index>"``) so running
    the function again over the same files does not append duplicate vectors.

    Args:
        captions_dir: Directory containing the downloaded caption files.

    Returns:
        The Chroma vector store, or ``None`` when the directory does not exist
        or no caption text could be extracted.
    """
    print(f"\n--- üß† Alimentando C√©rebro: Processando {captions_dir} ---")

    # Cheap checks come first; the embedding model is only loaded when there
    # is something to embed.
    if not os.path.exists(captions_dir):
        print(f"‚ùå Erro: A pasta {captions_dir} n√£o existe!")
        return None

    documents = []
    ids = []
    # Number of subtitle entries grouped per chunk, to give retrieval context.
    step = 10

    # Sorted so the processing order is the same on every run.
    for filename in sorted(os.listdir(captions_dir)):
        if not filename.endswith(".srt"):
            continue

        video_id = extrair_id_do_nome_arquivo(filename)
        if not video_id:
            continue

        path_completo = os.path.join(captions_dir, filename)

        # Try UTF-8 first and fall back to ISO-8859-1 on decoding errors only,
        # so unrelated failures (and Ctrl+C) are no longer swallowed.
        try:
            subs = pysrt.open(path_completo, encoding='utf-8')
        except UnicodeError:
            subs = pysrt.open(path_completo, encoding='iso-8859-1')

        for i in range(0, len(subs), step):
            chunk_subs = subs[i : i + step]
            texto = " ".join([s.text for s in chunk_subs])
            texto = re.sub(r'\s+', ' ', texto).strip()

            # Chunks with no text carry nothing worth indexing.
            if not texto:
                continue

            # Link that opens the video at the start time of this chunk.
            start = chunk_subs[0].start
            segundos = (start.hours * 3600) + (start.minutes * 60) + start.seconds
            url_com_tempo = f"https://youtu.be/{video_id}?t={segundos}s"

            documents.append(
                Document(
                    page_content=texto,
                    metadata={
                        "url": url_com_tempo,
                        "fonte": filename,
                        "video_id": video_id
                    }
                )
            )
            # Filenames are unique within the folder, so this id is unique too.
            ids.append(f"{filename}:{i}")

    if not documents:
        print("‚ö†Ô∏è Nenhuma legenda nova encontrada para processar.")
        return None

    # Multilingual embeddings (the original author notes they suit Portuguese).
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    # Persist into the vector database.
    vector_db = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        ids=ids,
        persist_directory="./db_clone"
    )

    print(f"‚úÖ Sucesso! {len(documents)} novos fragmentos de mem√≥ria integrados.")
    return vector_db


In [None]:
load_dotenv()

# --- KNOWLEDGE BASE CONFIGURATION ---
# Queries must be embedded with the same model that populated ./db_clone.
# processar_captions_baixadas() builds the store with this HuggingFace model;
# querying it with a different embedding model (previously Google's
# "models/embedding-001") puts queries in a different vector space than the
# stored chunks, so retrieval cannot work.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
vector_db = Chroma(persist_directory="./db_clone", embedding_function=embeddings)
# Return the 5 closest chunks for each query.
retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# --- DEFINIÇÃO DA FERRAMENTA (TOOL) ---
@tool
def pesquisar_memoria_renan(query: str) -> str:
    """Busca trechos de colunas, transcri√ß√µes e pensamentos do Renan Santos sobre um tema."""
    # NOTE(review): the @tool decorator presumably exposes the docstring above
    # to the model as the tool description, so it is left exactly as written.
    # Each retrieved chunk is labelled "[Trecho]:" and the chunks are separated
    # by a blank line, the layout the original comment says the master prompt
    # expects.
    trechos = [f"[Trecho]: {doc.page_content}" for doc in retriever.invoke(query)]
    return "\n\n".join(trechos)

# --- MODEL CONFIGURATION WITH THE MASTER PROMPT ---
# The persona text lives in an external file so it can be edited without
# touching the notebook.
with open("prompt_clone.txt", "r", encoding="utf-8") as prompt_file:
    master_prompt = prompt_file.read()

# Gemini chat model; the master prompt is forwarded through model_kwargs under
# the "system_instruction" key.
gemini_chat = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    model_kwargs={"system_instruction": master_prompt},
)

# Bind the memory-search tool so the model can request it.
llm = gemini_chat.bind_tools([pesquisar_memoria_renan])

# --- LÓGICA DO GRAFO (AGENT NODES) ---

def chatbot_renan(state: MessagesState):
    """
    Graph node that asks the tool-enabled model for the next message.

    The master prompt is placed in front of the full conversation history as a
    SystemMessage on every call, and the model's reply is returned unchanged
    under the "messages" key. The original author notes that a reply requesting
    a tool arrives with `tool_calls`, which LangGraph needs for routing.
    """
    system_message = SystemMessage(content=master_prompt)
    conversation = [system_message] + state["messages"]
    reply = llm.invoke(conversation)
    return {"messages": [reply]}
# --- ASSEMBLING THE FLOW (LANGGRAPH) ---
# State graph whose state schema is MessagesState.
workflow = StateGraph(MessagesState)

# 1. Add the nodes: the chatbot node and a ToolNode wrapping the search tool.
workflow.add_node("chatbot", chatbot_renan)
workflow.add_node("tools", ToolNode([pesquisar_memoria_renan]))

# 2. Define the connections (edges): execution starts at the chatbot node.
workflow.add_edge(START, "chatbot")

# 'tools_condition' decides: if the LLM asked for a tool -> go to 'tools', otherwise -> finish
workflow.add_conditional_edges(
    "chatbot",
    tools_condition,
)

# After the tool runs, control returns to the chatbot so it can use the result
workflow.add_edge("tools", "chatbot")

# Compile the agent into the runnable `app`
app = workflow.compile()