In [47]:
# %pip install llama_index
# %pip install llama-index-embeddings-huggingface
# %pip install llama-index-llms-ollama


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

from llama_index.agent.openai import OpenAIAgent
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
import os
from llama_index.core import SummaryIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.agent import ReActAgent
from llama_index.core.tools.query_engine import QueryEngineTool

In [48]:
import re

# Folder that holds the transcript files.
directory_path = "C:/Users/gmsol/Desktop/videodescargas/transcripciones"

# Names (not full paths) of the regular files directly inside that folder;
# sub-directories are filtered out.
transcripciones = list(
    filter(
        lambda name: os.path.isfile(os.path.join(directory_path, name)),
        os.listdir(directory_path),
    )
)

# Matches names made up solely of ASCII letters, digits, '_' and '-'.
pattern = re.compile(r'^[a-zA-Z0-9_-]+$')

# Function to sanitize filenames
def sanitize_filename(filename: str) -> str:
    """Return ``filename`` with every character outside ``[a-zA-Z0-9_-]`` removed.

    Dots are removed as well, so a file extension is merged into the rest of
    the name (``"a b.txt"`` becomes ``"abtxt"``).
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_-]')
    return disallowed.sub('', filename)

# Check and rename files if necessary.
# `renamed` maps each original name to the name the file now has on disk, so
# the listing can be brought back in sync once the loop is done.
renamed = {}
for transcripcion in transcripciones:
    filename = os.path.basename(transcripcion)
    if pattern.match(filename):
        continue  # already made up of allowed characters only

    new_filename = sanitize_filename(filename)
    old_path = os.path.join(directory_path, filename)
    new_path = os.path.join(directory_path, new_filename)

    # Leave the file untouched when sanitizing strips every character or when
    # the target name is already taken: os.rename would raise on Windows and
    # replace the existing file on POSIX.
    if not new_filename or os.path.exists(new_path):
        print(f"Skipped '{filename}': cannot rename to '{new_filename}'")
        continue

    os.rename(old_path, new_path)
    renamed[filename] = new_filename
    print(f"Renamed '{filename}' to '{new_filename}'")

# `transcripciones` was listed before renaming; without this refresh it would
# still hold names that no longer exist on disk and later cells that open the
# files by name would fail.
transcripciones = [renamed.get(name, name) for name in transcripciones]

In [49]:
import os
import getpass
import openai

# The key is never stored in the notebook. A key that is already present in
# the environment is reused; only when none is set is the user prompted for
# one (the input is not echoed). Assigning a literal here would clobber a key
# configured outside the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [50]:
import torch

# Check whether a GPU is available and fall back to the CPU otherwise.
# `device` keeps whatever this check selected; it must not be reassigned to
# "cuda" afterwards, or the CPU fallback is lost on machines without a GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU")


GPU is available


In [51]:
import os

# Load every transcript on its own: `docs` maps a file name from
# `transcripciones` to the documents read from that single file.
# The folder comes from `directory_path` (the variable the listing was built
# from) rather than from a second hard-coded copy of the path.
docs = {}
for transcripcion in transcripciones:
    docs[transcripcion] = SimpleDirectoryReader(
        input_files=[f"{directory_path}/{transcripcion}"],
    ).load_data()

In [52]:



from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Read every file of the transcript folder in one go. The folder is taken from
# `directory_path` so that it cannot drift from the listing in `transcripciones`.
documents = SimpleDirectoryReader(directory_path).load_data()

# Global defaults: LLM and embedding model stored on `Settings`.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")



In [53]:
# Sanity check: number of entries in `docs` (one entry per name in `transcripciones`).
print(f"Loaded {len(docs)} docs")

Loaded 4 docs


In [54]:
# Display the model name of the LLM currently stored on `Settings`.
Settings.llm.model

'gpt-3.5-turbo'

In [55]:
node_parser = SentenceSplitter()

# Build agents dictionary: one ReAct agent and one plain vector query engine
# per transcript, both keyed by the transcript's file name.
agents = {}
query_engines = {}

# this is for the baseline
all_nodes = []

for transcripcion in transcripciones:
    nodes = node_parser.get_nodes_from_documents(docs[transcripcion])
    all_nodes.extend(nodes)

    # Each transcript has its own on-disk vector index. An index that already
    # exists is loaded as-is (delete its folder to force a rebuild).
    persist_dir = f"./indexes/{transcripcion}"
    if not os.path.exists(persist_dir):
        # build vector index
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(persist_dir=persist_dir)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir),
        )

    # build summary index (always rebuilt in memory from the parsed nodes)
    summary_index = SummaryIndex(nodes)
    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=Settings.llm)
    summary_query_engine = summary_index.as_query_engine(llm=Settings.llm)

    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    "Useful for questions related to specific aspects in relation to the topic:"
                    f" {transcripcion}. "
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for any requests that require a holistic summary"
                    f" of EVERYTHING about {transcripcion}. For questions about"
                    " more specific sections, please use the vector_tool."
                ),
            ),
        ),
    ]

    # NOTE(review): ReActAgent.from_tools does not take a `system_prompt`
    # argument (that is an OpenAIAgent parameter; extra keyword arguments
    # appear to be dropped), so the instructions are passed through `context`,
    # which the ReAct prompt template includes. Confirm against the installed
    # llama_index version.
    agent = ReActAgent.from_tools(
        query_engine_tools,
        verbose=True,
        context=f"""\
        You are a specialized agent designed to answer queries about {transcripcion}.
        You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
        """,
    )
    agents[transcripcion] = agent
    query_engines[transcripcion] = vector_index.as_query_engine(
        similarity_top_k=2
    )

In [56]:

# Wrap every per-transcript agent in a tool, described by the transcript's
# name, and collect the tools in `all_tools`.
all_tools = []
for transcripcion in transcripciones:
    transcript_summary = (
        f"This content contains the transcription of a video called {transcripcion}. Use"
        f" this tool if you want to answer any questions about {transcripcion}.\n"
    )
    tool_metadata = ToolMetadata(
        name=f"tool_{transcripcion}",
        description=transcript_summary,
    )
    doc_tool = QueryEngineTool(
        query_engine=agents[transcripcion],
        metadata=tool_metadata,
    )
    all_tools.append(doc_tool)

In [57]:
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex

# Build an object index over the per-transcript tools, backed by a
# VectorStoreIndex.
obj_index = ObjectIndex.from_objects(all_tools, index_cls=VectorStoreIndex)

In [58]:
from llama_index.agent.openai import OpenAIAgent

# Tools are not passed to the agent directly; it receives a retriever over
# `obj_index` configured with similarity_top_k=3.
tool_retriever = obj_index.as_retriever(similarity_top_k=3)

top_agent_prompt = """ \
You are an agent designed to answer queries about a video transcriptions.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

"""

top_agent = OpenAIAgent.from_tools(
    tool_retriever=tool_retriever,
    system_prompt=top_agent_prompt,
    verbose=True,
)

In [64]:
# Ask the top-level agent a question (written in Spanish) and keep the result in `response`.
response = top_agent.query("Segun el doctor del sueño, que podemos hacer para dormir mejor?")

Added user message to memory: Segun el doctor del sueño, que podemos hacer para dormir mejor?
=== Calling Function ===
Calling function: tool_UnaClaseconelDrdelSueotxt with args: {"input":"Segun el doctor del sueño, que podemos hacer para dormir mejor?"}
[1;3;38;5;200mThought: The user is asking for advice on how to sleep better according to the sleep doctor in the text UnaClaseconelDrdelSueotxt. I should use a tool to help me answer the question.
Action: vector_tool
Action Input: {'input': 'Consejos para dormir mejor según el doctor del sueño en UnaClaseconelDrdelSueotxt'}
[0m[1;3;34mObservation: Para dormir mejor según el doctor del sueño en "Una Clase con el Dr. del Sueño", se pueden seguir los siguientes consejos: evitar dormir boca arriba, optar por dormir de lado, realizar siestas cortas como complemento de una buena noche de sueño, desconectar antes de dormir mediante técnicas como la respiración o la meditación, y levantarse de la cama si no se logra conciliar el sueño para 

In [66]:
# Second question to the top-level agent (written in Spanish); `response` is overwritten with the new result.
response = top_agent.query("De que habla el podcast del doctor del sueño")

Added user message to memory: De que habla el podcast del doctor del sueño
=== Calling Function ===
Calling function: tool_UnaClaseconelDrdelSueotxt with args: {"input":"De que habla el podcast del doctor del sueño"}
[1;3;38;5;200mThought: The current language of the user is: Spanish. I need to use a tool to help me answer the question.
Action: summary_tool
Action Input: {'input': 'UnaClaseconelDrdelSueotxt'}
[0m[1;3;34mObservation: The text delves into a conversation with a sleep expert covering various topics related to sleep, including the impact of light exposure on sleep quality, the importance of maintaining a consistent sleep routine, and the potential benefits of smart lighting solutions for mental health. The expert also touches on the effects of physical activity and sexual activity on sleep, emphasizing the significance of avoiding screen time before bedtime for better sleep quality.
[0m[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user'