In [59]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")

In [4]:
# Create a new index.md file
with open('index.md', 'w') as f:
    f.write('''
            # OpenAI GPT-3 Playground
            This is a simple web interface for the OpenAI GPT-3 model. 
            You can enter a prompt and the model will generate a completion.
            ''')

In [5]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./index.md")
loader.load()

[Document(metadata={'source': './index.md'}, page_content='\n            # OpenAI GPT-3 Playground\n            This is a simple web interface for the OpenAI GPT-3 model. \n            You can enter a prompt and the model will generate a completion.\n            ')]

In [7]:
from langchain_community.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv')
data = loader.load()

In [None]:
! pip install pypdf

Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
Installing collected packages: pypdf
Successfully installed pypdf-4.3.1


In [8]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("example_data/meeting_data.pdf")
pages = loader.load_and_split()

In [9]:
pages[0]

Document(metadata={'source': 'example_data/meeting_data.pdf', 'page': 0}, page_content='Reunión de planificación - 15 de agosto de 2024\nAsistentes:\n- Juan Pérez\n- María González\n- Luis Fernández\n- Ana Martínez\nAgenda:\n1. Revisión de los objetivos del trimestre pasado.\n2. Planificación de las próximas actividades.\n3. Asignación de responsabilidades.\n4. Preguntas y respuestas.\nConclusiones:\n- Se logró el 85% de los objetivos del trimestre pasado.\n- Se identificaron áreas de mejora en la comunicación interna.\n- Las actividades prioritarias para el próximo trimestre incluyen la mejora de procesos y la\ncapacitación del personal.\nLa próxima reunión está programada para el 20 de septiembre de 2024.')

Text Splitter

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # these are default values from the documentation
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# text splits for the document
chunks_pages = text_splitter.split_documents(pages)

Text embedding models

In [60]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

In [12]:
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1536)

In [15]:
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
embedded_query[:5]

[0.0054084924049675465,
 -0.0005304546211846173,
 0.038991302251815796,
 -0.002948436886072159,
 -0.008942656219005585]

Vector stores

In [None]:
! pip install langchain-chroma

In [14]:
from langchain_chroma import Chroma

db_chroma = Chroma.from_documents(chunks_pages, embeddings_model)

In [37]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

raw_documents = PyPDFLoader("example_data/meeting_data.pdf").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
embeddings_model = OpenAIEmbeddings()
vector_store = Chroma.from_documents(documents, embeddings_model)

In [51]:
retriever = vector_store.as_retriever(search_type="mmr") # Maximum Marginal Relevance (MMR)


Crea un chain para a partir de un documento guardarlo en el vector store

In [52]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

# Load the openai chat model
llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0)


SYSTEM_TEMPLATE = """
Answer the user's questions based on the below context. 
If the context doesn't contain any relevant information to the question, don't make something up and just say "I don't know":

<context>
{context}
</context>
"""
question_answering_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            SYSTEM_TEMPLATE,
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

document_chain = create_stuff_documents_chain(llm, question_answering_prompt)

In [55]:
# invoke the document chain to answer questions
from langchain_core.messages import HumanMessage

document_chain.invoke(
    {
        "context": docs,
        "messages": [
            HumanMessage(content="quiénes intervienen?")
        ],
    }
)

'En la reunión de planificación del 15 de agosto de 2024, intervinieron Juan Pérez, María González, Luis Fernández y Ana Martínez.'

In [58]:
from langchain import hub
from langchain.chains import create_retrieval_chain

retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

response = retrieval_chain.invoke({"input":"quiénes intervienen en la reunión?"})
response['answer']

Number of requested results 20 is greater than number of elements in index 6, updating n_results = 6


'Los asistentes a la reunión son Juan Pérez, María González, Luis Fernández y Ana Martínez.'