In [1]:
import os
import glob
from typing import List
from langchain.chains import LLMChain
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.document_loaders import PyPDFLoader


from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# Load settings from a .env file so the os.getenv(...) lookups used later in
# this notebook can find the OpenAI/Azure configuration values.
load_dotenv()

True

In [2]:
def create_chat(t=0.0):
    """
    Build an AzureChatOpenAI object configured from environment variables.

    The API base, API version, API key, API type and deployment name are read
    from OPENAI_API_BASE, OPENAI_API_VERSION, OPENAI_API_KEY, OPENAI_API_TYPE
    and DEPLOYMENT_NAME respectively.

    Args:
        t (float): Temperature value forwarded to the model. Defaults to 0.0.

    Returns:
        AzureChatOpenAI: The configured AzureChatOpenAI object.
    """
    # Same lookup as os.getenv: returns None when the variable is not set.
    env = os.environ.get
    return AzureChatOpenAI(
        openai_api_base=env("OPENAI_API_BASE"),
        openai_api_version=env("OPENAI_API_VERSION"),
        openai_api_key=env("OPENAI_API_KEY"),
        openai_api_type=env("OPENAI_API_TYPE"),
        deployment_name=env("DEPLOYMENT_NAME"),
        temperature=t,
    )

In [3]:
# Load the source PDFs.
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running the notebook elsewhere.
pdf_paths = (
    r'C:\AI\Cursos\LangChain_Chat_Data\data\Docker_para_desenvolvedores.pdf',
    r'C:\AI\Cursos\LangChain_Chat_Data\data\Containers_com_Docker.pdf',
)
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Accumulate the documents returned by every loader into a single list.
docs = []
for loader in loaders:
    docs += loader.load()

In [4]:
# Split the loaded documents into overlapping chunks.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = text_splitter.split_documents(docs)

# Total number of chunks
len(splits)

336

In [5]:
def create_embeddings():
    """
    Build the OpenAIEmbeddings object used by the vector stores in this notebook.

    The deployment name is read from the EMBEDDING_DEPLOYMENT_NAME
    environment variable.

    Returns:
        OpenAIEmbeddings: The configured embeddings object.
    """
    deployment_name = os.getenv("EMBEDDING_DEPLOYMENT_NAME")
    return OpenAIEmbeddings(deployment=deployment_name)


# Single embeddings instance shared by the cells below.
embedding = create_embeddings()

In [6]:
# Index every chunk in a Chroma vector store.
# No persist_directory is passed to this call.
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [7]:
# Sanity check: the store should report the same number of entries as there are splits.
# NOTE(review): _collection is a private attribute of the Chroma wrapper.
stored_chunk_count = vectordb._collection.count()
print(stored_chunk_count)

336


## Maximal Marginal Relevance (MMR)

Permite impor diversidade aos resultados da pesquisa, pois busca equilibrar a relevância para a consulta
com a diversidade entre os resultados.

In [9]:
# Toy corpus for contrasting plain similarity search with MMR: the first two
# sentences both describe the large fruiting body, while the third one is
# about toxicity.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [10]:
# Small, separate Chroma store holding only the three toy sentences.
smalldb = Chroma.from_texts(texts=texts, embedding=embedding)

In [11]:
# Query used against the toy store in the next two cells.
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [12]:
# Plain similarity search for the two closest texts; in the recorded output
# both hits are the sentences about the large fruiting body.
smalldb.similarity_search(query=question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]

In [13]:
# Same query through MMR (k=2 results, fetch_k=3); in the recorded output the
# second hit is now the toxicity sentence instead of the near-duplicate.
smalldb.max_marginal_relevance_search(query=question, k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]

In [17]:
# Run an MMR search against the full PDF index.
# Note: this rebinds `question`, replacing the mushroom query used earlier.
question = "O que ele diz sobre container?"
docs_mmr = vectordb.max_marginal_relevance_search(query=question, k=3)

# Preview the first 100 characters of the first result.
docs_mmr[0].page_content[:100]

'Today, when I hear someone say “containers won’t work for us, ” I hear \nthe same voice that said “vi'

In [18]:
# Preview the first 100 characters of the second MMR result.
docs_mmr[1].page_content[:100]

'é sobreposto nessa pilha, ou seja, o container em questão\n⁵⁴https://en.wikipedia.org/wiki/Aufs\n⁵⁵htt'

In [19]:
# Preview the first 100 characters of the third MMR result.
docs_mmr[2].page_content[:100]

'7.6. Orquestrando containers Casa do Código \n \n \n \n \nFig. 7.11: WordPress instalação  \n \n \n \nExplora'