In [6]:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [122]:
caminhos = [
    "files/tuning.pdf",
    "files/comunicando.pdf",
    "files/smiles.pdf"
]

paginas = []
for caminho in caminhos:
    loader = PyPDFLoader(caminho)
    paginas.extend(loader.load())

recur_split = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100,
    separators= ["\n\n", "\n", ".", " ", ""]
)

documentos = recur_split.split_documents(paginas)

for i, doc in enumerate(documentos):
    doc.metadata["source"] = doc.metadata["source"].replace("/", "")
    doc.metadata["doc_id"] = i


In [123]:
import tempfile

diretorio = tempfile.mkdtemp()

embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [124]:
vectordb = Chroma.from_documents(
    documents=documentos,
    embedding=embeddings_model,
    persist_directory=diretorio
)

In [79]:
from langchain_ollama.chat_models import ChatOllama

llm = ChatOllama(
    model="mistral:latest"
)

In [125]:
from langchain.chains.retrieval_qa.base import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_type="mmr"),
    verbose=True
)

In [100]:
pergunta = "Como instalar o Singularity?"

chat_chain.invoke({"query": pergunta})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Como instalar o Singularity?',
 'result': ' Instale o Singularity executando as seguintes comandos no terminal:\n1. Atualize o sistema: sudo apt-get update\n2. Instale o Singularity-CE: sudo apt-get install singularity-ce\n3. Verifique a versão instalada: singularity --version',
 'source_documents': [Document(metadata={'author': 'Lucas Abner', 'creationdate': '2025-07-10T11:05:46-07:00', 'creator': 'Microsoft Word', 'doc_id': 199, 'moddate': '2025-07-10T11:05:46-07:00', 'page': 1, 'page_label': '2', 'producer': 'PyPDF', 'source': 'filescomunicando.pdf', 'total_pages': 3}, page_content='que você possa usar o mesmo terminal para fazer outras requisições. Mas ainda falta baixar o \nmodelo LLM para requisitar, então vamos fazer: \nsingularity exec ollama.sif ollama run <modelo_llm> \nAgora sim, temos o modelo pronto na máquina remota para realizar requisições. \n \n \n4. Como Requisitar na sua Máquina? \nDa sua máquina, você vai realizar da mesma maneira que foi citado anteriorm

In [118]:
from langchain.prompts import PromptTemplate

chain_prompt = PromptTemplate.from_template(
    """Utilize o contexto fornecido para responder a pergunta ao final.
    Se você não sabe a resposta, apenas diga que não sabe e não invente uma resposta.
    Utilize três frases no máximo, mantenha a resposta concisa.

    Contexto: {context}

    Pergunta: {question}

    Resposta:
"""
)

In [126]:
from langchain.chains.retrieval_qa.base import RetrievalQA

chat_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_type="mmr"),
    chain_type_kwargs={"prompt": chain_prompt},
    return_source_documents=True,
    verbose=True
)

In [144]:
pergunta = "Smile de pyridine N-oxide, do methylpyrrole e do cubane"

result = chat_chain.invoke({"query": pergunta})




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [145]:
print(result["result"])

 SMILES for Pyridine N-Oxide: O=nlcccccl
SMILES for Methylpyrrole: Cnlccccl
SMILES for Cubane: c1ccccc1


In [137]:
result["source_documents"]

[Document(metadata={'author': 'David Weininger', 'creationdate': '2001-08-20T19:36:17-04:00', 'creator': 'Acrobat Capture 3.0', 'doc_id': 269, 'moddate': '2009-08-12T09:26:31-04:00', 'page': 4, 'page_label': '5', 'producer': 'Adobe PDF Library 4.0', 'source': 'filessmiles.pdf', 'style': 'Searchable Image (Exact)', 'subject': 'J. Chem. Inf. Comput. Sci. 1988.28:31-36', 'title': 'SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules', 'total_pages': 6}, page_content='Pyridine Pyridine-N-oxide methyl and 1H-pyrrole \nPyridine is best written as nlcccccl; SMILES correctly de- \nduces that no hydrogens are attached to the nitrogen in pyr- \nidine because two aromatic bonds satisfy the normal valence \nof nitrogen. Leaving the problem of aromaticity detection \naside, neither the nitrogen in O=nlcccccl (pyridine N-oxide) \nnor the nitrogen in Cnlccccl (methylpyrrole) has a valency \nleft for an extra hydrogen at the normal valence state. In'),


In [146]:
from langchain.globals import set_debug

set_debug(True)

pergunta = "Smile de pyridine N-oxide, do methylpyrrole e do cubane"
resposta = chat_chain.invoke({"query": pergunta})

set_debug(False)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Smile de pyridine N-oxide, do methylpyrrole e do cubane"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Smile de pyridine N-oxide, do methylpyrrole e do cubane",
  "context": "Pyridine Pyridine-N-oxide methyl and 1H-pyrrole \nPyridine is best written as nlcccccl; SMILES correctly de- \nduces that no hydrogens are attached to the nitrogen in pyr- \nidine because two aromatic bonds satisfy the normal valence \nof nitrogen. Leaving the problem of aromaticity detection \naside, neither the nitrogen in O=nlcccccl (pyridine N-oxide) \nnor the nitrogen in Cnlccccl (methylpyrrole) has a valency \nleft for an extra hydrogen at the normal valence state. In\n\nwith

In [154]:
from langchain.chains.retrieval_qa.base import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_type="mmr"),
    chain_type="refine",
    return_source_documents=True
)

pergunta = "VH S85N+R87T"
resposta = chain.with_config({"temperature": 0}).invoke({"query": pergunta})
print(resposta["result"])
print(resposta["source_documents"])

 In light of the additional context provided, the question can be more accurately answered as follows: The double mutants being referred to are HER2 protein variants with specific amino acid substitutions. These substitutions are located at distinct structural positions within the protein and include A40R+K43D, R50S+R59N, and S85N+R87T (Figure 2b). The four-rule subset of SMILES used for specifying these organic compounds includes only the symbols H, C, N, O, P, S, F, C1, Br, I, and parentheses. The rules are as follows: (1) Atoms are represented by atomic symbols, (2) side-chain interactions observed in MD simulation frames were analyzed, (3) probabilities of these substitutions were obtained from language models to enhance stability and function, and (4) although SMILES allows direct specification of charges, attached hydrogens, and aromaticity, they are often not required for the vast majority of organic compounds. In addition, side-chain interactions observed in the Q89A and Q89H M

{'query': 'VH S85N+R87T',
 'result': " In the provided VH S85N+R87T residues context, it is essential to mention that these mutations are part of a group of three stabilizing double-mutants in an antibody (VH) structure. The other two mutant pairs are A40R+K43D and R50S+R59N, which were prioritized due to consistent Rosetta results between single and double mutant scans.\n\nThese triple mutations were given priority because the models' probabilities for these specific substitutions were obtained from investigated language models. The SMILES notation is a text-based representation of molecules used in computational chemistry, and it was not directly related to the VH residue mutation context provided."}