In [1]:
import os
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
# Load variables from a local .env file into the environment, then read the
# OpenAI key; a missing variable raises KeyError right here.
load_dotenv()
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Embedding model handed to Chroma as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Open the "promotores" collection of the Chroma store kept under
# data/chroma/regulondb.
chroma_kwargs = {
    "persist_directory": "data/chroma/regulondb",
    "embedding_function": embeddings,
    "collection_name": "promotores",
}
vectorstore = Chroma(**chroma_kwargs)

  vectorstore = Chroma(


In [None]:
# Chat model that generates the answers: GPT-3.5 Turbo at temperature 0.0.
chat = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Retriever over the vector store: similarity search asking for k=100
# documents per question.
# NOTE(review): "fetch_k": 20 and "lambda_mult": 0.3 were previously tried and
# left disabled; they look like MMR tuning options -- confirm the search_type
# before re-enabling them.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 100},
)

# Conversational retrieval chain combining the chat model and the retriever;
# verbose=True is passed through to the chain.
qa_chain = ConversationalRetrievalChain.from_llm(
    retriever=retriever,
    llm=chat,
    verbose=True,
)

In [3]:
# Direct similarity search against the vector store (no retriever, no LLM).
# No `k` is passed, so the store's default number of results applies.
# The query previously read "promotores on el sigma factor"; the typo is fixed
# to "con" so the embedded text matches the retriever query used later in this
# notebook.
resultado = vectorstore.similarity_search(
    query="Listame todos los nombres de los promotores con el sigma factor sigma54"
)
resultado

[Document(metadata={'row': 2319, 'source': 'tablePromoters.tsv'}, page_content='promoter id: RDBECOLIPMC02319\npromoter name: ytfFp6\nstrand: reverse\nposition of Transcription Start Site (TSS): 4433131\nsigma factor: sigma54\npromoter sequence: aagccatctttttaatgttaataactagttaattaaagtggcatcctcccgcatcctctcTgataatgacgggatgccggg\nfirst gene: \ndistance to first gene: \nevidence: [COMP-AINF:W]\nAdditive Evidence: \nconfidence level (C: Confirmed, S: Strong, W: Weak): W\npmids associated to object: 14529615'),
 Document(metadata={'row': 1787, 'source': 'tablePromoters.tsv'}, page_content='promoter id: RDBECOLIPMC01787\npromoter name: yqeGp5\nstrand: forward\nposition of Transcription Start Site (TSS): 2985741\nsigma factor: sigma54\npromoter sequence: aaggaaatatgcctgagcagcagtcagagacataactggcacgtaaggtttgcaaccactAacccaccaatagaggggtag\nfirst gene: \ndistance to first gene: \nevidence: [COMP-AINF:W]\nAdditive Evidence: \nconfidence level (C: Confirmed, S: Strong, W: Weak): W\npmids associated t

In [6]:
# Same question sent through a retriever, this time asking for k=150 documents.
# NOTE: this rebinds the name `retriever`; an object built earlier from the
# previous retriever is not changed by this assignment.
retriever = vectorstore.as_retriever(
    search_kwargs={'k': 150},
    search_type="similarity",
)
resultado_retr = retriever.invoke(
    "Listame todos los nombres de los promotores con el sigma factor sigma54"
)
resultado_retr

[Document(metadata={'row': 2319, 'source': 'tablePromoters.tsv'}, page_content='promoter id: RDBECOLIPMC02319\npromoter name: ytfFp6\nstrand: reverse\nposition of Transcription Start Site (TSS): 4433131\nsigma factor: sigma54\npromoter sequence: aagccatctttttaatgttaataactagttaattaaagtggcatcctcccgcatcctctcTgataatgacgggatgccggg\nfirst gene: \ndistance to first gene: \nevidence: [COMP-AINF:W]\nAdditive Evidence: \nconfidence level (C: Confirmed, S: Strong, W: Weak): W\npmids associated to object: 14529615'),
 Document(metadata={'row': 1787, 'source': 'tablePromoters.tsv'}, page_content='promoter id: RDBECOLIPMC01787\npromoter name: yqeGp5\nstrand: forward\nposition of Transcription Start Site (TSS): 2985741\nsigma factor: sigma54\npromoter sequence: aaggaaatatgcctgagcagcagtcagagacataactggcacgtaaggtttgcaaccactAacccaccaatagaggggtag\nfirst gene: \ndistance to first gene: \nevidence: [COMP-AINF:W]\nAdditive Evidence: \nconfidence level (C: Confirmed, S: Strong, W: Weak): W\npmids associated t

In [None]:
# Question sent to the chain. Alternative questions kept from earlier
# experiments -- uncomment one to use it instead:
# query = "sequence of yqhDp promoter"
# query = "cual es la sequencia del promotor ykiAp?"
# query = "esta sequencia tgacgccgtgcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacGcgtttttgtcatggctttgg es del promotor aroLp1"
query = "Listame todos los nombres de los promotores que esten regulados por el sigma factor sigma54"

# Empty history: this is treated as the first turn of the conversation.
chat_history = []

# Run the question-answering chain and print the raw result object.
result = qa_chain.invoke(dict(question=query, chat_history=chat_history))
print(result)