Desarrollo de Chatbot sin RAG

In [1]:
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModel,
)
import warnings
import time
import os
from dotenv import load_dotenv
from datasets import load_dataset
import torch
import pandas as pd

from pinecone import Pinecone
from pinecone import ServerlessSpec

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer_flat = T5Tokenizer.from_pretrained("google/flan-t5-large")
model_flat = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
input_text = "Please answer to the following question. What is the function of the viral KP4 protein?"

In [4]:
input_ids = tokenizer_flat(input_text, return_tensors="pt").input_ids

outputs = model_flat.generate(input_ids)
print(tokenizer_flat.decode(outputs[0]))

<pad> cytokine</s>


Desarrollo de Chatbot con RAG

In [35]:
# Uso de base de datos vectorial como pinecone
load_dotenv()
api_key = os.getenv("PINECONE_KEY")
pc = Pinecone(api_key=api_key)
spec = ServerlessSpec(cloud="aws", region="us-east-1")

index_name = "chatbot-rag"
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(name=index_name, dimension=384, metric="cosine", spec=spec)

while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)

In [26]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [36]:
dataset = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages", split='test')
dataset

Dataset({
    features: ['question', 'answer', 'relevant_passage_ids', 'id'],
    num_rows: 4719
})

In [11]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [12]:
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [37]:
# Indexar los datos
data = dataset.to_pandas().reset_index()

batch_size = 100

for i in range(0, len(data), batch_size):
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    ids = [f"{x['index']}" for i, x in batch.iterrows()]
    texts = [
        "Question: " + x["question"] + ", Answer: " + x["answer"]
        for _, x in batch.iterrows()
    ]
    embeds = [
        embed_text("Question: " + x["question"] + ", Answer: " + x["answer"])
        for _, x in batch.iterrows()
    ]
    metadata = [
        {"question": x["question"], "answer": x["answer"]} for i, x in batch.iterrows()
    ]
    index.upsert(vectors=zip(ids, embeds, metadata))

In [38]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4719}},
 'total_vector_count': 4719}

In [39]:
# Hacer querys al index
query = input_text

query_embedding = embed_text(query)
results = index.query(vector=query_embedding.tolist(), top_k=3, include_metadata=True)
for result in results["matches"]:
    print(f"{round(result['score'], 2)}: {result['metadata']['answer']}")

0.82: The main function of PABPC4 is in mRNA stability and translation initiation. PABPC4 may also play a role in chronic inflammation and in the pathogenesis of colorectal cancer.
0.8: The virally encoded fungal toxin KP4  specifically blocks L-type voltage-gated calcium channels.
0.7: The protein function as an Na-K-Cl cotransporter.


In [21]:
retrieved_docs = [
    "Question: "
    + match["metadata"]["question"]
    + ", Answer: "
    + match["metadata"]["answer"]
    for match in results["matches"]
]

In [22]:
input_text = f"Please use this context {" ".join(retrieved_docs)} to answer the following question {query}"
input_ids = tokenizer_flat(input_text, return_tensors="pt").input_ids
outputs = model_flat.generate(input_ids)
response = tokenizer_flat.decode(outputs[0], skip_special_tokens=True)
response

'The virally encoded fungal toxin KP4 specifically blocks L-type voltage'

In [34]:
pc.delete_index(index_name)