In [7]:
import faiss
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import torch

RESULTS_DIR = Path("../results")







In [8]:
# Load document metadata
df = pd.read_csv(RESULTS_DIR / "retriever_documents.csv")

# Load prebuilt IVF FAISS index
index_path = RESULTS_DIR / "faiss_index_ivf.bin"
index = faiss.read_index(str(index_path))
index.nprobe = 10  # tune for speed vs accuracy
print("Documents and FAISS index loaded.")





Documents and FAISS index loaded.


In [9]:
# Embedding model on GPU if possible
try:
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")
except RuntimeError:
    print("Insufficient GPU memory. Using CPU for embedder.")
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

# Reranker on CPU to save GPU memory
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")

# Generator model (text-generation) on GPU if available
device_id = 0 if torch.cuda.is_available() else -1
generator = pipeline("text-generation", model="gpt2", device=device_id)

print("Models loaded successfully.")



Device set to use cuda:0


Models loaded successfully.


In [10]:
def rag_chat(query, top_k=3):
    # Encode query
    query_emb = embedder.encode([query], convert_to_numpy=True)
    
    # Retrieve top_k documents from FAISS
    distances, indices = index.search(query_emb, top_k)
    retrieved_docs = df.iloc[indices[0]]
    
    # Optional reranking (on CPU)
    pairs = [[query, doc] for doc in retrieved_docs["text"].tolist()]
    scores = reranker.predict(pairs)
    reranked = sorted(zip(retrieved_docs["text"], scores), key=lambda x: x[1], reverse=True)
    
    # Prepare context for generation
    context = "\n".join([doc for doc, _ in reranked[:top_k]])
    input_text = f"Answer the question using the following context:\n{context}\n\nQuestion: {query}\nAnswer:"
    
    # Generate answer
    response = generator(input_text, max_length=200, num_return_sequences=1)[0]["generated_text"]
    return response



In [11]:
response = rag_chat("What is in the dataset?")
print(response)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer the question using the following context:
A data set is a collection of information organized as a stream of bytes in logical record and block structures for use by IBM mainframe operating systems. The record format is determined by data set organization, record format and other parameters. The physical structure of each record is nearly the same, and uniform throughout a data set. This is specified in the data control block record format parameter.
The data set lists values for each of the variables, such as height and weight of an object, for each member of the data set. Each value is known as a datum. The data set may comprise data for one or more members, corresponding to the number of rows. The term data set may also be used more loosely, to refer to the data in a collection of closely related tables, corresponding to a particular experiment or event. An example of this type is the data sets collected by space agencies performing experiments with instruments aboard space pr