In [1]:
# Install required packages
!pip install contractions faiss-cpu sentence-transformers rouge-score fastapi uvicorn joblib

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.12

In [2]:
# Import libraries
import pandas as pd
import re
import contractions
import joblib
import faiss

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from rouge_score import rouge_scorer

In [3]:
# 1. Load data
# Three question/answer CSVs are read, in this order, from /content.
# NOTE(review): rows displayed from the merged frame contain mojibake such as
# "â€™", so at least one of these files appears to hold mis-decoded UTF-8 —
# confirm and repair at the source.
source_csv_paths = (
    "/content/MedQuAD.csv",
    "/content/correct_answers.csv",
    "/content/QAData - Sheet1.csv",
)
MedQuAD_df, correct_answers_df, cse_qa_dataset_df = (
    pd.read_csv(csv_path) for csv_path in source_csv_paths
)

In [4]:
# Keep only question & answer columns
qa_columns = ['question', 'answer']
MedQuAD_df = MedQuAD_df[qa_columns]
correct_answers_df = correct_answers_df[qa_columns]

# This dataset uses capitalised headers; align them before selecting.
cse_qa_dataset_df = cse_qa_dataset_df.rename(columns={'Question': 'question', 'Answer': 'answer'})
cse_qa_dataset_df = cse_qa_dataset_df[qa_columns]

# Merge.
# Rows with a missing question or answer are dropped here: the preprocessing
# step stringifies every cell, so a NaN would otherwise become the literal
# text "nan" and be embedded and retrieved like a real passage.
combined_df = (
    pd.concat([MedQuAD_df, correct_answers_df, cse_qa_dataset_df], ignore_index=True)
    .dropna(subset=qa_columns)
    .reset_index(drop=True)  # keep row labels contiguous after dropping rows
)
combined_df

Unnamed: 0,question,answer
0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...
...,...,...
49832,Does *OpenAIâ€™s GPT-4* comply with EU AI Actâ...,Yes â€“ general-purpose AI with systemic risk ...
49833,What is Toronto Declaration on ML fairness?,2018 framework for human rights impact assessm...
49834,Is ethical hacking legal without written consent?,No â€“ violates CFAA (US) and Computer Misuse ...
49835,Was Colonial Pipeline ransomware payment ($4.4...,Debated â€“ saved operations but funds crimina...


In [5]:
# 3. Light preprocessing
def preprocess_text_light(text):
    """Lightly normalise a question or answer string.

    Steps, in order: expand contractions via ``contractions.fix``, strip
    HTML-style tags, lower-case, trim, and collapse every whitespace run to a
    single space.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and look
    # like real text downstream. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = contractions.fix(str(text))
    # Strip only spans that look like tags (<p>, </div>, <br/>, <!-- ... -->).
    # A bare `<.*?>` would also delete everything between a "<" and a later
    # ">" used as comparison signs, e.g. "bmi < 18 or > 30".
    text = re.sub(r'<(?:/?[A-Za-z]|!)[^>]*>', '', text)
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    return text

# Add a cleaned copy of each text column next to the raw one.
for source_col, target_col in (
    ('question', 'processed_question'),
    ('answer', 'processed_sentence'),
):
    combined_df[target_col] = combined_df[source_col].apply(preprocess_text_light)

In [6]:
# Sentence-embedding model. The name is kept in one place because it is also
# persisted at the end of this cell.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embed every preprocessed answer; `corpus` is the list later indexed by the
# ids the FAISS search returns.
corpus = combined_df['processed_sentence'].tolist()
corpus_embeddings = embedding_model.encode(corpus, convert_to_numpy=True)

# Build an IndexFlatL2 sized to the embedding width and add all vectors.
embedding_dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(corpus_embeddings)

# Persist the processed frame, the index, and the model name.
combined_df.to_csv("processed_dataset.csv", index=False)
faiss.write_index(index, "faiss_index.bin")
joblib.dump(EMBEDDING_MODEL_NAME, "embedding_model_name.pkl")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

['embedding_model_name.pkl']

In [7]:
# Text-to-text generation pipeline backed by google/flan-t5-base; the RAG
# function below calls it to write the final answer from the built prompt.
qa_gen_model = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
)

def rag_qa_faiss_generator_sampling(question, top_k=3, max_context_chars=400):
    """Answer ``question`` by retrieving passages and sampling a generation.

    The question is cleaned with ``preprocess_text_light``, embedded with the
    module-level ``embedding_model``, and used to search the module-level
    FAISS ``index``. The matching entries of ``corpus`` are truncated, joined
    into one context string, and passed with the original (unprocessed)
    question to ``qa_gen_model``.

    Args:
        question: The user's question.
        top_k: Number of nearest passages to request from the index.
        max_context_chars: Maximum number of characters kept from each
            retrieved passage.

    Returns:
        The generated answer text with surrounding whitespace stripped.
        Generation uses sampling (``do_sample=True``), so repeated calls may
        return different text.
    """
    question_processed = preprocess_text_light(question)
    q_embedding = embedding_model.encode([question_processed], convert_to_numpy=True)

    # Only the ids are needed; the distances are not used.
    _, indices = index.search(q_embedding, top_k)

    # FAISS fills result slots it cannot satisfy (fewer than top_k vectors in
    # the index) with -1. Skip those: corpus[-1] would silently inject the
    # last passage of the corpus into the context.
    retrieved_passages = [
        corpus[i][:max_context_chars]
        for i in indices[0]
        if i >= 0
    ]
    retrieved_context = " ".join(retrieved_passages)

    prompt = f"Answer the question based on the context below.\n\nContext: {retrieved_context}\n\nQuestion: {question}\nAnswer:"
    output = qa_gen_model(prompt, max_new_tokens=80, do_sample=True, temperature=0.7, top_p=0.9)
    return output[0]['generated_text'].strip()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [8]:
# Smoke-test the pipeline end to end with a single question.
sample_question = "what is a sea?"
rag_answer = rag_qa_faiss_generator_sampling(sample_question)
print("RAG Answer:", rag_answer)

RAG Answer: one or all of the major divisions of the planet's world ocean – they are, in descending order of area, the pacific , atlantic , indian , southern (antarctic) , and arctic oceans
