In [1]:
import json 
import pandas as pd
from bs4 import BeautifulSoup
import re
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import faiss
import ollama
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments,Seq2SeqTrainer

In [2]:
def remove_tags(text):
    html = BeautifulSoup(text)
    return html.text
def remove_punctuations(text):
    return re.sub('[^a-zA-Z]',' ',text)
def remove_spaces(text):
    return re.sub(r'\s+',' ',text)
def remove_singles(text):
    return re.sub(r'\b[a-zA-Z]\b','' ,text)
def clean_text(text):
    text = remove_tags(text)
    text = remove_punctuations(text)
    text = remove_singles(text)
    text = remove_spaces(text)
    return text

In [3]:
with open("D:/Pythonn/RAG/Improving RAG/Hotpot/hotpot_train_v1.1.json", 'r' , encoding = 'utf-8') as file:
    data = json.load(file)

In [4]:
data_list = []
for item in data:
    question = item['question']
    answer = item['answer']
    level = item['level']
    q_type = item['type']
    _id = item['_id']
    context_texts = item['context']
    data_list.append({
        'question': question,
        'answer': answer,
        'level': level,
        'type': q_type,
        '_id': _id,
        'context': context_texts
    })

df = pd.DataFrame(data_list)

In [5]:
df.head()

Unnamed: 0,question,answer,level,type,_id,context
0,Which magazine was started first Arthur's Maga...,Arthur's Magazine,medium,comparison,5a7a06935542990198eaf050,"[[Radio City (Indian radio station), [Radio Ci..."
1,The Oberoi family is part of a hotel company t...,Delhi,medium,bridge,5a879ab05542996e4f30887e,"[[Ritz-Carlton Jakarta, [The Ritz-Carlton Jaka..."
2,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,hard,bridge,5a8d7341554299441c6b9fe5,"[[Lisa Simpson, [Lisa Marie Simpson is a ficti..."
3,What nationality was James Henry Miller's wife?,American,medium,bridge,5a82171f5542990a1d231f4a,"[[Moloch: or, This Gentile World, [Moloch: or,..."
4,Cadmium Chloride is slightly soluble in this c...,alcohol,medium,bridge,5a84dd955542997b5ce3ff79,"[[Cadmium chloride, [Cadmium chloride is a whi..."


# Initiating DB and required items

In [6]:
dimension = 384
index = faiss.IndexFlatIP(dimension)

In [7]:
def normalize_embeddings(embeddings: np.ndarray)->np.ndarray:
    norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings/norm

In [8]:
model = SentenceTransformer('all-miniLM-L6-v2')

# Inserting Data into DB

In [9]:
data = df['context'].to_list()
data = data[:100]
data = [item for sublist in data for item in sublist]

In [10]:
for i in range(len(data)):
    title = data[i][0]
    content = " ".join(data[i][1])
    text = f"{title}: {content}"
    data[i] = clean_text(text)

## key word extraction

In [11]:
last_checkpoint = "D:/Pythonn/Optimizing VectorDB/results/checkpoint-852"

In [12]:
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [13]:
def extract_keywords(paragraph):
    inputs = f"Etract Keywords from the Paragraph:\n {paragraph}"
    inputs = tokenizer(inputs, return_tensors="pt")
    outputs = finetuned_model.generate(**inputs)
    answer = tokenizer.decode(outputs[0])
    return answer

In [14]:
keywords =  []
for i in range(len(data)):
    keywords.append(extract_keywords(data[i]))

In [15]:
keywords[0]

'<pad>Radio City Indian radio station Radio City India first private FM radio station started July broadcasts earlier most cities'

In [16]:
assert len(keywords) == len(data)

In [17]:
sum_df = pd.DataFrame(keywords, columns=['text'])

In [18]:
sum_df.to_csv('sum_hotpot.csv')

In [19]:
sum_df = pd.read_csv('sum_hotpot.csv')
sum_df['text'] = sum_df['text'].apply(func=clean_text)
sumarize = sum_df['text'].to_list()

In [20]:
embeddings = model.encode(sumarize, convert_to_tensor=True)

In [21]:
id_to_metadata = {}
for i, (emb, text) in enumerate(zip(embeddings, data)):
    emb_cpu = emb.cpu().numpy()   # move to CPU and convert to numpy
    emb_cpu = emb_cpu.astype('float32')  # ensure float32 dtype (Faiss needs this)
    index.add(np.expand_dims(emb_cpu, axis=0))
    id_to_metadata[i] = {"id": f"chunk_{i}", "text": text}

In [22]:
def search_DB(query):
    # 1. Encode and normalize query embedding
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding.astype('float32')
    query_embedding = normalize_embeddings(query_embedding)

    # 2. Search Faiss index
    top_k = 10
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        metadata = id_to_metadata.get(idx, {})
        results.append({
            'id': metadata.get('id'),
            'text': metadata.get('text'),
            'score': float(dist)
        })

    return results

In [23]:
def answer(query):
    context = search_DB(query)
    context_text = "\n".join([item['text'] for item in context])
    prompt = f'Based on the given context:\n"""\n{context_text}\n"""\nAnswer the question: "{query}"'
    stream = ollama.chat(
        model='llama3.2:3b',
        messages=[{'role': 'user', 'content': prompt}],
        stream=False
    )
    return stream['message']['content']

In [25]:
for token in answer(df['question'][0]):
    print(token, end='', flush=True)

Based on the provided context, First for Women was started first. The text states that First for Women was started in 1930 (incomplete sentence). However I can tell you it was first published in 1930 while Arthur's magazine is stated to be an American literary periodical from the th century