### Importing the Libraries

In [1]:
import pandas as pd
import random
import numpy as np
import faiss
import os
import requests
import json
from datasets import load_dataset
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from dotenv import load_dotenv




### Data Preprocessing

In [2]:
# Load the quotes corpus from the Hugging Face Hub and keep only rows that
# have all three fields the rest of the notebook relies on.
dataset = load_dataset("Abirate/english_quotes")
df = pd.DataFrame(dataset['train']).dropna(subset=['quote', 'author', 'tags'])

# Preview a few rows. This has to be the last expression of the cell,
# otherwise Jupyter evaluates it and throws the result away.
df.sample(5)

In [3]:
def _clean_tags(tags):
    """Strip and lower-case every string tag; non-string entries are dropped."""
    return [t.strip().lower() for t in tags if isinstance(t, str)]


def _join_tags(tags):
    """Comma-join a list of tags; anything that is not a list becomes ''."""
    return ','.join(tags) if isinstance(tags, list) else ''


# Normalise the text columns, de-duplicate on (quote, author), then derive a
# flat comma-separated copy of the tags.
df = (
    df.assign(
        quote=df['quote'].str.strip().str.lower(),
        author=df['author'].str.strip().str.lower(),
        tags=df['tags'].apply(_clean_tags),
    )
    .drop_duplicates(subset=['quote', 'author'])
    .assign(tags_str=lambda frame: frame['tags'].apply(_join_tags))
)

# Persist the cleaned table next to the notebook.
df.to_csv("cleaned_quotes.csv", index=False)

### Model Fine-Tuning

In [4]:
# Tokenise the first quote with the BERT tokenizer, padded/truncated to 64
# tokens. The result is not assigned and is not the cell's last expression,
# so it is neither stored nor displayed.
# NOTE(review): nothing else visible in this notebook uses `tokenizer`;
# confirm whether this step is still needed.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer(
    df['quote'].iloc[0],
    max_length=64,
    padding='max_length',
    truncation=True,
)

# Base sentence-embedding model, plus the (initially empty) list that will
# hold the training pairs.
model = SentenceTransformer('all-MiniLM-L6-v2')
examples = []

In [5]:
# Build (query, quote) training pairs. Every quote with at least one tag
# contributes two pairs: a tag-only query and a tag+author query. No
# author-only pair is emitted.
#
# A dedicated, seeded RNG makes the sampled tag per quote reproducible across
# kernel restarts without touching the global `random` state.
pair_rng = random.Random(42)

# Rebuild the list from scratch so re-running this cell does not append a
# second copy of every pair.
examples = []
for quote, author, tags in zip(df['quote'], df['author'], df['tags']):
    if len(tags) == 0:
        continue
    tag_query = f"quotes about {pair_rng.choice(tags)}"
    full_query = f"{tag_query} by {author}"

    examples.append(InputExample(texts=[tag_query, quote]))
    examples.append(InputExample(texts=[full_query, quote]))

In [6]:
# Fine-tune the embedding model on the (query, quote) pairs for one epoch
# and write the result to 'fine_tuned_miniLM_quotes'.
# NOTE(review): warmup_steps=100 is a fixed count; confirm it is sensible
# relative to the number of batches in `train_dataloader`.
train_dataloader = DataLoader(dataset=examples, batch_size=32, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

model.fit(
    [(train_dataloader, train_loss)],
    output_path='fine_tuned_miniLM_quotes',
    warmup_steps=100,
    epochs=1,
)

# Reload the saved model from disk and embed every cleaned quote with it.
model = SentenceTransformer('fine_tuned_miniLM_quotes')

quote_embeddings = model.encode(
    df['quote'].tolist(),
    show_progress_bar=True,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

### Building the RAG Pipeline

In [7]:
# Build a flat L2 FAISS index over the quote embeddings and save it to disk.
# The vectors are cast to float32 exactly as `retrieve_quotes` does for the
# query vector, so both sides of the search use the same dtype.
embedding_matrix = np.array(quote_embeddings).astype('float32')

dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Retrieval maps FAISS positions back to rows with `df.iloc`, so the index
# must contain exactly one vector per row of `df`.
assert index.ntotal == len(df), "FAISS index and df are out of sync"

faiss.write_index(index, "quotes_index.faiss")

In [8]:
# Pull variables from a local .env file into the process environment, then
# read the Hugging Face token from it (None when the variable is not set).
load_dotenv()
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

def query_huggingface_llm(payload, model_id="HuggingFaceH4/zephyr-7b-beta", timeout=60):
    """POST ``payload`` to the Hugging Face inference endpoint for ``model_id``.

    Args:
        payload: JSON-serialisable request body; sent unchanged.
        model_id: Model identifier appended to the inference endpoint URL.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body on success, or ``None`` when the request or the
        decoding fails (the error is printed rather than raised).
    """
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
    # Attach the header only when a token is configured; otherwise the
    # request would carry the literal string "Bearer None".
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        # raise_for_status() raised, so `response` is bound here.
        print(f"HTTP error occurred: {err}")
        print(f"Response content: {response.text}")
        if response.status_code == 503:
            print("Model is currently loading, please retry in a few seconds.")
        return None
    except Exception as err:
        # Deliberate best-effort: connection errors, timeouts and undecodable
        # bodies are reported and turned into None for the caller.
        print(f"An error occurred: {err}")
        return None

In [9]:
def _parse_tags(raw_tags):
    """Normalise a stored tags value (list, JSON text or comma-separated text) to a list."""
    if isinstance(raw_tags, list):
        return raw_tags
    if not isinstance(raw_tags, str):
        return []
    try:
        parsed = json.loads(raw_tags)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return parsed
    # Not a JSON list (plain text, or JSON that decodes to a scalar/object):
    # treat the text as a comma-separated list instead.
    return [t.strip() for t in raw_tags.split(',')] if raw_tags else []


def retrieve_quotes(user_query, top_k=5):
    """Return up to ``top_k`` quotes whose embeddings are closest to ``user_query``.

    Args:
        user_query: Free-text query; embedded with the global ``model``.
        top_k: Maximum number of hits to return. Values <= 0 yield ``[]``.

    Returns:
        A list of dicts with keys ``quote``, ``author``, ``tags`` (a list) and
        ``score`` (the distance reported by the FAISS index, rounded to four
        decimals), in the order FAISS returned them.
    """
    if top_k <= 0:
        return []

    query_embedding = model.encode([user_query])

    # Ensure float32 for FAISS consistency.
    distances, indices = index.search(np.array(query_embedding).astype('float32'), top_k)

    results = []
    for idx, score in zip(indices[0], distances[0]):
        # FAISS pads the result with -1 when the index holds fewer than
        # top_k vectors; df.iloc[-1] would silently return the last row.
        if idx < 0:
            continue

        row = df.iloc[idx]
        results.append({
            "quote": row['quote'],
            "author": row['author'],
            "tags": _parse_tags(row.get('tags', '[]')),
            "score": round(float(score), 4)
        })

    return results

In [10]:
def generate_response(user_query, retrieved_quotes, max_new_tokens=700, temperature=0.7):
    """Ask the hosted LLM to answer ``user_query`` using the retrieved quotes as context.

    Args:
        user_query: The user's original request, embedded verbatim in the prompt.
        retrieved_quotes: List of dicts with ``quote``, ``author`` and ``tags``
            keys, as produced by ``retrieve_quotes``.
        max_new_tokens: Generation length limit sent to the model.
        temperature: Sampling temperature sent to the model.

    Returns:
        The generated text, or a fixed fallback message when there are no
        quotes, the LLM call fails, or the model returns nothing usable.
    """
    if not retrieved_quotes:
        return "I couldn't find any relevant quotes for your query. Please try rephrasing."

    def _format_quote(q):
        # str() keeps the join from failing on a non-string tag.
        tags = ', '.join(str(t) for t in (q.get('tags') or []))
        return f"- \"{q['quote']}\" (Author: {q['author']}, Tags: {tags})"

    context_quotes = "\n".join(_format_quote(q) for q in retrieved_quotes)

    prompt = f"""You are an intelligent assistant that provides insightful responses based on given quotes.
The user is looking for quotes related to: "{user_query}"

Here are some relevant quotes I found:
{context_quotes}

Based on the above quotes and the user's request, provide a concise and helpful answer. You can either directly present the most relevant quote(s), or synthesize information from them to answer the user's implicit question. If the quotes don't directly answer the query, explain that but still provide the most relevant ones.

Response:
"""

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "do_sample": True,
            "return_full_text": False
        }
    }

    fallback = "I apologize, but I couldn't generate a coherent response at this moment. Please try again or rephrase your query."

    llm_response = query_huggingface_llm(payload)

    # Anything other than a non-empty list of dicts (None, an error object,
    # an empty list) is treated as a failed generation.
    if not (isinstance(llm_response, list) and llm_response and isinstance(llm_response[0], dict)):
        return fallback

    generated_text = str(llm_response[0].get('generated_text') or '').strip()
    # Defensive: drop the prompt if the backend echoed it back despite
    # return_full_text=False.
    if generated_text.startswith(prompt.strip()):
        generated_text = generated_text[len(prompt.strip()):].strip()

    # An empty generation is a failure too; never hand back a blank answer.
    return generated_text or fallback

In [12]:
# End-to-end check: retrieve the nearest quotes for one query, then let the
# LLM compose an answer from them.
query = "quotes about insanity attributed to Einstein"
top_k_results = retrieve_quotes(user_query=query)
rag_response = generate_response(user_query=query, retrieved_quotes=top_k_results)

print(rag_response)

Unfortunately, there are no direct quotes about insanity attributed to Einstein. However, the quote "no great mind has ever existed without a touch of madness" by Aristotle suggests that Einstein's brilliance may have had a touch of insanity. Additionally, Albert Einstein once said, "I have erased this line between dancer and choreographer." While this quote is not directly related to insanity, it does highlight Einstein's unique perspective and creativity, which could be seen as a sign of genius, or perhaps a touch of madness. Nevertheless, the quote "insanity is doing the same thing, over and over again, but expecting different results" is often misattributed to Einstein, although it is unclear whether he actually said it. Regardless, this quote speaks to the idea of persistent and unconventional thinking, which could be seen as both a hallmark of genius and a potential indicator of insanity.
