# RAG for CSV Data

In [3]:
# Install Libraries
# for reading csv file
# for similarity search
# for arrays
# openai
!pip install numpy pandas openai faiss-cpu




In [4]:
import pandas as pd
import faiss
import numpy as np
from openai import OpenAI

In [5]:
#store data file in variable
data=pd.read_csv("data.csv")
df = pd.DataFrame(data)
df

Unnamed: 0,Author's Name,Title,Publisher / Place,Year,Pages,Vol.,Cost
0,"Acharya,S. Chellappan,S.",Big Data and Analytics,Wiley-India,2015,334,,499
1,"Acharya,S. Chellappan,S.",Big Data and Analytics,Wiley-India,2017,334,,559
2,"Agarwal, R.K.",Engineering Chemistry,"Krishna Prakashan, Meerut",2013,496,,270
3,"Agarwal, R.K.",Engineering Chemistry,"Krishna Prakashan, Meerut",2013,496,,270
4,"Agarwal, R.K.",Engineering Chemistry,"Krishna Prakashan, Meerut",2013,434,,250
...,...,...,...,...,...,...,...
257,"Turban,E Sharda, R.",Decision support and business intelligence sys...,"Pearson Education, New Delhi",2015,672,,799
258,"Turban,E Sharda, R.",Decision support and business intelligence sys...,"Pearson Education, New Delhi",2017,672,,829
259,"Tyagi, Kavita Mishra,P.",Basic Technical Communication,"PHI, New Delhi",2013,267,,250
260,"Vaughan, Tay",Multimedia:Making it work,"Tata McGraw Hill, New Delhi",2012,465,,650


In [6]:
def generate_text_representation(row):
    author_name = row["Author's Name"]
    title = row["Title"]
    publisher = row["Publisher / Place"]
    year = row["Year"]
    return f"{author_name} - {title} - {publisher} - {year}"

df["text_representation"] = df.apply(generate_text_representation, axis=1)


In [7]:
df["text_representation"]

Unnamed: 0,text_representation
0,"Acharya,S. Chellappan,S. - Big Data and Analyt..."
1,"Acharya,S. Chellappan,S. - Big Data and Analyt..."
2,"Agarwal, R.K. - Engineering Chemistry - Krish..."
3,"Agarwal, R.K. - Engineering Chemistry - Krish..."
4,"Agarwal, R.K. - Engineering Chemistry - Krish..."
...,...
257,"Turban,E Sharda, R. - Decision support and bus..."
258,"Turban,E Sharda, R. - Decision support and bus..."
259,"Tyagi, Kavita Mishra,P. - Basic Technical Comm..."
260,"Vaughan, Tay - Multimedia:Making it work - Ta..."


In [8]:
client = OpenAI(
    api_key="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",
    base_url="https://integrate.api.nvidia.com/v1"
)

In [9]:
embeddings = []
for text in df["text_representation"].tolist():
    response = client.embeddings.create(
        input=[text],
        model="nvidia/nv-embedqa-e5-v5",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"}
    )
    embeddings.append(response.data[0].embedding)


In [10]:
embedding_dim = len(embeddings[0])
embedding_array = np.array(embeddings).astype('float32')
embedding_dim = embedding_array.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_array)

In [11]:
metadata_file = "metadata.npy"
index_file = "faiss_index"

In [12]:
np.save(metadata_file, df["text_representation"].tolist())
faiss.write_index(index, index_file)

In [13]:
def search_faiss(query, top_k=2):
    query_response = client.embeddings.create(
        input=[query],
        model="nvidia/nv-embedqa-e5-v5",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"}
    )
    query_embedding = np.array([query_response.data[0].embedding]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    metadata = np.load(metadata_file, allow_pickle=True)
    results = [(metadata[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
    return results


In [14]:
query = "Data Structures"
results = search_faiss(query)
print(f"Search Results for '{query}':")
for result, score in results:
    print(f"- {result} (Distance: {score:.2f})")

Search Results for 'Data Structures':
- Lipschutz, S.  - Schaum's Outline Data Structures - McGraw Hill Education (India) Pvt.Ltd. - 2,016 (Distance: 1.04)
- Lipschutz, S.  - Schaum's Outline Data Structures - McGraw Hill Education (India) Pvt.Ltd. - 2,015 (Distance: 1.05)


In [15]:
def beautify_results(results):
    beautified_output = []
    unique_books = {}

    for result, score in results:
        if result not in unique_books:
            unique_books[result] = {"info": result, "distance": score}

    for book in unique_books.values():
        book_info = book["info"]
        description = summarize_book(book_info)
        beautified_output.append({
            "book": book_info.split(' - ')[1],
            "author": book_info.split(' - ')[0],
            "publisher": book_info.split(' - ')[2],
            "year": book_info.split(' - ')[3],
            "description": description
        })

    return beautified_output


In [16]:
def summarize_book(book_info):
    prompt = f"Provide a concise and clear description for the following book: {book_info}. Avoid introductory phrases like 'Here is a concise description of the book' and focus on the core content."
    response = client.chat.completions.create(
        model="meta/llama-3.1-8b-instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        top_p=0.7,
        max_tokens=100,
        stream=False,
    )

    description = response.choices[0].message.content.strip()

    unwanted_phrases = [
        "Here is a concise description of the book:",
        "Here is a brief summary of the book:",
        "This is a brief description of the book:",
        "This book,"
    ]
    for phrase in unwanted_phrases:
        if description.startswith(phrase):
            description = description.replace(phrase, "").strip()

    return description


In [17]:
from IPython.display import display, Markdown

def format_results(beautified_results):
    for book in beautified_results:

        formatted_result = f"""
---
**Book:** {book['book']} <br>
**Author:** {book['author']} <br>
**Publisher:** {book['publisher']} <br>
**Year:** {book['year']} <br>
**Description:** {book['description']} <br>
"""

        display(Markdown(formatted_result))

In [18]:
def summarize_results(beautified_results):

    output = []
    for idx, book in enumerate(beautified_results, start=1):
        output.append(f"{book['book']} by {book['author']}")

    return "Hello VIPS Trainees, I got these results:\n" + "\n".join(output)

In [19]:
query = "organic chemistry"
results = search_faiss(query)

In [20]:
beautified_results = beautify_results(results)

In [21]:
print(f"Search Results for '{query}':\n")
format_results(beautified_results)
summary = summarize_results(beautified_results)
print(summary)

Search Results for 'organic chemistry':




---
**Book:** Engineering Chemistry <br>
**Author:** Agarwal, R.K.  <br>
**Publisher:** Krishna Prakashan, Meerut <br>
**Year:** 2,013 <br>
**Description:** "Engineering Chemistry" by R.K. Agarwal is a textbook that covers the fundamental principles and concepts of chemistry as applied to engineering and technology. The book provides a comprehensive overview of topics such as chemical bonding, thermodynamics, kinetics, and chemical reactions, with a focus on their practical applications in various engineering fields. It also covers topics like corrosion, extraction, and electrochemistry, making it a valuable resource for students of engineering and technology. <br>


Hello VIPS Trainees, I got these results:
Engineering Chemistry by Agarwal, R.K. 


In [22]:
!pip install -U nvidia-riva-client



In [23]:
!git clone https://github.com/nvidia-riva/python-clients.git

fatal: destination path 'python-clients' already exists and is not an empty directory.


In [24]:
import subprocess


command = [
    "python", "python-clients/scripts/tts/talk.py",
    "--server", "grpc.nvcf.nvidia.com:443", "--use-ssl",
    "--metadata", "function-id", "877104f7-e885-42b9-8de8-f6e4c6303969",
    "--metadata", "authorization", f"Bearer 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", # Enter your api key
    "--language-code", "en-US",
    "--text", summary,
    "--voice", "Magpie-Multilingual.EN-US.Aria",
    "--output", "audio.wav"
]

try:
    result = subprocess.run(command, capture_output=True, text=True, check=True)
    print("Output:", result.stdout)
except subprocess.CalledProcessError as e:
    print("Error occurred:", e.stderr)


Output: Generating audio for request...
Time spent: 3.039s



In [25]:
from IPython.display import Audio

file_path = "audio.wav"
Audio(file_path, autoplay=True)