# RAG

## Requirements

In [None]:
%%capture
!pip install transformers accelerate bitsandbytes langchain langchain-community sentence-transformers faiss-gpu pandas gdown

## Dataset

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI/view?usp=sharing

Downloading...
From (original): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI
From (redirected): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI&confirm=t&uuid=032e3473-dc99-4ffd-9995-b51704008787
To: /content/IMDB_crawled.json
100% 292M/292M [00:07<00:00, 39.8MB/s]


## Config

In [None]:
class Config:
    EMBEDDING_MODEL_NAME="thenlper/gte-base"
    LLM_MODEL_NAME="HuggingFaceH4/zephyr-7b-beta"
    K = 5 # top K retrieval

## Preprocessing

In [None]:
import pandas as pd

df = pd.read_json('IMDB_crawled.json')

In [None]:
df.head(5)

Unnamed: 0,id,title,first_page_summary,release_year,mpaa,budget,gross_worldwide,rating,directors,writers,stars,related_links,languages,countries_of_origin,summaries,synposis,reviews,genres
0,tt0071562,The Godfather Part II,The early life and career of Vito Corleone in ...,1974,R,"$13,000,000 (estimated)","$47,962,683",9.0,[Francis Ford Coppola],,"[Al Pacino, Robert De Niro, Robert Duvall]",[https://imdb.com/title/tt0068646/?ref_=tt_sim...,"[English, Italian, Spanish, Latin, Sicilian]",[United States],[The early life and career of Vito Corleone in...,[The Godfather Part II presents two parallel s...,"[[Coppola's masterpiece is rivaled only by ""Th...","[Crime, Drama]"
1,tt0120737,The Lord of the Rings: The Fellowship of the Ring,A meek Hobbit from the Shire and eight compani...,2001,PG-13,"$93,000,000 (estimated)","$884,041,698",8.9,[Peter Jackson],,"[Elijah Wood, Ian McKellen, Orlando Bloom]",[https://imdb.com/title/tt0167261/?ref_=tt_sim...,"[English, Sindarin]","[New Zealand, United States]",[A meek Hobbit from the Shire and eight compan...,[Galadriel (Cate Blanchett) (The Elven co-rule...,"[[Here is one film that lived up to its hype, ...","[Action, Adventure, Drama]"
2,tt0110912,Pulp Fiction,"The lives of two mob hitmen, a boxer, a gangst...",1994,R,"$8,000,000 (estimated)","$213,928,762",8.9,[Quentin Tarantino],,"[John Travolta, Uma Thurman, Samuel L. Jackson]",[https://imdb.com/title/tt0137523/?ref_=tt_sim...,"[English, Spanish, French]",[United States],"[The lives of two mob hitmen, a boxer, a gangs...",[Narrative structure\nPulp Fiction's narrative...,[[I like the bit with the cheeseburger. It mak...,"[Crime, Drama]"
3,tt0068646,The Godfather,The aging patriarch of an organized crime dyna...,1972,R,"$6,000,000 (estimated)","$250,342,030",9.2,[Francis Ford Coppola],,"[Marlon Brando, Al Pacino, James Caan]",[https://imdb.com/title/tt0071562/?ref_=tt_sim...,"[English, Italian, Latin]",[United States],[The aging patriarch of an organized crime dyn...,"[In late summer 1945, guests are gathered for ...",[['The Godfather' is the pinnacle of flawless ...,"[Crime, Drama]"
4,tt0111161,The Shawshank Redemption,"Over the course of several years, two convicts...",1994,R,"$25,000,000 (estimated)","$28,904,232",9.3,[Frank Darabont],"[Stephen King, Frank Darabont]","[Tim Robbins, Morgan Freeman, Bob Gunton]",[https://imdb.com/title/tt0468569/?ref_=tt_sim...,[English],[United States],"[Over the course of several years, two convict...","[In 1947, Andy Dufresne (Tim Robbins), a banke...",[[The Shawshank Redemption is written and dire...,[Drama]


In [None]:
import os

os.makedirs('data', exist_ok=True)

columns_to_keep = ['title', 'release_year', 'genres', 'first_page_summary', 'rating', 'directors', 'stars']
df = df[columns_to_keep]

df.to_csv('data/imdb.csv', index=False)

In [None]:
df.head(5)

Unnamed: 0,title,release_year,genres,first_page_summary,rating,directors,stars
0,The Godfather Part II,1974,"[Crime, Drama]",The early life and career of Vito Corleone in ...,9.0,[Francis Ford Coppola],"[Al Pacino, Robert De Niro, Robert Duvall]"
1,The Lord of the Rings: The Fellowship of the Ring,2001,"[Action, Adventure, Drama]",A meek Hobbit from the Shire and eight compani...,8.9,[Peter Jackson],"[Elijah Wood, Ian McKellen, Orlando Bloom]"
2,Pulp Fiction,1994,"[Crime, Drama]","The lives of two mob hitmen, a boxer, a gangst...",8.9,[Quentin Tarantino],"[John Travolta, Uma Thurman, Samuel L. Jackson]"
3,The Godfather,1972,"[Crime, Drama]",The aging patriarch of an organized crime dyna...,9.2,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan]"
4,The Shawshank Redemption,1994,[Drama],"Over the course of several years, two convicts...",9.3,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton]"


## Vectorizer

load the CSV file and vectorize the rows using HuggingFaceEmbeddings.
Store the results using FAISS vectorstore.
Save the vectorestore in a pickle file for future usages.

In [14]:
import pickle
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load the CSV file
csv_path = 'data/imdb.csv'
loader = CSVLoader(csv_path)
documents = loader.load()

model_name = Config.EMBEDDING_MODEL_NAME
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


vectorstore = FAISS.from_documents(documents, hf, distance_strategy=DistanceStrategy.COSINE)

# with open("data/vectorstore.pkl", "wb") as f:
#     pickle.dump(vectorstore, f)

load the vectorstore as a retriever.

In [16]:
# with open("data/vectorstore.pkl", "rb") as f:
#     vectorstore = pickle.load(f)

# load the retriever from the vectorstore
K = Config.K
retriever = vectorstore.as_retriever(k=K)

## LLM

load the quantized LLM.

In [34]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# load the quantization config
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True)

model = AutoModelForCausalLM.from_pretrained(Config.LLM_MODEL_NAME, quantization_config=bnb_config, device_map="cuda:0")
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL_NAME)

# init the pipeline
READER_LLM = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100)

llm = HuggingFacePipeline(
    pipeline=READER_LLM,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

initialize the prompt template for the query chain. query chain is used to get a query from the chat history. you may change the prompt as you like to get better results.

In [61]:
from langchain.prompts import PromptTemplate

from langchain_core.output_parsers import StrOutputParser

class LoggerStrOutputParser(StrOutputParser):
    def parse(self, text: str) -> str:
        # process the LLM output
        print(f"QUERY: {text}")
        return text

query_transform_prompt = PromptTemplate(
    input_variables=["messages"],
    template="""<|system|>You are a helpful assistant.
{messages}
<|user|>
give me the search query about the above conversation.
<|assistant|>"""
)

# init the query chain
query_transforming_retriever_chain = query_transform_prompt|llm|LoggerStrOutputParser()|retriever


initialize the main retrieval chain that gives the resulting documents to LLM and gets the output back.

In [62]:
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain_core.runnables import RunnablePassthrough

prompt = PromptTemplate(
    input_variables=["context", "messages"],
    template="""<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

{context}
-----------------
{messages}
<|assistant|>""")

# init the retriver chain

document_combining_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = (
    RunnablePassthrough
    .assign(context=query_transforming_retriever_chain)
    .assign(answer=document_combining_chain)
)

write the conversation helper class for easier testing.

In [63]:
class Conversation:
    def __init__(self):
        self.messages = []

    def add_assistant_message(self, message):
        self.messages.append(('assistant', message))

    def add_user_message(self, message):
        self.messages.append(('user', message))

    def get_messages(self):
        formatted_messages = "\n".join([f"{role.capitalize()}: {msg}" for role, msg in self.messages])
        return formatted_messages

    def chat(self, message):
        self.add_user_message(message)
        messages = self.get_messages()
        response = retrieval_chain.invoke({'messages': messages})['answer']
        self.add_assistant_message(response)
        return response

## Test

talk with the RAG to see how good it performs.

In [64]:
c = Conversation()
A = c.chat('give me a cool gangster movie')
print(A)

QUERY: <|system|>You are a helpful assistant.
User: give me a cool gangster movie
<|user|>
give me the search query about the above conversation.
<|assistant|>
"recommend a cool gangster movie based on a conversation between a user and an assistant discussing the topic"
<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

title: Q&A
release_year: 1990
genres: ['Crime', 'Drama', 'Thriller']
first_page_summary: Dirty cop, Mike Brennan thinks he got away with murder. But during a routine Q&A, the righteous assistant DA finds a clue that sets them both on a collision course.
rating: 6.6
directors: ['Sidney Lumet']
stars: ['Nick Nolte', 'Timothy Hutton', 'Armand Assante']

title: The Conversation
release_year: 1974
genres: ['Drama', 'Mystery', 'Thriller']
first_page_summary: A paranoid, secretive surveillance expert has a crisis of conscience when he suspects that the couple he is spying on will be murdered.
rating: 7.7
directors: ['Francis Ford Coppola']
stars

In [56]:
A = c.chat('give me a newer one')
print(A)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


QUERY: <|system|>You are a helpful assistant.
User: give me a cool gangster movie
Assistant: <|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

title: Q&A
release_year: 1990
genres: ['Crime', 'Drama', 'Thriller']
first_page_summary: Dirty cop, Mike Brennan thinks he got away with murder. But during a routine Q&A, the righteous assistant DA finds a clue that sets them both on a collision course.
rating: 6.6
directors: ['Sidney Lumet']
stars: ['Nick Nolte', 'Timothy Hutton', 'Armand Assante']

title: The Conversation
release_year: 1974
genres: ['Drama', 'Mystery', 'Thriller']
first_page_summary: A paranoid, secretive surveillance expert has a crisis of conscience when he suspects that the couple he is spying on will be murdered.
rating: 7.7
directors: ['Francis Ford Coppola']
stars: ['Gene Hackman', 'John Cazale', 'Allen Garfield']

title: American Gangster
release_year: 2006–2009
genres: ['Documentary']
first_page_summary: Follows the lives of American ga