# RAG

## Requirements

In [1]:
%%capture
!pip install transformers accelerate bitsandbytes langchain langchain-community sentence-transformers faiss-gpu pandas gdown

## Dataset

In [2]:
!gdown --fuzzy https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI&confirm=t&uuid=3a67c0cf-5e02-485b-a6c0-9b92bb41d894

Downloading...
From (original): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI
From (redirected): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI&confirm=t&uuid=2ba91cf5-f4e4-45a7-bbb1-7387db406523
To: /content/IMDB_crawled.json
100% 292M/292M [00:03<00:00, 89.5MB/s]


## Config

In [3]:
class Config:
    EMBEDDING_MODEL_NAME="thenlper/gte-base"
    LLM_MODEL_NAME="HuggingFaceH4/zephyr-7b-beta"
    K = 5 # top K retrieval

## Preprocessing

In [4]:
!pip install nltk==3.8.1
import pandas as pd
import nltk
nltk.download('stopwords')


df = pd.read_json('IMDB_crawled.json')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
import os

os.makedirs('data', exist_ok=True)

# preprocess your data and only store the needed data as the context window for embedding model is limited


import re
from nltk.corpus import stopwords

os.makedirs('data', exist_ok=True)

df = df[['id', 'title','first_page_summary', 'genres']]

#stop_words = set(stopwords.words('english'))

#def preprocess(text):
#    text =  (str(text).lower())

#    words = text.split()
#    filtered_words = [word for word in words if word not in stop_words]
#    return ' '.join(filtered_words)


#df = df.applymap(preprocess)


df.to_csv('data/imdb.csv', index=False)

## Vectorizer

load the CSV file and vectorize the rows using HuggingFaceEmbeddings.
Store the results using FAISS vectorstore.
Save the vectorestore in a pickle file for future usages.

In [6]:
import pickle
import numpy as np
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.faiss import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings

# load the csv
csv_loader = CSVLoader(file_path="data/imdb.csv",encoding="utf-8")
documents = csv_loader.load()



# load the embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name="thenlper/gte-base",encode_kwargs={"normalize_embeddings": True})


# save embed the documents using the model in a vectorstore
vectorstore = FAISS.from_documents(documents, embeddings_model)


with open("data/vectorstore.pkl", "wb") as f:
     pickle.dump(vectorstore, f)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

load the vectorstore as a retriever.

In [7]:
with open("data/vectorstore.pkl", "rb") as f:


    vectorstore = pickle.load(f)
retriever =  vectorstore.as_retriever(top_k=5)

## LLM

load the quantized LLM.

In [8]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline


# load the quantization config
bnb_config = BitsAndBytesConfig()

model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", quantization_config=bnb_config, device_map="cuda:0")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# init the pipeline
READER_LLM = pipeline("text-generation", model=model, tokenizer=tokenizer,max_length=2000)

llm = HuggingFacePipeline(
    pipeline=READER_LLM,
)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

  warn_deprecated(


initialize the prompt template for the query chain. query chain is used to get a query from the chat history. you may change the prompt as you like to get better results.

In [9]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import LLMChain

class LoggerStrOutputParser(StrOutputParser):
    def parse(self, text: str) -> str:
        # process the LLM output
        print(f"QUERY: {text}")
        return text

query_transform_prompt = PromptTemplate(
    input_variables=["messages"],
    template="""<|system|>You are a helpful assistant.
{messages}
<|user|>
give me the search query about the above conversation.
<|assistant|>"""
)

# init the query chain
query_transforming_retriever_chain = LLMChain(
    prompt=query_transform_prompt,
    output_parser=LoggerStrOutputParser(),
    llm =llm
)

  warn_deprecated(


initialize the main retrieval chain that gives the resulting documents to LLM and gets the output back.

In [10]:
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain_core.runnables import RunnablePassthrough

prompt = PromptTemplate(
    input_variables=["context", "messages"],
    template="""<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

{context}
-----------------
{messages}
<|assistant|>""")

# init the retriver chain


context_chain = (
    {"context": retriever, "messages": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


write the conversation helper class for easier testing.

In [11]:
class Conversation:
    def __init__(self):
        self.messages = []

    def add_assistant_message(self, message):
        self.messages.append(('assistant', message))

    def add_user_message(self, message):
        self.messages.append(('user', message))

    def get_messages(self):

        # concatenate the messages with the roles in the instruction format

        lines = []

        for role, msg in self.messages:
            lines.append(f"{role}: {msg}")

        return "\n".join(lines)


    def chat(self, message):
        self.add_user_message(message)
        messages = self.get_messages()
        # invoke the chain
        response = context_chain.invoke(messages)
        self.add_assistant_message(response)
        return response

## Test

talk with the RAG to see how good it performs.

In [12]:
c = Conversation()
A = c.chat('give me a cool gangster movie')
print(A)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

[Document(page_content="id: tt0765429\ntitle: American Gangster\nfirst_page_summary: An outcast New York City cop is charged with bringing down Harlem drug lord Frank Lucas, whose real life inspired this partly biographical film.\ngenres: ['Biography', 'Crime', 'Drama']", metadata={'source': 'data/imdb.csv', 'row': 2433}), Document(page_content='id: tt0102603\ntitle: Oscar\nfirst_page_summary: A gangster attempts to keep the promise he made to his dying father: that he would give up his life of crime and "go straight".\ngenres: [\'Comedy\', \'Crime\']', metadata={'source': 'data/imdb.csv', 'row': 9856}), Document(page_content='id: tt1166168\ntitle: Al imbrator\nfirst_page_summary: A remake of "scarface",tells the turbulant world of drug dealing,damaged personal relationships,and the rise and the fall of a drug lord.\ngenres: [\'Crime\', \'Drama\']', metadata={'source': 'data/imdb.csv', 'row': 4197}), Doc

In [13]:
A = c.chat('give me a newer one')
print(A)

<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

[Document(page_content="id: tt0765429\ntitle: American Gangster\nfirst_page_summary: An outcast New York City cop is charged with bringing down Harlem drug lord Frank Lucas, whose real life inspired this partly biographical film.\ngenres: ['Biography', 'Crime', 'Drama']", metadata={'source': 'data/imdb.csv', 'row': 2433}), Document(page_content="id: tt0026393\ntitle: 'G' Men\nfirst_page_summary: James Cagney helped jump-start the gangster genre as The Public Enemy. Outcries against movies that glorified underworld criminals put Cagney on the side of the law in 'G' Men.\ngenres: ['Crime', 'Drama', 'Film-Noir']", metadata={'source': 'data/imdb.csv', 'row': 4203}), Document(page_content='id: tt1166168\ntitle: Al imbrator\nfirst_page_summary: A remake of "scarface",tells the turbulant world of drug dealing,damaged personal relationships,and the rise and the fall of a drug lord.\ngenres: [\'Crime\', \'Drama\'