# My RAG project

## pip install

In [1]:
#%pip install langchain
#%pip install huggingface_hub
#%pip install sentence_transformers
#%pip install sentence-transformers
#%pip install faiss-cpu
#%pip install unstructured
#%pip install chromadb
#%pip install Cython
#%pip install tiktoken
#%pip install unstructured[local-inference]

## Alice in wonderland


In this first part, we will test our code with a .txt document: "alice_in_wonderland.txt"

In [None]:
import os
from getpass import getpass

# HuggingFaceHub reads its credentials from the HUGGINGFACEHUB_API_TOKEN
# environment variable. Ask for the token only when it is not already set, so
# the secret never has to be written into the notebook itself.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [3]:
import requests

# Download the plain-text copy of "Alice in Wonderland" used as the test corpus.
url = "https://gist.githubusercontent.com/phillipj/4944029/raw/75ba2243dd5ec2875f629bf5d79f6c1e4b5a8b46/alice_in_wonderland.txt"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the corpus.
res.raise_for_status()
with open("alice_in_wonderland.txt", "w") as f:
    f.write(res.text)

## Document Loader

In [4]:
# Document Loader
# Read the downloaded text file through LangChain's TextLoader.
from langchain.document_loaders import TextLoader
loader = TextLoader('./alice_in_wonderland.txt')
# `documents` is indexed like a sequence further down (documents[0]) and is
# the input of the text splitter.
documents = loader.load()

In [5]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    The text is cut on '\n', every resulting line is wrapped on its own with
    textwrap.fill, and the wrapped pieces are glued back together with '\n'.

    Parameters
    ----------
    text : str
        Text to wrap.
    width : int, optional
        Maximum line width handed to textwrap.fill (default 110).

    Returns
    -------
    str
        The wrapped text.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

# Preview the string form of the first loaded document, wrapped at the default width (110).
print(wrap_text_preserve_newlines(str(documents[0])))

page_content='Alice\'s Adventures in Wonderland\n\n                ALICE\'S ADVENTURES IN WONDERLAND\n\n
Lewis Carroll\n\n               THE MILLENNIUM FULCRUM EDITION 3.0\n\n\n\n\n
CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n  Alice was beginning to get very tired of
sitting by her sister\non the bank, and of having nothing to do:  once or twice she had\npeeped into the book
her sister was reading, but it had no\npictures or conversations in it, `and what is the use of a
book,\'\nthought Alice `without pictures or conversation?\'\n\n  So she was considering in her own mind (as
well as she could,\nfor the hot day made her feel very sleepy and stupid), whether\nthe pleasure of making a
daisy-chain would be worth the trouble\nof getting up and picking the daisies, when suddenly a White\nRabbit
with pink eyes ran close by her.\n\n  There was nothing so VERY remarkable in that; nor did Alice\nthink it so
VERY much out of the way to hear the Rabbit say to\nitself, `Oh dear!

## Text splitter

In [6]:
# Text Splitter
# Cut the loaded documents into chunks (chunk_size=1300, no overlap between
# chunks); these chunks are what the vector store cell embeds and indexes.
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1300,chunk_overlap=0)
docs = text_splitter.split_documents(documents)

## Embeddings

In [7]:
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# No model is named here, so whatever default the class ships with is used.
embeddings = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


## Vectorstore

In [8]:
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS

# Embed the chunks in `docs` with `embeddings` and store them in a FAISS index;
# `db` is the object queried with similarity_search in the question cells.
db = FAISS.from_documents(docs, embeddings)

## QA chain and LLM

In [44]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

# Three Hugging Face Hub models compared on the same questions.
# llm1 is configured with temperature 0, llm2 and llm3 with temperature 0.7;
# all three pass max_length=512.
llm1=HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0, "max_length":512})
llm2=HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"temperature":0.7, "max_length":512})
llm3=HuggingFaceHub(repo_id="google/gemma-7b-it", model_kwargs={"temperature":0.7, "max_length":512})

In [45]:
# One question-answering chain per model, all built with chain_type="stuff".
#google/flan-t5-large
chain1 = load_qa_chain(llm1, chain_type="stuff")
#mistralai/Mistral-7B-Instruct-v0.1
chain2 = load_qa_chain(llm2, chain_type="stuff")
#google/gemma-7b-it
chain3 = load_qa_chain(llm3, chain_type="stuff")

In [42]:
# First question: retrieve the matching chunks, then answer with chain1.
query = "What animal does Alice encounter right at the beginning of her adventure in Wonderland?"
# NOTE(review): this rebinds `docs`, which until now held the chunks produced
# by the text splitter. Re-running the FAISS cell after this one would index
# only these search results.
docs = db.similarity_search(query)
print("google/flan-t5-large")
chain1.run(input_documents=docs, question=query)


google/flan-t5-small


'White Rabbit'

In [52]:
print("mistralai/Mistral-7B-Instruct-v0.1")
raw_answer = chain2.run(input_documents=docs, question=query)
# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw text
# instead of raising IndexError.
answer_parts = raw_answer.split("Helpful Answer: ")
(answer_parts[1] if len(answer_parts) > 1 else answer_parts[0]).split("\n\n")[0]

mistralai/Mistral-7B-Instruct-v0.1


'Alice encounters a White Rabbit with pink eyes at the beginning of her adventure in Wonderland.'

In [55]:
print("google/gemma-7b-it")
raw_answer = chain3.run(input_documents=docs, question=query)
# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw text
# instead of raising IndexError.
answer_parts = raw_answer.split("Helpful Answer: ")
(answer_parts[1] if len(answer_parts) > 1 else answer_parts[0]).split("\n\n")[0]

google/gemma-7b-it


'I do not know. I have not been able to understand the text well enough to answer this question.'

In [53]:
# Second question: same flow as the first one (retrieve, then answer with chain1).
query = "What prompts Alice to follow the White Rabbit down the rabbit-hole?"
# NOTE(review): `docs` is rebound to the search results again, still shadowing
# the chunks produced by the text splitter.
docs = db.similarity_search(query)
print("google/flan-t5-large")
chain1.run(input_documents=docs, question=query)

google/flan-t5-large


'A watch'

In [56]:
print("mistralai/Mistral-7B-Instruct-v0.1")
raw_answer = chain2.run(input_documents=docs, question=query)
# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw text
# instead of raising IndexError.
answer_parts = raw_answer.split("Helpful Answer: ")
(answer_parts[1] if len(answer_parts) > 1 else answer_parts[0]).split("\n\n")[0]

mistralai/Mistral-7B-Instruct-v0.1


"Alice is prompted to follow the White Rabbit down the rabbit-hole because she is curious about the rabbit's behavior, particularly its habit of checking its watch and hurrying. When the rabbit suddenly disappears into a rabbit-hole, Alice's curiosity is further piqued and she decides to follow it down."

In [57]:
print("google/gemma-7b-it")
raw_answer = chain3.run(input_documents=docs, question=query)
# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw text
# instead of raising IndexError.
answer_parts = raw_answer.split("Helpful Answer: ")
(answer_parts[1] if len(answer_parts) > 1 else answer_parts[0]).split("\n\n")[0]

google/gemma-7b-it


"I do not know. I have not been able to understand the text well enough to answer this question.'\n\n"

# PDF files

Now we will make our code work with several PDF files: the Good_luck one and a small database of PDF files.

In [17]:
import os

# Path to the local folder containing the PDF files
# NOTE(review): absolute, machine-specific path; this cell only works where
# that folder exists.
pdf_folder_path_GL = 'C:/Users/guill/OneDrive/Documents/ENSEA/2eme année/SEMESTRE 8/Option/RAG_2/PDF_GOOD_LUCK'

# List of the files and folders found in the PDF_GOOD_LUCK folder
pdf_files_GL = os.listdir(pdf_folder_path_GL)

# The folder is listed a second time so the listing is shown as the cell output.
os.listdir(pdf_folder_path_GL)


['GoodLuck.pdf']

## Loaders

In [19]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

Good luck

In [20]:
# One loader per PDF in the Good Luck folder.
# os.listdir returns every entry (sub-folders and non-PDF files included) in
# arbitrary order, so keep only the *.pdf names and sort them to make the
# loader order reproducible.
loaders_GL = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path_GL, fn))
    for fn in sorted(os.listdir(pdf_folder_path_GL))
    if fn.lower().endswith(".pdf")
]
loaders_GL

[<langchain_community.document_loaders.pdf.UnstructuredPDFLoader at 0x2583e7caa50>]

## Vector Store

In [22]:
# Build a vector-store index from everything the Good Luck loaders return,
# with the same embedding class and splitter settings (chunk_size=1300,
# chunk_overlap=0) as the Alice section.
index_GL = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1300, chunk_overlap=0)).from_loaders(loaders_GL)

## The 3 LLM

In [24]:
# NOTE(review): same arguments as the llm1 created in the "QA chain and LLM" cell of the Alice section.
llm1=HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0, "max_length":512})

In [33]:
# NOTE(review): same arguments as the llm2 created in the "QA chain and LLM" cell of the Alice section.
llm2=HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"temperature":0.7, "max_length":512})

In [26]:
# NOTE(review): same arguments as the llm3 created in the "QA chain and LLM" cell of the Alice section.
llm3=HuggingFaceHub(repo_id="google/gemma-7b-it", model_kwargs={"temperature":0.7, "max_length":512})

## RetrievalQA

Good Luck

In [34]:
from langchain.chains import RetrievalQA
# One RetrievalQA chain per model over the Good Luck index. Each chain takes
# its input under the "question" key and also returns the source documents.
#google/flan-t5-large (llm1)
chain1_GL = RetrievalQA.from_chain_type(llm=llm1, 
                                    chain_type="stuff", 
                                    retriever=index_GL.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

#mistralai/Mistral-7B-Instruct-v0.1 (llm2)
chain2_GL = RetrievalQA.from_chain_type(llm=llm2, 
                                    chain_type="stuff", 
                                    retriever=index_GL.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

#google/gemma-7b-it (llm3)
chain3_GL = RetrievalQA.from_chain_type(llm=llm3, 
                                    chain_type="stuff", 
                                    retriever=index_GL.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

Flan-t5-large

In [None]:
# Ask the llm1 chain (google/flan-t5-large); the returned dict is shown as the cell output.
chain1_GL('What the text is about?')

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


{'question': 'What the text is about?',
 'result': '                                                                                                                                                                                                                                                               ',
 'source_documents': [Document(page_content='இதயம் என் பது இஷ் காவில் உள்ள அம்பு\n\nइश्क़ में दिल फ़ना है, oh-oh-oh\n\nநீ க்கவும் அல்லது உருவாக்கவும்\n\nमैंने तुझको चुना है, oh-oh-oh\n\nكبولسأ\n\nيدتريو\n\n،كناولأ\n\nلك\n\nيدتري\n\nतेरा हुआ मैं सब को छोड़ के , oh-oh-oh\n\n"కొలవడం ద్వా రా, రహస్యా లను బహిర గతం చేయడం ద్వా రా ప్రేమలో పడకండి."\n\nआया हूँ मैं सब को बोल के , oh-oh-oh\n\nÔi em đã đến bên anh, sức lực của anh đã cạn kiệt\n\nਤੇਰਾ ਹੋਇਆ ਮੈਂ, ਯਾਰ ਵੇ, ਭ\n\nੁੱਲਿਆ ਏ ਸੰਸਾਰ ਵੇ\n\nΤο κουπί σου έφυγε, η δύναμή σου έφυγε\n\nΈγινα δικός σου, φίλε μου, ξέχασα αυτόν τον κόσμο\n\nΈφυγα από τον κόσμο για σένα, ένωσα την καρδιά μου μαζί σου\n\nअब तेरा मैं तो हो गया, पाके तुझे मैं खो गया\n\nന

Mistral-7B-Instruct-v0.1

In [38]:
# Run the chain; the output dict is read below through its 'question',
# 'result' and 'source_documents' keys.
output = chain2_GL('What the text is about?')
#output = chain2_GL('in what context do you think "இதயம் என் பது இஷ் காவில் உள்ள அம்பு" is said ?')

# Collect the source of every returned document without duplicates.
# dict.fromkeys keeps first-seen order, so the SOURCE numbering is the same on
# every run (a set of strings has no guaranteed iteration order).
unique_sources = list(dict.fromkeys(doc.metadata['source'] for doc in output['source_documents']))

# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw result
# instead of raising IndexError.
result_parts = output['result'].split("Helpful Answer: ")
answer = (result_parts[1] if len(result_parts) > 1 else result_parts[0]).split("\n\n")[0]

# Print the question, the extracted answer and the numbered sources.
print("'QUESTION': '" + output['question'] + "'")
print("'RESULT': '" + answer + "'")
for n, source in enumerate(unique_sources, start=1):
    print("'SOURCE " + str(n) + "': '" + source + "'")


Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


'QUESTION': 'What the text is about?'
'RESULT': 'The text is about the story of a boy who loves a girl and the various emotions he experiences as he falls in love with her. The story is told through songs and various poems.'
'SOURCE 1': 'C:/Users/guill/OneDrive/Documents/ENSEA/2eme année/SEMESTRE 8/Option/RAG_2/PDF_GOOD_LUCK\GoodLuck.pdf'


Gemma-7b-it

In [58]:
# Run the chain; the output dict is read below through its 'question',
# 'result' and 'source_documents' keys.
output = chain3_GL('What the text is about?')
#output = chain3_GL('in what context do you think "இதயம் என் பது இஷ் காவில் உள்ள அம்பு" is said ?')

# Collect the source of every returned document without duplicates.
# dict.fromkeys keeps first-seen order, so the SOURCE numbering is the same on
# every run (a set of strings has no guaranteed iteration order).
unique_sources = list(dict.fromkeys(doc.metadata['source'] for doc in output['source_documents']))

# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw result
# instead of raising IndexError.
result_parts = output['result'].split("Helpful Answer: ")
answer = (result_parts[1] if len(result_parts) > 1 else result_parts[0]).split("\n\n")[0]

# Print the question, the extracted answer and the numbered sources.
print("'QUESTION': '" + output['question'] + "'")
print("'RESULT': '" + answer + "'")
for n, source in enumerate(unique_sources, start=1):
    print("'SOURCE " + str(n) + "': '" + source + "'")


Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


'QUESTION': 'What the text is about?'
'RESULT': 'I don't know. I haven't enough information about the text to understand its content.'
'SOURCE 1': 'C:/Users/guill/OneDrive/Documents/ENSEA/2eme année/SEMESTRE 8/Option/RAG_2/PDF_GOOD_LUCK\GoodLuck.pdf'


## Test on my own database

In [None]:
import os

# Path to the local folder containing the PDF files
# NOTE(review): absolute, machine-specific path; this cell only works where
# that folder exists.
pdf_folder_path_DB = 'C:/Users/guill/OneDrive/Documents/ENSEA/2eme année/SEMESTRE 8/Option/RAG_2/PDF_DATABASE'

# List of the files and folders found in the PDF_DATABASE folder
pdf_files_DB = os.listdir(pdf_folder_path_DB)

# The folder is listed a second time so the listing is shown as the cell output.
os.listdir(pdf_folder_path_DB)

## Loaders

In [None]:
# One loader per PDF in the database folder.
# os.listdir returns every entry (sub-folders and non-PDF files included) in
# arbitrary order, so keep only the *.pdf names and sort them to make the
# loader order reproducible.
loaders_DB = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path_DB, fn))
    for fn in sorted(os.listdir(pdf_folder_path_DB))
    if fn.lower().endswith(".pdf")
]
loaders_DB

## Vector Store

In [None]:
# Build a vector-store index from everything the database loaders return,
# with the same embedding class and splitter settings (chunk_size=1300,
# chunk_overlap=0) as the Good Luck index.
index_DB = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1300, chunk_overlap=0)).from_loaders(loaders_DB)

## LLM

In [None]:
# NOTE(review): same arguments as the llm1 created in the earlier sections of this notebook.
llm1=HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0, "max_length":512})

In [None]:
# NOTE(review): same arguments as the llm2 created in the earlier sections of this notebook.
llm2=HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"temperature":0.7, "max_length":512})

In [None]:
# NOTE(review): same arguments as the llm3 created in the earlier sections of this notebook.
llm3=HuggingFaceHub(repo_id="google/gemma-7b-it", model_kwargs={"temperature":0.7, "max_length":512})

## RetrievalQA

In [None]:
from langchain.chains import RetrievalQA
# One RetrievalQA chain per model over the PDF database index. Each chain takes
# its input under the "question" key and also returns the source documents.
#google/flan-t5-large (llm1)
chain1_DB = RetrievalQA.from_chain_type(llm=llm1, 
                                    chain_type="stuff", 
                                    retriever=index_DB.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

#mistralai/Mistral-7B-Instruct-v0.1 (llm2)
chain2_DB = RetrievalQA.from_chain_type(llm=llm2, 
                                    chain_type="stuff", 
                                    retriever=index_DB.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

#google/gemma-7b-it (llm3)
chain3_DB = RetrievalQA.from_chain_type(llm=llm3, 
                                    chain_type="stuff", 
                                    retriever=index_DB.vectorstore.as_retriever(), 
                                    input_key="question",
                                    return_source_documents=True)

In [None]:
# Ask the llm1 chain (google/flan-t5-large); the returned dict is shown as the cell output.
# NOTE(review): "telegrapher~s" looks like a typo for "telegrapher's"; confirm before changing the query.
chain1_DB('What is the telegrapher~s equation')

In [None]:
# Run the chain; the output dict is read below through its 'question',
# 'result' and 'source_documents' keys.
output = chain2_DB('What is RBRM?')

# Collect the source of every returned document without duplicates.
# dict.fromkeys keeps first-seen order, so the SOURCE numbering is the same on
# every run (a set of strings has no guaranteed iteration order).
unique_sources = list(dict.fromkeys(doc.metadata['source'] for doc in output['source_documents']))

# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw result
# instead of raising IndexError.
result_parts = output['result'].split("Helpful Answer: ")
answer = (result_parts[1] if len(result_parts) > 1 else result_parts[0]).split("\n\n")[0]

# Print the question, the extracted answer and the numbered sources.
print("'QUESTION': '" + output['question'] + "'")
print("'RESULT': '" + answer + "'")
for n, source in enumerate(unique_sources, start=1):
    print("'SOURCE " + str(n) + "': '" + source + "'")


In [None]:
# NOTE(review): this cell repeats the previous one (same chain2_DB, same
# question). Presumably chain3_DB was intended here, as in the Good Luck
# section; confirm before changing it.
# Run the chain; the output dict is read below through its 'question',
# 'result' and 'source_documents' keys.
output = chain2_DB('What is RBRM?')

# Collect the source of every returned document without duplicates.
# dict.fromkeys keeps first-seen order, so the SOURCE numbering is the same on
# every run (a set of strings has no guaranteed iteration order).
unique_sources = list(dict.fromkeys(doc.metadata['source'] for doc in output['source_documents']))

# Keep the text between the first "Helpful Answer: " marker and the next blank
# line. When the marker is absent, fall back to the start of the raw result
# instead of raising IndexError.
result_parts = output['result'].split("Helpful Answer: ")
answer = (result_parts[1] if len(result_parts) > 1 else result_parts[0]).split("\n\n")[0]

# Print the question, the extracted answer and the numbered sources.
print("'QUESTION': '" + output['question'] + "'")
print("'RESULT': '" + answer + "'")
for n, source in enumerate(unique_sources, start=1):
    print("'SOURCE " + str(n) + "': '" + source + "'")


In [None]:
from gradio import Interface

def chat_with_model(question):
    """Answer `question` with the chain3_DB chain and list the sources used.

    Parameters
    ----------
    question : str
        Question text passed straight to chain3_DB.

    Returns
    -------
    str
        The extracted answer, a blank line, then one
        "'SOURCE n': '<source>'" line per distinct source, numbered in the
        order the source documents were returned.
    """
    # Call the chain with the given question
    output = chain3_DB(question)

    # Keep the text between the first "Helpful Answer: " marker and the next
    # blank line. When the marker is absent, fall back to the start of the raw
    # result instead of raising IndexError.
    result_parts = output['result'].split("Helpful Answer: ")
    answer = (result_parts[1] if len(result_parts) > 1 else result_parts[0]).split("\n\n")[0]

    # De-duplicate the sources while keeping first-seen order, so the
    # numbering is stable between calls (a set has no guaranteed order).
    sources = dict.fromkeys(doc.metadata['source'] for doc in output['source_documents'])

    # The previous version left an unmatched "'" after the answer and an
    # unmatched "}" after each source; both are dropped here.
    lines = [answer, ""]
    lines.extend(
        "'SOURCE " + str(n) + "': '" + source + "'"
        for n, source in enumerate(sources, start=1)
    )
    return "\n".join(lines) + "\n"

# Gradio UI: one text input and one text output, backed by chat_with_model.
iface = Interface(fn=chat_with_model,
                  inputs="text", 
                  outputs="text",
                  title="My PDF RAG",
                  theme="soft",
                  description="Ask a question about your PDF documents")
# NOTE(review): share=True requests a publicly reachable link from Gradio;
# confirm that exposing question answering over the local PDFs is intended.
iface.launch(debug=True,
             share=True)#share=True