# RAGnificent
A magnificent RAG (Retrieval-Augmented Generation) pipeline, built as the final project for the IBM specialization "Generative AI Engineering with LLMs".

In [None]:
from pathlib import Path
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredMarkdownLoader,
    JSONLoader,
    WebBaseLoader,
    TextLoader
)
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Every local path is built with pathlib so it resolves on any OS.
# Backslash-separated literals such as 'documents\\pdf\\...' only work on Windows,
# and when written with single backslashes in a non-raw string they contain
# invalid escape sequences.
DOCUMENTS_DIR = Path("documents")

# Used:
facebook_chat_json_path = DOCUMENTS_DIR / "json" / "facebook_chat.json"
# These two stay plain `str` values (as before) so loaders that expect a string
# path keep receiving one.
markdown_sample_path = str(DOCUMENTS_DIR / "markdown" / "markdown-sample.md")
lora_paper_pdf_path = str(DOCUMENTS_DIR / "pdf" / "LoRA_paper.pdf")
langchain_url = 'https://www.ibm.com/topics/langchain'
new_policies_txt_path = DOCUMENTS_DIR / "txt" / "new_policies.txt"

# Unused:
# mlb_teams_csv_path = str(DOCUMENTS_DIR / "csv" / "mlb_teams_2012.csv")
# large_scale_alignment_pdf_path = str(DOCUMENTS_DIR / "pdf" / "large_scale_alignment.pdf")

# NOTE(review): `llm_model_id` is not referenced by any visible cell (`get_llm`
# carries its own default model id) — confirm whether it is still needed.
llm_model_id = 'mistralai/mixtral-8x7b-instruct-v01'
embedding_model_id = 'sentence-transformers/all-mpnet-base-v2'

## Task 1 - Load documents from different sources using LangChain

### PDF

In [3]:
# Load the LoRA paper PDF into a list of LangChain documents.
pdf_loader = PyMuPDFLoader(lora_paper_pdf_path)
pdf_data = pdf_loader.load()

# Stitch the text of every loaded document into one newline-separated string;
# the text-splitting cell works on this combined string.
pdf_text_content = '\n'.join(doc.page_content for doc in pdf_data)
print(pdf_text_content)

A Comprehensive Review of Low-Rank
Adaptation in Large Language Models for
Efficient Parameter Tuning
September 10, 2024
Abstract
Natural Language Processing (NLP) often involves pre-training large
models on extensive datasets and then adapting them for specific tasks
through fine-tuning. However, as these models grow larger, like GPT-3
with 175 billion parameters, fully fine-tuning them becomes computa-
tionally expensive. We propose a novel method called LoRA (Low-Rank
Adaptation) that significantly reduces the overhead by freezing the orig-
inal model weights and only training small rank decomposition matrices.
This leads to up to 10,000 times fewer trainable parameters and reduces
GPU memory usage by three times. LoRA not only maintains but some-
times surpasses fine-tuning performance on models like RoBERTa, De-
BERTa, GPT-2, and GPT-3.
Unlike other methods, LoRA introduces
no extra latency during inference, making it more efficient for practical
applications.
All relevant code an

### Markdown

In [None]:
# Load the sample Markdown file into LangChain documents.
md_loader = UnstructuredMarkdownLoader(file_path=markdown_sample_path)
md_data = md_loader.load()
# Uncomment to inspect the first loaded document:
# print(md_data[0])

### JSON

In [None]:
# The jq expression selects the `content` field of every entry under `messages`
# in the chat export.
json_loader = JSONLoader(
    jq_schema='.messages[].content',
    text_content=False,
    file_path=facebook_chat_json_path,
)
json_data = json_loader.load()
# Uncomment to inspect the first loaded document:
# print(json_data[0])

### Web

In [None]:
# Fetch the IBM LangChain topic page and load it as LangChain documents.
web_loader = WebBaseLoader(web_path=langchain_url)
web_data = web_loader.load()
# Uncomment to inspect the first loaded document:
# print(web_data[0])

### Text

In [None]:
# Load the plain-text policies file into LangChain documents.
txt_loader = TextLoader(file_path=new_policies_txt_path)
txt_data = txt_loader.load()
# Uncomment to inspect the first loaded document:
# print(txt_data[0])

## Task 2 - Apply text splitting techniques

### Recursive Character Text Splitter - On PDF file content

In [4]:
# Splitter configured with a chunk size of 400 and an overlap of 20, both
# measured with `len` (i.e. in characters).
rc_text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=400,
)

# `chunks` is reused by the embedding and vector-store cells further down.
chunks = rc_text_splitter.split_text(pdf_text_content)

# One print call with a newline separator emits the same two lines of output.
print(
    f"Number of chunks created from PDF: {len(chunks)}",
    f"First two chunks' content:\n{chunks[:2]}",
    sep="\n",
)

Number of chunks created from PDF: 73
First two chunks' content:
['A Comprehensive Review of Low-Rank\nAdaptation in Large Language Models for\nEfficient Parameter Tuning\nSeptember 10, 2024\nAbstract\nNatural Language Processing (NLP) often involves pre-training large\nmodels on extensive datasets and then adapting them for specific tasks\nthrough fine-tuning. However, as these models grow larger, like GPT-3', 'with 175 billion parameters, fully fine-tuning them becomes computa-\ntionally expensive. We propose a novel method called LoRA (Low-Rank\nAdaptation) that significantly reduces the overhead by freezing the orig-\ninal model weights and only training small rank decomposition matrices.\nThis leads to up to 10,000 times fewer trainable parameters and reduces']


### Code Splitter on Python code snippet

In [5]:
# Sample snippet fed to the Python-aware splitter.
# It is written at column 0 so it is valid top-level Python and so that the
# definition starts right after a newline, where a language-aware splitter can
# recognise it as a boundary.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

In [None]:
# Build a splitter that uses the Python-specific preset, with a chunk size of 50
# and no overlap between chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# Wrap the resulting pieces of the sample snippet in Document objects and
# display them as the cell output.
python_docs = python_splitter.create_documents(texts=[PYTHON_CODE])
python_docs

## Task 3 - Embed documents

In [6]:
# Embedding model shared by the chunk-embedding, vector-store and
# query-embedding cells.
huggingface_embedding = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)

  huggingface_embedding = HuggingFaceEmbeddings(model_name=embedding_model_id)
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Embed every PDF chunk, then print the vectors of the first five chunks.
chunks_embeddings = huggingface_embedding.embed_documents(texts=chunks)
print(
    f"First 5 embeddings for the chunks:\n{chunks_embeddings[:5]}"
)

First 5 embeddings for the chunks:
[[0.035864997655153275, 0.05806204676628113, -0.019342534244060516, 0.028478598222136497, 0.010876252315938473, 0.03260980173945427, -0.01797351986169815, 0.028705885633826256, -0.007086438592523336, -0.0628940761089325, -0.025137027725577354, -0.010378682985901833, -0.012204059399664402, 0.003963047172874212, 0.033678196370601654, -0.05547269061207771, 0.07979834079742432, -0.021819014102220535, -0.004911839962005615, -0.01717427186667919, -0.031394582241773605, 0.007547338958829641, -0.0020945644937455654, 0.018018919974565506, 0.057778891175985336, 0.0013638543896377087, 0.0023492504842579365, 0.0010475687449797988, 0.001231522997841239, 0.009324378333985806, -0.006343798246234655, 0.06647956371307373, 0.030796635895967484, 0.052936799824237823, 1.7924422763826442e-06, -0.04325321316719055, -0.04193982109427452, -0.009559628553688526, -0.014006862416863441, 0.021783683449029922, 0.020374689251184464, -0.01465480774641037, -0.024416940286755562, 0.0

## Task 4 - Create and configure vector databases to store embeddings

In [8]:
# Each chunk's position in `chunks`, rendered as a string, serves as its id.
ids = list(map(str, range(len(chunks))))

# Build the Chroma vector store from the chunk texts, embedding them with the
# shared embedding model.
vectordb = Chroma.from_texts(
    texts=chunks,
    embedding=huggingface_embedding,
    ids=ids,
)

## Task 5 - Develop a retriever to fetch document segments based on queries

In [9]:
# Expose the vector store through the retriever interface (default settings).
retriever = vectordb.as_retriever()

# Ask a broad question about the paper and fetch the matching chunks.
query = "What is this paper talking about?"
docs = retriever.invoke(query)

print(f"Number of documents retrieved: {len(docs)}")
# The bare expression renders the retrieved documents as the cell output.
docs

Number of documents retrieved: 4


[Document(metadata={}, page_content='isting weight matrices. As detailed in Section 4.2, LoRA is applied to the query\nand value matrices in most experiments. The number of trainable parameters\nis determined by the rank r and the shape of the original weight matrices:\n|Θ| = 2 × LLoRA × dmodel × r, where LLoRA represents the number of weight\nmatrices to which LoRA is applied.'),
 Document(metadata={}, page_content='descent. A variant of this is fine-tuning only select layers, while freezing the\nrest. One such baseline from prior work on GPT-2 updates only the last two\nlayers (denoted as FTTop2).\nBitFit is another baseline in which only the bias parameters are updated,\nwhile all other parameters remain frozen. This method has gained attention,\nincluding in recent studies [?].'),
 Document(metadata={}, page_content='to SQL (NL2SQL). Each downstream task is represented as a training set of\ncontext-output pairs:\nZ = {(xi, yi)}i=1,...,N,\nwhere both xi and yi are sequences of token

## Task 6 - Construct a QA Bot that leverages LangChain and an LLM to answer questions

In [None]:
def get_llm(model_id: str = "tiiuae/falcon-7b-instruct",  # can be swapped for another model
            max_new_tokens: int = 512,
            temperature: float = 0.7,
            device: int = 0  # negative for CPU, otherwise automatic placement
           ) -> HuggingFacePipeline:
    """Build a LangChain LLM backed by a local Hugging Face text-generation pipeline.

    Args:
        model_id: Hugging Face model identifier passed to ``from_pretrained``
            for both the tokenizer and the causal language model.
        max_new_tokens: Maximum number of tokens the pipeline may generate
            per call.
        temperature: Sampling temperature. A value ``<= 0`` switches the
            pipeline to greedy decoding (``do_sample=False``) instead of
            passing a non-positive temperature together with sampling.
        device: A negative value forces the model onto the CPU. Any other
            value keeps the previous behaviour of automatic device placement
            (``device_map="auto"``); a specific GPU index is not pinned.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline. Only the
        newly generated text is returned (``return_full_text=False``).
    """
    # Load tokenizer and model.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Honour an explicit CPU request; otherwise keep automatic placement so the
    # default call behaves exactly as before.
    device_map = "cpu" if device < 0 else "auto"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map)

    # Sampling requires a strictly positive temperature; fall back to greedy
    # decoding when the caller asks for deterministic output.
    if temperature > 0:
        generation_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        generation_kwargs = {"do_sample": False}

    # Create the text-generation pipeline.
    hf_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        **generation_kwargs,
    )

    # Wrap the pipeline so it can be used as a LangChain LLM.
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    return llm


In [None]:
# Note: this rebinds `query`, replacing the retrieval question set in the
# retriever cell.
query = "How are you?"

# Embed the single query string and show the first five values of its vector.
query_result = huggingface_embedding.embed_query(text=query)
query_result[:5]