# RAGnificent
A magnificent RAG (Retrieval-Augmented Generation) pipeline, built as the final project for the IBM specialization "Generative AI Engineering with LLMs".

In [None]:
from pathlib import Path
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredMarkdownLoader,
    JSONLoader,
    WebBaseLoader,
    TextLoader
)
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [None]:
# Input documents and model identifiers shared by the cells below.
# Local paths are assembled with pathlib so they resolve on every OS; plain
# backslash-separated literals only work on Windows and rely on invalid
# string escape sequences (a SyntaxWarning on recent Python versions).

# Used:
facebook_chat_json_path = Path("documents") / "json" / "facebook_chat.json"
# These two stay plain strings (their original type) for the loaders below.
markdown_sample_path = str(Path("documents") / "markdown" / "markdown-sample.md")
lora_paper_pdf_path = str(Path("documents") / "pdf" / "LoRA_paper.pdf")
langchain_url = 'https://www.ibm.com/topics/langchain'
new_policies_txt_path = Path("documents") / "txt" / "new_policies.txt"

# Unused:
# mlb_teams_csv_path = str(Path("documents") / "csv" / "mlb_teams_2012.csv")
# large_scale_alignment_pdf_path = str(Path("documents") / "pdf" / "large_scale_alignment.pdf")

llm_model_id = 'mistralai/mixtral-8x7b-instruct-v01'
embedding_model_id = 'sentence-transformers/all-mpnet-base-v2'

## Task 1 - Load documents from different sources using LangChain

### PDF

In [None]:
# Load the PDF at `lora_paper_pdf_path` with PyMuPDFLoader and print the first
# loaded item as a quick sanity check.
pdf_loader = PyMuPDFLoader(lora_paper_pdf_path)
pdf_data = pdf_loader.load()  # indexed below and iterated page by page in Task 2
print(pdf_data[0])

### Markdown

In [None]:
# Load the sample Markdown file at `markdown_sample_path`.
md_loader = UnstructuredMarkdownLoader(markdown_sample_path)
md_data = md_loader.load()
# Uncomment to inspect the first loaded item:
# print(md_data[0])

### JSON

In [None]:
# Load the Facebook chat export; the jq expression selects the `content`
# field of every entry under `messages`.
json_loader = JSONLoader(
    file_path=facebook_chat_json_path,
    jq_schema='.messages[].content',
    # NOTE(review): presumably lets non-string values through - confirm
    # against the JSONLoader documentation.
    text_content=False)

json_data = json_loader.load()
# Uncomment to inspect the first loaded item:
# print(json_data[0])

### Web

In [None]:
# Load the web page at `langchain_url` (presumably requires network access).
web_loader = WebBaseLoader(langchain_url)
web_data = web_loader.load()
# Uncomment to inspect the first loaded item:
# print(web_data[0])

### Text

In [None]:
# Load the plain-text policies file at `new_policies_txt_path`.
# NOTE(review): no encoding is passed, so the loader's default applies -
# confirm it matches the file.
txt_loader = TextLoader(new_policies_txt_path)
txt_data = txt_loader.load()
# Uncomment to inspect the first loaded item:
# print(txt_data[0])

## Task 2 - Apply text splitting techniques

### Recursive Character Text Splitter - On PDF file content

In [None]:
# Split the text of every PDF page into chunks of at most 100 characters
# (measured with `len`), with a 20-character overlap between chunks.
rc_text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=100,
)
chunks = rc_text_splitter.create_documents(
    [pdf_page.page_content for pdf_page in pdf_data]
)
# Keep the raw chunk text alongside the chunk objects.
chunks_content = [piece.page_content for piece in chunks]
print(f"Number of chunks created from PDF: {len(chunks)}")
print(f"First two chunks' content:\n{chunks_content[:2]}")

### Code Splitter on Python code snippet

In [None]:
# Sample Python source for the language-aware splitter below.
# The snippet starts at column 0 on purpose: that keeps it valid top-level
# Python and lets a "\ndef " separator match the function definition.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

In [None]:
# Build a splitter configured for Python source (50-character chunks, no
# overlap) and run it on the sample snippet.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=50,
    chunk_overlap=0,
    language=Language.PYTHON,
)
python_docs = python_splitter.create_documents(
    [PYTHON_CODE]
)
# Bare expression so the notebook displays the resulting documents.
python_docs

## Task 3 - Embed documents

In [None]:
# Embedding model used for both document chunks and queries; the model name
# comes from `embedding_model_id`.
huggingface_embedding = HuggingFaceEmbeddings(model_name=embedding_model_id)

In [None]:
# Embed the text of every PDF chunk.
# `embed_documents` takes a list of strings, so pass the chunk texts in
# `chunks_content` rather than the chunk objects in `chunks`.
chunks_embeddings = huggingface_embedding.embed_documents(chunks_content)
print(f"First 5 embeddings for the chunks:\n{chunks_embeddings[:5]}")

## Task 4 - Create and configure vector databases to store embeddings

In [None]:
# One string id per chunk ("0", "1", ...), in chunk order.
ids = list(map(str, range(len(chunks))))
# Embed the chunks with the HuggingFace model and store them in Chroma.
vectordb = Chroma.from_documents(
    chunks,
    huggingface_embedding,
    ids=ids,
)

## Task 5 - Develop a retriever to fetch document segments based on queries

## Task 6 - Construct a QA Bot that leverages LangChain and an LLM to answer questions

In [None]:
# Embed a sample query with the same embedding model used for the chunks.
query = "How are you?"

query_result = huggingface_embedding.embed_query(query)
# Bare expression so the notebook displays the first five entries of the result.
query_result[:5]