## Presteps to Load llama3.2 On Colab

In [1]:
import tensorflow as tf
from psutil import virtual_memory

# Check GPU: list the GPU devices TensorFlow can see (an empty list means none were found)
gpu_info = tf.config.list_physical_devices('GPU')
print(f"GPU Info: {gpu_info}")

# Check RAM: virtual_memory().total is in bytes; dividing by 1024**3 converts it to GiB
ram_info = virtual_memory()
print(f"Total RAM: {ram_info.total / (1024**3)} GB")

2024-11-10 16:55:00.455010: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731257700.472894 2940661 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731257700.477065 2940661 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-10 16:55:00.490853: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU Info: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Total RAM: 15.489391326904297 GB


In [None]:
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
from IPython.display import clear_output

# Create a Python script to start the Ollama API server in a separate thread

import os
import threading
import subprocess
import requests
import json

def ollama():
    """Launch `ollama serve` as a child process.

    OLLAMA_HOST and OLLAMA_ORIGINS are written into this process's environment
    first; the child is started without an explicit `env`, so it inherits them.
    The Popen handle is not kept, and the call does not wait for the server.
    """
    server_env = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    os.environ.update(server_env)

    server_command = ["ollama", "serve"]
    subprocess.Popen(server_command)

# Start the server launcher on its own thread so this cell returns immediately.
# NOTE(review): subprocess.Popen does not block, so the thread may be unnecessary — confirm.
ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()


In [None]:
from IPython.display import clear_output
!ollama pull llama3.2:3b  & ollama pull nomic-embed-text

## Presteps to Load llama3.2 Locally

**Hardware Requirements** <br>
**CPU**: Multicore processor<br>
**RAM**: Minimum of 16 GB recommended<br>
**GPU**: NVIDIA RTX series (for optimal performance), at least 8 GB VRAM<br>

**Step1**:<br>
Download Ollama for your operating system from this site:<br>
https://ollama.com/download<br>
<br>
**Step2**:<br>
Open your terminal<br>
<br>
**Step3**:<br>
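Run the following commands in your terminal. `ollama serve` keeps running in the foreground, so run the `ollama pull` command in a second terminal:<br>
\$ ollama serve<br>
\$ ollama pull llama3.2:3b && ollama pull nomic-embed-text<br>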

## Load LlaMA3.2

In [None]:
!pip install -r requirements.txt

In [223]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# Ollama model tags for text generation and for embeddings.
MODEL = "llama3.2:3b"
EMBEDDING_MODEL = "nomic-embed-text"

# LLM client for the chat model.
model = Ollama(model=MODEL)

# Embedding client, passed to the vector store when documents are indexed.
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)



In [None]:
# Smoke test: send a single prompt to the model and print its reply.
print(model.invoke("Hi. Are you LlaMA, the language model?"))

## Part1 Standard RAG

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from chromadb.errors import InvalidDimensionException

#### INDEXING ####

# Load the source PDF.
PDF_PATH = "RAG_survey.pdf"
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Split the loaded documents into overlapping chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

# Embed
## NOTE: Chroma().delete_collection() must run before the vectorstore is built,
## so that documents loaded by a previous run are deleted first.
Chroma().delete_collection()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# Retriever with the vectorstore's default settings.
retriever = vectorstore.as_retriever()

# RAG prompt pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# The chains below refer to the Ollama model as `llm`.
llm = model




### (a) Chain the Components:

In [None]:
def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# Chain: retrieve and format the context for the question, fill the RAG prompt,
# query the LLM, and parse the reply into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the standard RAG chain a question; the cell displays the returned answer.
rag_chain.invoke("what is this paper about?")

### (b) Explain TextSplitter Settings

Discussion:

### (c) Experiment with Retriever Settings

In [None]:
## TODO: Try some different settings for the retriever and output some examples.
## You can change the question if you want.
## You can duplicate this cell to output different examples.
# Example setting: plain similarity search that returns only the top 3 chunks.
# (Passing a bare `...` to as_retriever raises a TypeError, since it only takes keyword arguments.)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retrived_docs = retriever.invoke("what is this paper about?")
for doc in retrived_docs:
    print()
    print(doc)

Discussion:

## Part2 Multi-Query RAG

### (a) Prompt Template for Multi-Query:

In [130]:
from langchain.prompts import ChatPromptTemplate

## Prompt template that asks the language model to rephrase the question from multiple perspectives.
# The chain that consumes this prompt splits the reply on newlines, so the model is asked
# for one question per line and nothing else.
template = """You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of distance-based similarity search. Provide these alternative questions separated by newlines, with no numbering or extra commentary. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

In [132]:
# Query generation: prompt -> LLM -> plain string -> list with one query per line.
# Blank lines in the reply are dropped (and surrounding whitespace stripped) so that
# no empty query is passed on to the retriever.
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
# You may generate some queries here to check whether they are diverse enough.
question = "What is this paper about?"
generate_queries.invoke({"question": question})

In [137]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieval results.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        A flat list containing each distinct document once, in the order in
        which it was first encountered.
    """
    # Serialize every document to a string so identical documents compare equal.
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys removes duplicates while keeping first-seen order; list(set(...))
    # would return the documents in an arbitrary order that can change between runs.
    unique_docs = list(dict.fromkeys(flattened_docs))
    # Deserialize back into document objects.
    return [loads(doc) for doc in unique_docs]

# Retrieve: generate several queries, run the retriever on each of them (retriever.map()),
# then merge the per-query results into a single list of unique documents.
question = "What is this paper about?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Note: this rebinds the name `docs`.
docs = retrieval_chain.invoke({"question":question})

### (b) Multi-Query RAG Chain: 

In [147]:
from operator import itemgetter
# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


# Multi-Query RAG chain. The input is a dict of the form {"question": "..."}.
#   - "context": retrieval_chain produces the unique retrieved documents, whose
#     page contents are joined into one string separated by blank lines.
#   - "question": the original question, taken from the input dict.
# Both values fill the prompt above, which is sent to the LLM and parsed to a string.
multi_query_rag_chain = (
    {
        "context": retrieval_chain
        | (lambda retrieved: "\n\n".join(doc.page_content for doc in retrieved)),
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)


### (c) Example Comparisons:

In [None]:
## TODO:  show a standard RAG output example alongside a multi-query RAG output example.
# Hint1: You may adjust the question to highlight the advantages of multi-query RAG over standard RAG.

Discussion:

## Part3 RAG Fusion

In [197]:
## Query-generation template (the assignment asks for the same template as Part 2).
# The reply is later split on newlines, so the model is asked for one question per line.
template = """You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of distance-based similarity search. Provide these alternative questions separated by newlines, with no numbering or extra commentary. Original question: {question}"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [198]:
# Query generation for RAG Fusion: prompt -> LLM -> plain string -> one query per line.
# Blank lines in the reply are dropped (and surrounding whitespace stripped) so that
# no empty query is passed on to the retriever.
generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
# Pass the question as a dict keyed by the prompt variable, like the other invoke calls.
generate_queries.invoke({"question": question})

### (a) Implement Reciprocal Rank Fusion (RRF)

In [213]:
def reciprocal_rank_fusion(results: list[list], c=60):
    """Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

    A document's fused score is the sum, over every list it appears in, of
    1 / (c + rank), where rank is its 1-based position in that list.

    Args:
        results: One ranked list of documents per query, best match first.
        c: Constant of the RRF formula (often written k). Larger values reduce
            the advantage of top-ranked documents. Defaults to 60, so the
            function can be used as a single-argument step in a chain.

    Returns:
        A list of (document, fused_score) tuples sorted by score, highest first.
    """
    # Fused score per unique document, keyed by the document's serialized form
    # so that it can be used as a dictionary key.
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # start=1 gives 1-based ranks, so the denominator is non-zero even when c == 0.
        for rank, doc in enumerate(docs, start=1):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0.0) + 1.0 / (c + rank)

    # Sort by fused score, highest first, and deserialize the documents again.
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of (document, fused score) tuples
    return reranked_results

# Fusion retrieval: generate queries, run the retriever on each, then fuse the ranked lists.
# reciprocal_rank_fusion is used as a chain step here, so it is called with the retrieved
# lists as its only argument.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

### (b) RRF Example and k-Value Discussion: 

In [None]:
## TODO: Provide an example showing the documents after re-ranking using RRF.

Discussion:

### (c) RAG Fusion Chain:

In [None]:
# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


# RAG Fusion chain. The input is a dict of the form {"question": "..."}.
#   - "context": retrieval_chain_rag_fusion yields (document, score) tuples in fused
#     rank order; the documents' page contents are joined into one string.
#   - "question": the original question, taken from the input dict.
# Both values fill the prompt above, which is sent to the LLM and parsed to a string.
rag_fusion_chain = (
    {
        "context": retrieval_chain_rag_fusion
        | (lambda ranked: "\n\n".join(doc.page_content for doc, _score in ranked)),
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

rag_fusion_chain.invoke({"question":question})