# RAG using Langchain

## Packages loading & import

In [None]:
!pip install langchain
!pip install langchain_community
!pip install langchain_huggingface
!pip install langchain_text_splitters
!pip install langchain_chroma
!pip install rank-bm25
!pip install huggingface_hub

In [None]:
import os
import json
import bs4
import nltk
import torch
import pickle
import numpy as np

# from pyserini.index import IndexWriter
# from pyserini.search import SimpleSearcher
from numpy.linalg import norm
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import JinaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from tqdm import tqdm

In [None]:
# Download the NLTK "punkt" and "punkt_tab" data packages.
# NOTE(review): presumably these back the `word_tokenize` import above — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

## Hugging face login
- Please request access to the model first: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
- If you have not been granted access to this model, you can use another LLM that does not require an access request.
- Save your Hugging Face token somewhere safe; otherwise you will need to regenerate it every time.
- When using Ollama, no login is required to access and utilize the llama model.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook: it would be saved with the
# file and leaked as soon as the notebook is shared or committed.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise the user is prompted for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Authenticate this session and also store the token as a git credential.
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Sanity check: ask the Hugging Face CLI which account is currently logged in.
!huggingface-cli whoami

## Ollama environment setup

- Ollama is a platform designed for running and managing large language models (LLMs) directly **on local devices**, providing a balance between performance, privacy, and control.
- Other tools, such as *vLLM*, *Llamafile*, and *GPT4All*, also help users run and accelerate LLMs on local devices.

In [None]:
!pip install colab-xterm #https://pypi.org/project/colab-xterm/
%load_ext colabxterm


- Run the following command in the terminal:

  ```curl -fsSL https://ollama.com/install.sh | sh```

- If the session stays idle for a long time, the connection will be closed. If that happens, run `ollama serve` again.

  ```ollama serve```

- Then <font color=#FF0000>**execute this block again**</font> to download the LLM. (Ollama library: https://ollama.com/library)
  - In this tutorial, we'll use the model llama3.2:1b.

  ```ollama pull llama3.2:1b```

In [None]:
# Open a terminal inside the notebook for the Ollama commands listed above.
# NOTE(review): the %xterm magic presumably comes from the colabxterm extension loaded earlier — confirm.
%xterm

## Ollama testing

In [None]:
# Setting up the model that this tutorial will use
MODEL = "llama3.2:1b" # https://ollama.com/library/llama3.2:3b
EMBED_MODEL = "jinaai/jina-embeddings-v2-base-en"

In [None]:
# Initialize an instance of the Ollama model
llm = Ollama(model=MODEL)
# Invoke the model to generate responses
response = llm.invoke("What is the capital of Taiwan?")
print(response)

## Build a simple RAG system by using LangChain

In [None]:
# Initialize the Llama 3 model
llm_model = Ollama(model=MODEL)

# Create an embedding model
model_kwargs = {'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
embeddings_model = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
# Prompt setting
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
# print(prompt)

- For the vector store, common choices such as FAISS or Chroma (https://python.langchain.com/docs/integrations/vectorstores/) are used to search very large document collections efficiently.

In [None]:
# Prepare documents
documents = [
    Document(page_content="The capital of Florida is Tallahassee.", metadata={"id": 0}),
    Document(page_content="Florida is known for its beautiful beaches and warm climate.", metadata={"id": 1}),
    Document(page_content="The largest city in Florida by population is Jacksonville.", metadata={"id": 2}),
    Document(page_content="The President of Miami Dade College is President Madeline Pumariega.", metadata={"id": 3}),
    Document(page_content="The Provost of Miami Dade College is Dr. Malou C. Harrison.", metadata={"id": 4}),
    Document(page_content="Dr. Ernesto Lee is an AI and Data Analytics Professor on the Kendall Campus at Miami Dade College.", metadata={"id": 5})
]


In [None]:
# Create Chroma vector store
# search_type could be “similarity” (default), “mmr”, or “similarity_score_threshold”
vector_store = Chroma.from_documents(documents, embedding=embeddings_model)
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 3, "fetch_k": 5})

In [None]:
# Load the QA chain
question_answer_chain = create_stuff_documents_chain(llm=llm_model, prompt=prompt) # Create a chain for passing a list of Documents to a model.
# print(question_answer_chain)

chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=question_answer_chain) # Create retrieval chain that retrieves documents and then passes them on.
# print(chain)

In [None]:
# Use the QA chain to retrieve relevant documents and generate a response
queries = [
    "What is the capital of Florida?",
    "Who is the President of Miami Dade College?",
    "Who is the Provost of Miami Dade College?",
    "Who is Dr. Ernesto Lee?"
]

for query in queries:
    response = chain.invoke({"input": query})
    print(f"Query: {query}\nResponse: {response}\n")
