In [1]:
# !pip install langchain rank_bm25 pypdf unstructured chromadb
# !pip install unstructured['pdf'] unstructured
# !apt-get install poppler-utils
# !apt-get install -y tesseract-ocr
# !apt-get install -y libtesseract-dev
# !pip install pytesseract

### Load the required Packages

In [2]:
!pip -q install langchain_milvus langchain_community pypdf huggingface_hub rank_bm25

In [None]:
# Authenticate the Hugging Face CLI. No token value follows `--token` in this
# cell, so one has to be supplied before running it; avoid saving the notebook
# with a real token left in the cell.
!huggingface-cli login --token 

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `RAG Langchain Token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `RAG Langchain Token`


In [5]:
# Sanity check: show which Hugging Face account the CLI is logged in as.
!huggingface-cli whoami

Noxious22


In [6]:
# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_milvus import Milvus

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings, OpenAIEmbeddings
from langchain.llms import HuggingFaceHub
from langchain_community.document_loaders import PyPDFLoader

from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os
import textwrap

import warnings

warnings.filterwarnings("ignore")


In [7]:
# Location of the source PDF (a Colab-style absolute path).
pdf_path = "/content/Frankenstein_Project_Gutenberg_Small.pdf"

# Read the PDF and let the loader split it into documents.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [8]:
# Preview the text of the first loaded document.
first_page = pages[0]
print(first_page.page_content)

August 13th, 17—.
My affection for my guest increases every day. He excites at once my admiration and
my pity to an astonishing degree. How can I see so noble a creature destroyed by misery
without feeling the most poignant grief? He is so gentle, yet so wise; his mind is so
cultivated, and when he speaks, although his words are culled with the choicest art, yet
they ﬂow with rapidity and unparalleled eloquence.
He is now much recovered from his illness and is continually on the deck, apparently
watching for the sledge that preceded his own. Yet, although unhappy, he is not so utterly
occupied by his own misery but that he interests himself deeply in the projects of others.
He has frequently conversed with me on mine, which I have communicated to him
without disguise. He entered attentively into all my arguments in favour of my eventual
success and into every minute detail of the measures I had taken to secure it. I was easily
led by the sympathy which he evinced to use the language of

### Split Documents and Chunking

In [9]:
# Re-split the loaded documents into 800-character chunks, with 100
# characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=800,
)
chunks = splitter.split_documents(pages)

In [10]:
chunks[0].page_content

'August 13th, 17—.\nMy affection for my guest increases every day. He excites at once my admiration and\nmy pity to an astonishing degree. How can I see so noble a creature destroyed by misery\nwithout feeling the most poignant grief? He is so gentle, yet so wise; his mind is so\ncultivated, and when he speaks, although his words are culled with the choicest art, yet\nthey ﬂow with rapidity and unparalleled eloquence.\nHe is now much recovered from his illness and is continually on the deck, apparently\nwatching for the sledge that preceded his own. Yet, although unhappy, he is not so utterly\noccupied by his own misery but that he interests himself deeply in the projects of others.\nHe has frequently conversed with me on mine, which I have communicated to him'

In [None]:
# Make the Hugging Face API token available through the environment without
# ever writing it into the notebook. A token that is already set is kept;
# otherwise it is requested interactively (the input is not echoed and is not
# stored in the cell source or output).
from getpass import getpass

if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# from huggingface_hub import snapshot_download

# snapshot_download(repo_id="mistralai/Mistral-7B-v0.1", cache_dir="./local_models")

In [13]:
# from langchain_community.llms import Ollama
# from langchain_community.embeddings import OllamaEmbeddings

# MODEL = 'llama3.2'
# model = Ollama(model=MODEL)
# embeddings = OllamaEmbeddings(model=MODEL)
# llm = model


# Model selection: the Hub repository that both the tokenizer and the hosted
# LLM wrapper below point at.
# model_name = 'mistralai/Mistral-7B-v0.1'
model_name = 'google/gemma-2b'

# Local tokenizer for the chosen model (the weights themselves are not loaded;
# see the commented-out line).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Text generation goes through the HuggingFaceHub wrapper with fixed
# sampling settings.
llm = HuggingFaceHub(
    repo_id=model_name,
    model_kwargs=dict(temperature=0.7, max_length=512),
)


In [14]:
# Embedding model that is later handed to the Milvus vector store.
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(
    model_name='all-MiniLM-L6-v2',
)
# embeddings = embeddings_model.encode


### VectorStore

In [15]:
from pymilvus import MilvusClient

In [16]:
# Local Milvus database file and the collection this notebook writes to.
MILVUS_URL = "./hybrid_search.db"
COLLECTION_NAME = "LangChainCollection"

client = MilvusClient(uri=MILVUS_URL)

# Start from a clean slate: if a collection from a previous run is present,
# report it and drop it so that re-running the notebook does not stack a second
# copy of the documents on top of the old ones. (The drop used to sit in the
# `else` branch, where there is nothing to drop.)
if client.has_collection(COLLECTION_NAME):
    print("Collection exists")
    client.drop_collection(COLLECTION_NAME)

Collection exists


In [17]:
# vectorstore = Milvus.from_documents(
#     documents=pages,
#     embedding=embeddings_model,
#     # index='FLAT',
#     connection_args={
#         "uri": MILVUS_URL,
#         "index_type": 'FLAT'
#     },
#     drop_old=False,
# )

# Build the dense vector index over `chunks` -- the same split documents the
# BM25 keyword retriever is built from -- so that both halves of the ensemble
# search one corpus and identical passages can be fused instead of being
# returned once per granularity.
# NOTE(review): with drop_old=False an existing collection is kept, so
# re-running this cell presumably adds the documents again -- confirm.
vectorstore = Milvus.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    # builtin_function=BM25BuiltInFunction(),  # output_field_names="sparse"),
    # vector_field=["dense", "sparse"],
    connection_args={
        "uri": MILVUS_URL,
    },
    # consistency_level="Strong",
    drop_old=False,
    index_params={"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 128}},
)

In [18]:
# Dense retriever backed by the vector store; returns the top 3 matches.
vectorstore_retreiver = vectorstore.as_retriever(
    search_kwargs=dict(k=3),
)

In [19]:
# Keyword (BM25) retriever over the split chunks, limited to the top 3 hits.
keyword_retriever = BM25Retriever.from_documents(chunks, k=3)

### Ensemble Retriever

In [20]:
# Hybrid search: combine the dense and keyword retrievers, weighted equally.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[vectorstore_retreiver, keyword_retriever],
)

### Prompt Template:

In [21]:
# Prompt layout with <|system|> / <|user|> / <|assistant|> role markers.
# `{context}` and `{query}` are the two placeholders filled in when the prompt
# is rendered. The system marker previously carried a stray extra `>`.
template = """
<|system|>
You are a helpful AI Assistant that follows instructions extremely well.
Use the following context to answer user question.

Think step by step before answering the question. You will get a $100 tip if you provide correct answer.

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [22]:
# Parser that turns the model output into a plain string.
output_parser = StrOutputParser()

# Chat prompt built from the template string defined above.
prompt = ChatPromptTemplate.from_template(template)

In [23]:
# Inputs for the prompt: the ensemble retriever supplies `context`, while the
# value passed to the chain is forwarded unchanged as `query`.
prompt_inputs = {
    "context": ensemble_retriever,
    "query": RunnablePassthrough(),
}

# Retrieval -> prompt -> LLM -> string output.
chain = prompt_inputs | prompt | llm | output_parser

In [24]:
def print_response(response, width=80):
    """Print ``response`` re-flowed to ``width`` columns.

    Args:
        response: Text to display.
        width: Maximum line width handed to ``textwrap.fill`` (default 80).

    Returns:
        None. The wrapped text is written to standard output.
    """
    print(textwrap.fill(response, width=width))

In [25]:
# Run the RAG chain on a sample question.
response = chain.invoke("Who is Frankenstein?")

# When the model output echoes the prompt, the answer is the text after the
# assistant marker. If the marker is absent, fall back to the whole response
# rather than failing with an IndexError.
assistant_marker = '<|assistant|>'
response_parts = response.split(assistant_marker)
answer = response_parts[1] if len(response_parts) > 1 else response
print_response(answer)


 Victor Frankenstein is the creator and main character of Mary Shelley's novel
"Frankenstein; or, The Modern Prometheus". He is a young scientist who creates a
sentient creature in an unorthodox scientific experiment. The novel is narrated
by Robert Walton, who finds Victor Frankenstein on a ship bound for the North
Pole, and by letters written by Victor Frankenstein.
