In [None]:
!pip install langchain-community
!pip install chromadb
!pip install sentence_transformers
!pip install pypdf
!pip install tiktoken
!pip install chromadb

In [2]:
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

In [4]:
# Gather the PDFs matched by the glob inside the corpus folder; every matched
# file is handed to PyPDFLoader.
loader = DirectoryLoader(
    "/content/ipc-data",
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Break the loaded documents into overlapping chunks (size 500, overlap 200,
# measured by the splitter's default length function) for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

In [5]:
# Show how many chunks the splitter produced instead of dumping every chunk
# into the cell output.
len(texts)

15148

In [7]:

# Embedding model used both to index the chunks and to embed queries.
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

# Only the first SUBSET_SIZE chunks are indexed, so retrieval can only ever
# return text from that leading slice of the corpus.
SUBSET_SIZE = 30
subset_texts = texts[:SUBSET_SIZE]
assert subset_texts, "No text chunks to index; check that the PDFs were loaded."

persist_directory = "/content/db_directory"

# Deterministic ids make this cell idempotent: re-running it overwrites the
# same entries rather than appending a second copy of every chunk to the
# persisted collection. Entries written by earlier runs without ids are not
# removed; delete the persist directory to start from a clean store.
chunk_ids = [f"chunk-{position}" for position in range(len(subset_texts))]
db = Chroma.from_documents(
    subset_texts,
    embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
)

# NOTE(review): the recorded output shows deprecation warnings; on recent
# Chroma releases persistence is reportedly automatic, which would make this
# call redundant. Kept for older installations. Confirm before removing.
db.persist()

# Re-open the store from disk to confirm the persisted copy is usable.
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

print(f"Number of chunks in subset: {len(subset_texts)}")
print("Database created and persisted with the subset of text chunks.")


Number of chunks in subset: 30
Database created and persisted with the subset of text chunks.


  warn_deprecated(
  warn_deprecated(


In [33]:
# Identifier of the seq2seq checkpoint used to generate answers.
checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

# Options for loading the weights: float32 precision, with device placement
# delegated to device_map="auto".
model_load_options = dict(device_map="auto", torch_dtype=torch.float32)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **model_load_options)

In [35]:
# Generation settings forwarded to the pipeline: sampling is enabled, with a
# temperature of 0.3, nucleus sampling at top_p=0.95 and a max_length of 512.
generation_settings = {
    "max_length": 512,
    "do_sample": True,
    "temperature": 0.3,
    "top_p": 0.95,
}

# Wrap the loaded model and tokenizer in a text2text-generation pipeline.
pipe = pipeline(
    task="text2text-generation",
    model=base_model,
    tokenizer=tokenizer,
    **generation_settings,
)


In [36]:
# Similarity search over the vector store, returning the 2 closest chunks.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Expose the local generation pipeline through LangChain's LLM interface.
local_llm = HuggingFacePipeline(pipeline=pipe)

# "stuff" chain: retrieved chunks are passed to the model together with the
# question; the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)


In [56]:
# Read the question from the user. input() already returns a str, so no
# conversion is needed; surrounding whitespace is stripped.
input_query = input("Enter your query:").strip()

if input_query:
    # Use Chain.invoke: calling the chain object directly is deprecated in
    # current LangChain releases.
    llm_response = qa_chain.invoke({"query": input_query})
    print(llm_response['result'])
else:
    # Skip retrieval and generation when nothing was typed.
    print("No query entered; nothing to answer.")


Enter your query:what is ipc
IPC stands for Indian Penal Code.


In [12]:

# Write the tokenizer and the model to folders relative to the current
# working directory, tokenizer first.
for artifact, output_dir in (
    (tokenizer, "tokenizer_model"),
    (base_model, "base_model"),
):
    artifact.save_pretrained(output_dir)


