In [1]:
from dotenv import load_dotenv
import os   

load_dotenv(override=True)

os.environ["HUGGINGFACE_API_KEY"] = os.getenv("HUGGINGFACE_API_KEY")

In [4]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("my_research_paper.pdf")

docs = loader.load()


In [5]:
docs

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T19:51:56+05:00', 'author': 'IJIETAP Assistant Editor', 'moddate': '2025-07-31T19:51:56+05:00', 'source': 'my_research_paper.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content="1 \n \nTHE DEVELOPMENT OF ARTIFICIAL INTELLIGENCE-BASED OPTIMAL \nROUTE SELECTION FRAMEWORK FOR RESCUE SERVICES PROCESS \nMANAGEMENT \nFahad M. Alqahtani 1, Imtisal Ahmad Hashmi 2, Imran Ahmad 2, Irfan Ahmed 3,4*, Mohammed Alkahtani 1 \n1 Industrial Engineering Department, College of Engineering, King Saud University, P.O. Box 800, Riyadh 11421, Saudi \nArabia; afahad@ksu.edu.sa, moalkahtani@ksu.edu.sa;  \n2 Department of Industrial Engineering, University of Engineering and Technology, Peshawar 25000, Pakistan; \nengrimtisalahmadhashmi@gmail.com, imranahmad@uetpeshawar.edu.pk \n3 Ayass BioScience LLC, Frisco, TX, USA,  \n4   Department of Electrical En

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

texts = text_splitter.split_documents(docs)

In [8]:
texts[:5]

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T19:51:56+05:00', 'author': 'IJIETAP Assistant Editor', 'moddate': '2025-07-31T19:51:56+05:00', 'source': 'my_research_paper.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='1 \n \nTHE DEVELOPMENT OF ARTIFICIAL INTELLIGENCE-BASED OPTIMAL \nROUTE SELECTION FRAMEWORK FOR RESCUE SERVICES PROCESS \nMANAGEMENT \nFahad M. Alqahtani 1, Imtisal Ahmad Hashmi 2, Imran Ahmad 2, Irfan Ahmed 3,4*, Mohammed Alkahtani 1 \n1 Industrial Engineering Department, College of Engineering, King Saud University, P.O. Box 800, Riyadh 11421, Saudi \nArabia; afahad@ksu.edu.sa, moalkahtani@ksu.edu.sa;  \n2 Department of Industrial Engineering, University of Engineering and Technology, Peshawar 25000, Pakistan; \nengrimtisalahmadhashmi@gmail.com, imranahmad@uetpeshawar.edu.pk \n3 Ayass BioScience LLC, Frisco, TX, USA,  \n4   Department of Electrical En

In [26]:
model_name = "google/gemma-3-1b-it"
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name=model_name)

No sentence-transformers model found with name google/gemma-3-1b-it. Creating a new one with mean pooling.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of Gemma3TextModel were not initialized from the model checkpoint at google/gemma-3-1b-it and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.post_feedforward_layernorm.weight', 'layers.0.pre_feedforward_layernorm.weight', 'layers.0.self_attn.k_norm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_norm.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernor

In [28]:
from langchain_community.vectorstores import FAISS
import faiss

db = FAISS.from_documents(texts, embeddings)

In [29]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [30]:
pipe = pipeline("text-generation", 
                model=model, 
                tokenizer=tokenizer, 
                device=-1, # 0 for cuda and -1 for cpu
                temperature=0.1)

Device set to use cpu


In [31]:
llm = HuggingFacePipeline(pipeline=pipe)

In [33]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer
Make sure the answer is well structured and easy to understand
<context>
{context}
</context>
Question: {input}""")

In [34]:
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm,prompt)

In [35]:
#initializing the retriever

retriever = db.as_retriever()

retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000025D928D7380>, search_kwargs={})

In [36]:
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [37]:
question ="What are the main contributions of the paper?"

In [38]:
response = retrieval_chain.invoke({"input": question})

In [40]:
response

{'input': 'What are the main contributions of the paper?',
 'context': [Document(id='96a09982-a5de-4b6e-9d5e-4ac75abbff52', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T19:51:56+05:00', 'author': 'IJIETAP Assistant Editor', 'moddate': '2025-07-31T19:51:56+05:00', 'source': 'my_research_paper.pdf', 'total_pages': 22, 'page': 4, 'page_label': '5'}, page_content='the output for binary or multiclass classification. It involves  initializing weights, feedforward propagation, loss calculation, \nbackpropagation, and iterative training mechanisms. Mathematically it can be written as:'),
  Document(id='f031f360-5fac-4681-9c21-be7a9f09fbde', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T19:51:56+05:00', 'author': 'IJIETAP Assistant Editor', 'moddate': '2025-07-31T19:51:56+05:00', 'source': 'my_research_paper.pdf', 'tot