In [2]:
!pip install langchain
!pip install langchain-community
!pip install langchain-text-splitters
!pip install transformers
!pip install sentence-transformers
!pip install faiss-cpu
!pip install pypdf

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.2-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-text-splitters<2.0.0,>=1.1.0 (from langchain-classic<2.0.0,>=1.0.0->langchain-community)
  Downloading langchain_text_splitters-1.1.0

In [1]:
import os
import textwrap

import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [17]:
#Read the paper from disk into LangChain documents and report how many were loaded
loader = PyPDFLoader("A Novel Deep Learning Approach for Myocardial Infarction.pdf")
documents = loader.load()
print("Loaded pages:", len(documents))

#Cut the documents into overlapping chunks (chunk_size=500, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs = text_splitter.split_documents(documents)
print("Total chunks created:", len(docs))

#Embedding model used to turn each chunk into a vector
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

#Index the chunk embeddings in FAISS, then expose the index as a retriever
#configured to return the 4 closest chunks for each query
vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

Loaded pages: 19
Total chunks created: 231


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [18]:
#FLAN-T5 checkpoint used as the answer generator
model_name = "google/flan-t5-base"

#Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#Load the tokenizer, then load the seq2seq model and place it on the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



In [106]:
#RAG: retrieval step
question = "What are DL Classifiers used in this research?"

#Ask the retriever for the chunks that best match the question
relevant_docs = retriever.invoke(question)

#Use only the first two returned chunks, separated by a blank line,
#and cap the combined context at 1200 characters
context = "\n\n".join(doc.page_content for doc in relevant_docs[:2])[:1200]

In [107]:
#Prompt layout: instructions, then the retrieved context, then the question
PROMPT_TEMPLATE = """
You are an AI research assistant.
Answer the question clearly using the context below.

Context:
{context}

Question: {question}

Answer in 2-3 complete sentences:
"""

#Fill the template with the retrieved context and the user's question
prompt = PROMPT_TEMPLATE.format(context=context, question=question)

In [108]:
#GENERATION BLOCK
#Tokenize the prompt as PyTorch tensors; input beyond 512 tokens is truncated
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

#Move inputs to same device as model
inputs = {k: v.to(device) for k, v in inputs.items()}

#Generate output (sampling disabled, at most 200 new tokens)
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=False
)

#Decode the first generated sequence back to text, dropping special tokens
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

#Wrap at 80 chars for readability
formatted_answer = textwrap.fill(answer, width=80)

#sep="\n" puts the answer on its own line; the previous
#print("Answer:\n", ...) used the default " " separator and so prefixed
#the answer with a stray space.
print("Answer:", formatted_answer, sep="\n")

Answer:
 (RNN, DNN, CNN, and LSTM)
