In [None]:
!pip install langchain-openai langchain-community huggingface_hub PyPDF2 langchain-huggingface faiss-cpu langchain-groq

Collecting langchain-openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.3.5-py3-none-any.whl.metadata (2.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.meta

In [None]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from huggingface_hub import login
from langchain_groq import ChatGroq
import os

In [None]:
# Load variables from a .env file into the process environment, then log in to
# the Hugging Face Hub with whatever HUGGINGFACEHUB_API_TOKEN resolves to
# (None when the variable is not set).
load_dotenv()
login(token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"))

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Colab-only step: `files.upload()` is called and its result is kept in
# `uploaded`; the following cell reads `uploaded.keys()` as the uploaded
# file names.
from google.colab import files
uploaded = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Turn every uploaded file name into a path under /content and show the result.
# Assumes `uploaded` is a dict keyed by file name — TODO confirm.
pdf_paths = [
    f"/content/{name}"
    for name in uploaded
]
print(f"selected PDFs: {pdf_paths}")

In [None]:
def get_pdf_text(pdf_docs):
  '''
  Concatenate the extracted text of every page of every PDF.

  pdf_docs: iterable of PDF sources; each item is passed to PdfReader as-is.
  Returns a single string containing the page texts in order, or an empty
  string when there are no PDFs or no page yields any text.
  '''
  page_texts = []
  # Iterate over the argument (not a notebook-level global) so the function
  # processes exactly what the caller passed in.
  for pdf in pdf_docs:
    pdf_reader = PdfReader(pdf)
    for page in pdf_reader.pages:
      page_text = page.extract_text()
      # Skip pages for which extraction produced nothing (None or "").
      if page_text:
        page_texts.append(page_text)
  # Return only after ALL documents have been read.
  return "".join(page_texts)


In [None]:
# Extract the text of the selected PDFs and report how many characters came out.
raw_text = get_pdf_text(pdf_docs=pdf_paths)
print(
    "✅ Extracted text length:",
    len(raw_text),
)

In [None]:
def get_text_chunks(raw_text):
  '''
  Split one long string into a list of smaller text strings suitable for
  loading into a vector database.

  The splitter breaks on newlines, targets a chunk size of 1000 with an
  overlap of 200, and measures length with len (i.e. in characters).
  '''
  splitter_options = {
      "separator": "\n",
      "chunk_size": 1000,
      "chunk_overlap": 200,
      "length_function": len,
  }
  return CharacterTextSplitter(**splitter_options).split_text(raw_text)


In [None]:
# Split the extracted text and report how many chunks were produced.
text_chunks = get_text_chunks(raw_text=raw_text)
print(
    "✅ Number of text chunks:",
    len(text_chunks),
)

In [None]:
def get_vectorstore(text_chunks):
  '''
  Build a FAISS vector store from the given text chunks.

  text_chunks: the texts to embed and index (passed to FAISS.from_texts).
  Returns the vector store created by FAISS.from_texts.
  '''
  # Embeddings come from the "hkunlp/instructor-xl" model, configured for CPU.
  embedding_model = HuggingFaceEmbeddings(
      model_kwargs={"device": "cpu"},
      model_name="hkunlp/instructor-xl",
  )
  return FAISS.from_texts(embedding=embedding_model, texts=text_chunks)


In [None]:
# Embed and index the chunks, then confirm the store was built.
vectorstore = get_vectorstore(text_chunks=text_chunks)
print("✅ vectorstore created")

In [None]:
def get_conversation_chain(vectorstore):
    '''
    Create a conversational retrieval chain backed by a Groq chat model.

    vectorstore: object exposing `as_retriever()`; its retriever supplies the
        documents the chain answers from.
    Returns the chain built by ConversationalRetrievalChain.from_llm, wired to
    a ConversationBufferMemory stored under the 'chat_history' key.
    Raises ValueError when the GROQ_API_KEY environment variable is not set.
    '''
    # SECURITY: the API key must never be committed in the notebook. It is read
    # from the environment (e.g. a .env file loaded with load_dotenv()). Any key
    # that was previously hard-coded here should be treated as leaked and rotated.
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        raise ValueError(
            "GROQ_API_KEY is not set; add it to the environment or your .env file."
        )

    # Alternative backend kept for reference:
    # ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7, request_timeout=30)
    llm = ChatGroq(
        model_name="llama-3.3-70b-versatile",
        temperature=0.7,
        request_timeout=30,
        api_key=groq_api_key,
    )
    # return_messages=True keeps the history as message objects (with a
    # `.content` attribute) rather than one flattened string.
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True
    )
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain


In [None]:
# Build the chain once; chat_with_pdf reads this notebook-level `conversation`.
conversation = get_conversation_chain(vectorstore=vectorstore)

In [None]:
def chat_with_pdf(question):
    '''
    Send a question to the notebook-level `conversation` chain and print the
    chat history contained in its response.

    question: the text placed under the 'question' key of the chain input.
    Prints one "<sender>: <content>" entry per history message, each followed
    by a blank line. Returns None.
    '''
    # Use .invoke: calling the chain object directly is deprecated in LangChain.
    response = conversation.invoke({'question': question})
    for i, msg in enumerate(response['chat_history']):
      # Labels are assigned by position: even indices are shown as the user's
      # query, odd indices as the answer (assumes the history alternates
      # question/answer — TODO confirm).
      sender = "Query" if i % 2 == 0 else "Timeline"
      print(f"{sender}: {msg.content}\n")

In [None]:
# Ask the sample questions in order; every call prints the chat history
# returned by the chain for that question.
for sample_question in (
    "Events occured in 1985?",
    "Timeline of Europian History?",
    "What happened in 1938?",
):
    chat_with_pdf(sample_question)