In [1]:
!pip install PyPDF2 faiss-cpu sentence_transformers transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Building

In [12]:
from PyPDF2 import PdfReader
from faiss import IndexFlatL2
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np

In [14]:
# Checkpoint used to turn PDF lines and user questions into embedding vectors.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Checkpoint behind the question-answering pipeline used to pull answers out of
# the retrieved lines.
QA_MODEL_NAME = "distilbert-base-cased-distilled-squad"
qa_model = pipeline(task="question-answering", model=QA_MODEL_NAME)


In [None]:
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Value handed to ``PdfReader``. ``None`` is tolerated and
            yields an empty string.

    Returns:
        str: The non-empty page texts joined with a newline, so the last line
        of one page is not fused with the first line of the next. Pages for
        which ``extract_text()`` returns nothing are skipped. An empty string
        is returned when ``pdf_path`` is ``None`` or no page has text.
    """
    if pdf_path is None:
        # Always hand back a str (never an implicit None) so callers can use
        # string methods on the result unconditionally.
        return ""

    pdf_reader = PdfReader(pdf_path)

    # Collect per-page text first and join once instead of growing a string
    # with repeated concatenation.
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

In [None]:
def embed_text(text):
    """Encode ``text`` with the module-level sentence-embedding ``model``.

    Args:
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``text``.
    """
    embedding = model.encode(text)
    return embedding

In [None]:
def build_vector_store(texts):
    """Embed ``texts`` and store the embeddings in a FAISS ``IndexFlatL2``.

    Args:
        texts: Iterable of strings to index. Row ``i`` of the returned index
            corresponds to the ``i``-th text.

    Returns:
        IndexFlatL2: Index holding one float32 vector per input text.

    Raises:
        ValueError: If ``texts`` is empty; without any vector the embedding
            dimension needed to create the index is unknown.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("build_vector_store() needs at least one text to index")

    # Encode the whole list in one call so the model can batch the texts,
    # rather than running one encode call per text.
    vectors_np = np.ascontiguousarray(embed_text(texts), dtype="float32")

    index = IndexFlatL2(vectors_np.shape[1])
    index.add(vectors_np)
    return index

In [None]:
def query_vector_store(query, index, texts, k=5):
    """Return the indexed texts that are closest to ``query``.

    Args:
        query: Query string; it is embedded with ``embed_text``.
        index: FAISS index whose rows line up with ``texts``.
        texts: Sequence of the texts that were indexed.
        k: Maximum number of neighbours to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        list: At most ``k`` entries of ``texts``, in the order reported by
        ``index.search``. Empty when the index or ``texts`` is empty or
        ``k`` is not positive.
    """
    # FAISS pads missing neighbours with index -1, which Python would silently
    # resolve to texts[-1]; never ask for more neighbours than are stored.
    k = min(k, index.ntotal, len(texts))
    if k <= 0:
        return []

    # FAISS expects a float32 matrix with one row per query.
    query_embedding = np.asarray(embed_text(query), dtype="float32").reshape(1, -1)
    _distances, indices = index.search(query_embedding, k=k)

    # Keep only valid positions as a second line of defence against padding.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]

In [None]:
def answer_question(question, passage):
    """Ask the module-level ``qa_model`` pipeline a question about one passage.

    Args:
        question: The question text.
        passage: The text passed to the pipeline as ``context``.

    Returns:
        The value stored under the ``"answer"`` key of the pipeline result.
    """
    result = qa_model(context=passage, question=question)
    return result["answer"]


In [None]:
def chat(pdf_path):
    """Run an interactive question-answering loop over the lines of a PDF.

    The PDF text is split on newlines, every non-blank line is indexed, and
    for each question the lines returned by ``query_vector_store`` are passed
    one by one to ``answer_question``; each answer is printed.

    The loop ends when the user types ``quit`` (surrounding whitespace and
    letter case are ignored), or on Ctrl-C / end of input.

    Args:
        pdf_path: Path of the PDF, forwarded to ``process_pdf``.

    Returns:
        None. Returns immediately, after printing a notice, when no text could
        be extracted from the PDF.
    """
    # process_pdf may hand back None (e.g. for a None path); treat it as no text.
    text = process_pdf(pdf_path) or ""

    # Skip blank lines: they carry no content to retrieve, and an empty
    # context is rejected by the question-answering pipeline.
    sentences = [line for line in text.strip().split("\n") if line.strip()]
    if not sentences:
        print("No extractable text found in the PDF.")
        return

    vector_store = build_vector_store(sentences)

    # Chat loop
    while True:
        try:
            user_query = input("Ask your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session cleanly instead of surfacing a traceback.
            print()
            break

        if user_query.lower() == "quit":
            break
        if not user_query:
            # Nothing was asked; prompt again.
            continue

        relevant_sentences = query_vector_store(user_query, vector_store, sentences)
        for sentence in relevant_sentences:
            answer = answer_question(user_query, sentence)
            print(f"\nPossible Answer: {answer}")

        print("\n")

In [13]:
# Start the interactive question-answering session on the sample PDF.
pdf_file = "222.pdf"
chat(pdf_file)

Ask your question: What is CNN?

Possible Answer: a CNN

Possible Answer: CNN

Possible Answer: models

Possible Answer: expressive power

Possible Answer: models


Ask your question: Rnn

Possible Answer: 16 to 128 neu rons

Possible Answer: N.

Possible Answer: networks

Possible Answer: LeNet

Possible Answer: 1


Ask your question: LeNet

Possible Answer: LeNet

Possible Answer: Advantages of LeNet

Possible Answer: 21 List the advantages of LeNet. How many layers

Possible Answer: convolutional and pooling layers

Possible Answer: AlexNet


Ask your question: What the PDF about

Possible Answer: Characteristics

Possible Answer: preservatio n

Possible Answer: Here's how it can be applied

Possible Answer: large-scale and diverse set of labelled  images

Possible Answer: ca tegorize images into different classes or labels


Ask your question: Title

Possible Answer: 1

Possible Answer: 1

Possible Answer: introduction

Possible Answer: backpropagation

Possible Answer: Characteris

KeyboardInterrupt: ignored