In [44]:
import fitz  # PyMuPDF
from transformers import pipeline
import os
import time

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The result of ``page.get_text()`` for each page, joined with no
        separator.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File {pdf_path} not found.")
    # Use the document as a context manager so it is closed even when
    # text extraction fails partway through the pages.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Initialize the question-answering pipeline with the BERT checkpoint named
# below. This happens before any timer starts, so model loading is not
# included in either reported duration.
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Path to your PDF file
pdf_path = '/Users/geoyak/Desktop/pythonProject1/stripe-startup.pdf'

# Extract text from the PDF. Durations are measured with perf_counter(),
# a monotonic clock, so system clock adjustments cannot skew them.
start_time = time.perf_counter()
document_text = extract_text_from_pdf(pdf_path)
end_time = time.perf_counter()

# Question to find the answer for based on the document text
question = "Where are the top startups located?"

# Search for the answer, using the whole extracted text as context
start_time_qa = time.perf_counter()
answer = qa_pipeline(question=question, context=document_text)
end_time_qa = time.perf_counter()

# Print the question, the extracted answer, and the two timings.
# (Uncomment the next two lines to also dump the full context.)
print("Question:", question)
print("Answer:", answer['answer'])
#print("Context:")
#print(document_text)
print(f"Time to extract text from PDF: {end_time - start_time:.2f} seconds")
print(f"Time to find answer: {end_time_qa - start_time_qa:.2f} seconds")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: Where are the top startups located?
Answer: US, the UK, and France
Time to extract text from PDF: 0.33 seconds
Time to find answer: 14.35 seconds


In [45]:
import fitz  # PyMuPDF
from transformers import pipeline
import os
import time

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The result of ``page.get_text()`` for each page, joined with no
        separator.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File {pdf_path} not found.")
    # Use the document as a context manager so it is closed even when
    # text extraction fails partway through the pages.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Initialize the question-answering pipeline with the BERT checkpoint named
# below. This happens before any timer starts, so model loading is not
# included in either reported duration.
qa_pipeline = pipeline("question-answering", model="mrm8488/bert-small-finetuned-squadv2")

# Path to your PDF file
pdf_path = '/Users/geoyak/Desktop/pythonProject1/stripe-startup.pdf'

# Extract text from the PDF. Durations are measured with perf_counter(),
# a monotonic clock, so system clock adjustments cannot skew them.
start_time = time.perf_counter()
document_text = extract_text_from_pdf(pdf_path)
end_time = time.perf_counter()

# Question to ask based on the document text
question = "Where are the top startups located?"

# Search for the answer, using the whole extracted text as context
start_time_qa = time.perf_counter()
answer = qa_pipeline(question=question, context=document_text)
end_time_qa = time.perf_counter()

# Print the question, the extracted answer, and the two timings.
# (Uncomment the next two lines to also dump the full context.)
print("Question:", question)
print("Answer:", answer['answer'])
#print("Context:")
#print(document_text)
print(f"Time to extract text from PDF: {end_time - start_time:.2f} seconds")
print(f"Time to find answer: {end_time_qa - start_time_qa:.2f} seconds")



Some weights of the model checkpoint at mrm8488/bert-small-finetuned-squadv2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: Where are the top startups located?
Answer: the Bay Area, New York, Los Angeles, Miami, and Austin
Time to extract text from PDF: 0.14 seconds
Time to find answer: 1.03 seconds
