In [5]:
import functools
import sqlite3
import time

import fitz  # PyMuPDF
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Open (or create) the SQLite file that caches extracted PDF text, and keep a
# single module-level cursor that the helper functions below share
conn = sqlite3.connect('pdf_texts.db')
c = conn.cursor()

# Start from a clean slate: discard whatever a previous run stored
c.execute("DROP TABLE IF EXISTS pdf_texts")

# Recreate the table that holds each PDF's text and its vector representation
c.execute(
    "CREATE TABLE IF NOT EXISTS pdf_texts\n"
    "             (id INTEGER PRIMARY KEY, filename TEXT, text TEXT, vector TEXT)"
)

# Function to save text from a PDF file to the database
def save_pdf_text_to_db(pdf_filename):
    """Extract the text of a PDF and store it, with its vector, in ``pdf_texts``.

    The text of every page is concatenated in page order, vectorized with
    ``vectorize_text`` and inserted as a new row, which is committed
    immediately.

    Args:
        pdf_filename: Path of the PDF to read. The same string is stored in
            the ``filename`` column and is what ``find_text_by_filename``
            matches on.

    Note:
        Every call inserts a new row; the table has no uniqueness constraint
        on ``filename``, so saving the same file twice stores it twice.
    """
    # The context manager closes the document even if a page fails to extract
    with fitz.open(pdf_filename) as doc:
        text = "".join(page.get_text() for page in doc)

    # Vectorize the text
    # NOTE(review): the vector is bound exactly as vectorize_text returns it
    # (presumably a NumPy array that sqlite3 stores as raw bytes) -- confirm
    # this is the intended storage format for the ``vector`` column.
    vector = vectorize_text(text)

    c.execute("INSERT INTO pdf_texts (filename, text, vector) VALUES (?, ?, ?)", (pdf_filename, text, vector))
    conn.commit()

# Function to vectorize text
def vectorize_text(text):
    """Return a Doc2Vec embedding of *text*.

    A fresh Doc2Vec model (vector size 100) is built from this one document
    alone, and the document's own tokens are then fed back to it for
    inference.

    Args:
        text: The raw text to embed.

    Returns:
        The vector produced by ``Doc2Vec.infer_vector`` for the tokens.
    """
    tokens = word_tokenize(text)
    # The training corpus consists of this single tagged document
    corpus = [TaggedDocument(words=tokens, tags=[1])]
    doc2vec = Doc2Vec(corpus, vector_size=100, window=2, min_count=1, workers=4)
    return doc2vec.infer_vector(tokens)

# Function to find text by filename
def find_text_by_filename(filename):
    """Look up the stored text and vector for *filename*.

    Args:
        filename: Value to match exactly against the ``filename`` column.

    Returns:
        A ``(text, vector)`` row for the first match, or ``None`` when no row
        has that filename.
    """
    query = "SELECT text, vector FROM pdf_texts WHERE filename=?"
    params = (filename,)
    # Cursor.execute returns the cursor itself, so the fetch can be chained
    return c.execute(query, params).fetchone()

@functools.lru_cache(maxsize=1)
def _load_t5(model_name):
    """Load the T5 model and tokenizer for *model_name*, memoizing the pair.

    Loading a checkpoint is expensive, so the most recently used pair is kept
    in memory instead of being rebuilt on every question.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


# Function to answer questions using the T5 model
def answer_question(pdf_filename, question, model_name="t5-large"):
    """Answer *question* using the text of a PDF as context for a T5 model.

    The PDF text is read from the ``pdf_texts`` table; if the file has not
    been stored yet it is extracted and saved first.

    Args:
        pdf_filename: Path of the PDF, also used as the database lookup key.
        question: The question to ask about the document.
        model_name: Name of the pretrained T5 checkpoint to load. Defaults to
            ``"t5-large"``.

    Returns:
        A tuple ``(answer, seconds)``: the decoded answer text and the time
        spent in ``model.generate`` only (model loading and tokenization are
        not included).
    """
    # Get text from the database; on a miss, extract and store it first
    found_text = find_text_by_filename(pdf_filename)
    if found_text is None:
        save_pdf_text_to_db(pdf_filename)
        found_text = find_text_by_filename(pdf_filename)
    # Only the text is needed here; the stored vector is ignored
    document_text = found_text[0]

    # Load the T5 model and tokenizer (reused across calls)
    model, tokenizer = _load_t5(model_name)

    # Create input text for the T5 model
    input_text = f"question: {question} context: {document_text}"

    # Tokenize the input text. Truncation to 512 tokens means only the
    # beginning of a long document is passed to the model.
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the answer using the T5 model; perf_counter is monotonic, so
    # the measured duration cannot be skewed by system clock adjustments
    start_time_qa = time.perf_counter()
    answer_ids = model.generate(inputs, max_length=100)
    end_time_qa = time.perf_counter()

    # Decode the answer from tokens to text
    answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    return answer, end_time_qa - start_time_qa

# Function to display remedial action instructions from the PDF
def display_remedial_action(pdf_filename):
    """Return the text of the first page that mentions "Remedial Action".

    Despite its name, this function prints nothing; the caller decides what
    to do with the returned text.

    Args:
        pdf_filename: Path of the PDF to search.

    Returns:
        The full text of the first page containing the exact, case-sensitive
        phrase ``"Remedial Action"``, or an empty string when no page
        contains it.
    """
    # The context manager closes the document on every exit path, including
    # the early return below and any exception raised while reading a page
    with fitz.open(pdf_filename) as doc:
        # Search for the "Remedial Action" section
        for page in doc:
            page_text = page.get_text()
            if "Remedial Action" in page_text:
                return page_text

    return ""

# Close the connection to the database
def close_connection():
    """Close the module-level SQLite connection ``conn``."""
    conn.close()


# Example usage of the functions
pdf_filename = '/Users/geoyak/Desktop/pythonProject1/Coffee.pdf'
question = "How can I resolve the issue of my coffee being trapped in the machine?"

# The finally clause guarantees the database connection is released even if
# the PDF cannot be opened or the model fails to load or generate.
try:
    # Get the answer to the question
    answer, processing_time = answer_question(pdf_filename, question)
    print("Question:", question)
    print("Answer:", answer)
    print("Processing time:", processing_time, "seconds")

    # Display remedial action instructions
    remedial_action = display_remedial_action(pdf_filename)
    if remedial_action:
        print("Remedial Action Instructions:")
        print(remedial_action)
finally:
    close_connection()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Question: How can I resolve the issue of my coffee being trapped in the machine?
Answer: programming the coffee quantity
Processing time: 2.5438408851623535 seconds
