In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files under MyDrive are reachable
# through the local filesystem. force_remount=True is passed so the mount is
# redone even when the path is already mounted.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
# %pip (not !pip) installs into the environment of the running kernel.
# Pinned to the version this notebook was last run with, for reproducibility.
%pip install PyMuPDF==1.23.26

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.26-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.22-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.26 PyMuPDFb-1.23.22


In [3]:
import fitz  # PyMuPDF library

def extract_text_by_columns(pdf_path):
    """Extract the text of a PDF as one cleaned, single-line string.

    Every page is read with ``page.get_text("blocks")`` and the text of each
    block (element 4 of the block tuple) is concatenated in the order the
    blocks are returned. No column re-ordering is performed: blocks left and
    right of any x-coordinate are appended identically, so the result follows
    PyMuPDF's own block order.

    After all pages are collected, layout noise is stripped: carriage returns,
    the ``\\x17`` control character, the running page header of the manual,
    ``|`` separators, runs of three spaces and finally every newline.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The cleaned text of the whole document ("" for a PDF without pages).
    """
    # Fragments are removed in exactly this order. Newlines go last, so a
    # three-space run is only matched within a single line.
    noise_fragments = (
        "\r",
        "\x17",
        "V I R G I N I A  D R I V E R ’ S  M A N U A L",
        "|",
        "   ",
        "\n",
    )

    # Collect pieces in a list and join once instead of growing a string.
    pieces = []
    with fitz.open(pdf_path) as pdf_file:
        for page in pdf_file:
            pieces.extend(block[4] for block in page.get_text("blocks"))
            pieces.append("\n")  # page boundary (removed by the cleanup below)

    # Clean once over the full text rather than re-scanning the accumulated
    # string after every page.
    text = "".join(pieces)
    for fragment in noise_fragments:
        text = text.replace(fragment, "")
    return text


# Example usage
pdf_path = r'/content/drive/MyDrive/Colab Notebooks/DMV_test.pdf'
pdf_content = extract_text_by_columns(pdf_path)

# Display only a short preview; echoing the whole manual floods the cell output.
pdf_content[:500]



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Split the extracted text into sentence-level "documents".
# The split happens on a whitespace character that
#   - follows a "." or "?"                      (?<=\.|\?)
#   - is not preceded by a letter-dot-letter-any pattern such as "e.g."  (?<!\w\.\w.)
#   - is not preceded by a capitalised two-letter abbreviation such as "Dr."  (?<![A-Z][a-z]\.)
documents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', pdf_content)

# Fit the TF-IDF vocabulary on the sentences and keep their vectors; queries
# are compared against tfidf_matrix. The pairwise document-to-document
# similarity matrix is not computed here because nothing in this section reads it.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

In [5]:
def generate_answer(text, top_k=1):
    """Return the stored sentence(s) most similar to ``text``.

    Uses the notebook-level ``vectorizer``, ``tfidf_matrix`` and ``documents``:
    the query is projected with ``vectorizer.transform`` and ranked against
    every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        text: The question / query string.
        top_k: How many of the best-matching sentences to return (default 1).

    Returns:
        str: The ``top_k`` best sentences, most similar first, separated by a
        blank line, with surrounding whitespace stripped.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Transform the given text into a TF-IDF vector using the fitted vocabulary.
    text_tfidf = vectorizer.transform([text])

    # One similarity score per document, flattened to a 1-D array.
    scores = cosine_similarity(text_tfidf, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it to get the best matches first.
    best_indices = np.argsort(scores)[::-1][:top_k]

    return '\n\n'.join(documents[idx] for idx in best_indices).strip()

In [6]:
# Try the retrieval function on a sample question
input_text = "What to do at green light?"
answer = generate_answer(input_text)
print(f"\nInput Text: {input_text}")
# The matched sentence keeps its own end punctuation, so no extra "." is added.
print(f"\nAnswer: {answer}")


Input Text: What to do at green light?

Answer: 6 Section 2: Signals, Signs and Pavement MarkingsGreen light or arrow: At a green light, you may go if the way is clear..


In [None]:
# # Save the model and sentences
# model_dir = "saved_model"
# if not os.path.exists(model_dir):
#     os.makedirs(model_dir)

# sentences_file = os.path.join(model_dir, "sentences.txt")
# with open(sentences_file, "w") as f:
#     for sentence in sentences:
#         f.write(sentence + "\n")

# Save the model to disk.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import joblib

In [8]:
# Cut the extracted text into sentence-level documents: split on whitespace
# that follows "." or "?", except after the abbreviation-like patterns that
# the two negative look-behinds exclude.
documents = re.split(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
    pdf_content,
)

In [9]:
# Learn the TF-IDF vocabulary from the sentences and vectorize them.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Pairwise similarity of every sentence with every other sentence; with the
# second argument omitted, cosine_similarity compares the matrix with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [10]:
# Persist everything the Q&A step unpacks, as one tuple in this exact order:
# (fitted vectorizer, pairwise similarity matrix, sentence list).
model_bundle = (vectorizer, cosine_sim, documents)
joblib.dump(model_bundle, 'qa_model.pkl')

['qa_model.pkl']

# Use the saved model for Q and A.

In [None]:
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Loaded models, keyed by file path:
#   {model_path: (file mtime, vectorizer, documents, TF-IDF matrix of documents)}
_QA_MODEL_CACHE = {}


def _load_qa_model(model_path):
    """Return (vectorizer, documents, document TF-IDF matrix), loading from disk only when needed."""
    import os  # local import keeps this cell self-contained

    mtime = os.path.getmtime(model_path)
    cached = _QA_MODEL_CACHE.get(model_path)
    if cached is None or cached[0] != mtime:
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files that you created or otherwise trust.
        vectorizer, _pairwise_sim, documents = joblib.load(model_path)
        # Vectorize the stored sentences once here instead of on every query.
        doc_tfidf = vectorizer.transform(documents)
        cached = (mtime, vectorizer, documents, doc_tfidf)
        _QA_MODEL_CACHE[model_path] = cached
    return cached[1], cached[2], cached[3]


def generate_answer(text, top_k=1, model_path='qa_model.pkl'):
    """Answer ``text`` with the most similar sentence(s) of the saved model.

    The model file is expected to hold the tuple
    ``(vectorizer, cosine_sim, documents)``. It is loaded, and its documents
    vectorized, on the first call and again only if the file's modification
    time changes; later calls reuse the cached objects.

    Args:
        text: The question / query string.
        top_k: How many of the best-matching sentences to return (default 1).
        model_path: Path of the joblib file (default ``'qa_model.pkl'``).

    Returns:
        str: The ``top_k`` best sentences, most similar first, separated by a
        blank line, with surrounding whitespace stripped.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    vectorizer, documents, doc_tfidf = _load_qa_model(model_path)

    # Transform the given text into a TF-IDF vector using the same vectorizer.
    text_tfidf = vectorizer.transform([text])

    # One similarity score per stored sentence.
    scores = cosine_similarity(text_tfidf, doc_tfidf).flatten()

    # argsort is ascending, so reverse it to get the best matches first.
    best_indices = np.argsort(scores)[::-1][:top_k]

    return '\n\n'.join(documents[idx] for idx in best_indices).strip()

In [None]:
# Q and A: ask the saved model a sample question and show the best match
input_text = "What to do at green light?"
answer = generate_answer(input_text)

# A single print call; sep="\n" reproduces the line break that two separate
# print calls would put between the two messages.
print(f"\nInput Text: {input_text}", f"\nAnswer: {answer}", sep="\n")


Input Text: What to do at green light?

Answer: 6 Section 2: Signals, Signs and Pavement MarkingsGreen light or arrow: At a green light, you may go if the way is clear.
