In [1]:
# import of needed libraries
import fitz # wczytywanie pdf
import pandas as pd 
import re # RegEx
import collections # for Counter
from scipy.spatial import distance # for the similarity measure
import math
import copy
import os

In [3]:
def load_pdfs_from_folder(folder_path):
    """Returns list of pdf ducuments in folder and loads it
    """
    pdf_list = []
    
    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            # Construct the full path to the PDF file
            pdf_path = os.path.join(folder_path, filename)
            
            # Open the PDF file using PyMuPDF's fitz module
            pdf_document = fitz.open(pdf_path)
            
            # Append the PDF document object to the list
            pdf_list.append(pdf_document)
    
    return pdf_list

In [4]:
# uploading of data
folder_path = (r'C:\Users\User\questions_duplicates')
pdf_documents = load_pdfs_from_folder(folder_path)
text = ''
for doc in pdf_documents:
    for page in range(doc.page_count):
        text += doc[page].get_text()

In [None]:
# we only leave questions
text = re.sub(re.escape('Baza pytań do Lekarskiego Egzaminu Końcowego') + r'(.*?)' + '[Nn]r', 'Nr', text, flags=re.DOTALL) 

In [5]:
# creation of a list of questions based on the search 'Nr'
data_list = []
temp = []
question_numbers = text.count('Nr') + text.count('nr') 
pattern = re.compile(r'[Nn]r')
matches = pattern.finditer(text)
for match in matches:
    temp.append(match.end())
    
for i in range(question_numbers-1):
    data_list.append(text[temp[i]+3:temp[i+1]-2])
data_list.append(text[temp[len(temp)-1]+4:None])

In [15]:
# remove subpoints (A-E), end-of-line characters, more than 1 break, and colons, periods etc.
for question_number in range(question_numbers):
    data_list[question_number] = re.sub(r'Pytanie|[A-E]\.|\n|:|\.|;|[0-9]\)|,', '', data_list[question_number])
    data_list[question_number] = data_list[question_number].lower()

In [8]:
# we create a dictionary list with the counted words in each question
data_bag = []
for question in data_list:
    data_bag.append(dict(collections.Counter(question.split())))

In [10]:
def docs_num_contain(term, data):
    """Returns the number of documents containing the term in the dataset data
    """
    temp = 0
    for doc_num in range(len(data)):
        if term in data[doc_num]:
            temp += 1
    return temp

In [11]:
# we create a list of dictionaries with TFIDF values
data = copy.deepcopy(data_bag)
for doc_num in range(question_numbers):
    temp_list = list(data[doc_num])
    temp_n = sum(data[doc_num].values())
    for term in temp_list:
        data[doc_num][term] = data[doc_num][term]/temp_n * math.log10(question_numbers / docs_num_contain(term,data_bag))

In [12]:
def cosine_distance(dict1, dict2):
    """Calculates the cosine distance between any two dictionaries
    """
    keys = set(dict1.keys()) | set(dict2.keys())

    vector1 = [dict1.get(key, 0) for key in keys]
    vector2 = [dict2.get(key, 0) for key in keys]

    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(x ** 2 for x in vector1))
    magnitude2 = math.sqrt(sum(x ** 2 for x in vector2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 1.0 

    distance = 1 - (dot_product / (magnitude1 * magnitude2))
    return distance

In [13]:
# calculates distances between questions from the base
distances = {}

for i in range(question_numbers):
    for j in range(i + 1, question_numbers):
        distances[(i, j)] = cosine_distance(data[i],data[j])

In [14]:
# we display n most similar questions
n = 5
sorted_items = sorted(distances.items(), key=lambda x: x[1], reverse=False)[:n]

for key, value in sorted_items:
    print(key, value)

(83, 86) 0.05456026242709833
(43, 47) 0.22488379121848634
(20, 76) 0.4004181616714617
(43, 45) 0.4352047512139142
(24, 26) 0.45745245104082477
