# PDFix Evaluation

### Import libraries

In [None]:
import csv
import re
import html

import pandas as pd
import numpy as np

from tqdm import tqdm
from random import randrange
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize


### Load WOO dataframe

In [None]:
# Parameters
# Location of the WOO dossier export that the notebook reads.
woo_dossier_path = "../woo_dossiers.csv"

# Pandas display settings: no limit on printed columns, at most 20 printed rows.
for option, value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
):
    pd.set_option(option, value)

In [None]:
# Read the complete dossier table from disk.
woo = pd.read_csv(woo_dossier_path)
print("Dataframed loaded...")

# Keep only the rows whose dc_type equals '2e-b'.
is_selected_type = woo["dc_type"] == '2e-b'
beslis = woo[is_selected_type]

# Last expression of the cell: display the filtered frame.
beslis

### (Save) / Open row numbers

In [None]:
# Draw up to 25 distinct random row positions from `beslis`.
# random.sample draws without replacement, so no retry loop is needed; capping
# the sample size at len(beslis) keeps this from looping forever (or raising)
# when fewer than 25 rows are available.
from random import sample

sample_size = min(25, len(beslis))
rows = sample(range(len(beslis)), sample_size)

# Save the drawn positions to file as a single CSV record.
# newline='' is what the csv module expects for files it writes to, so that
# no extra blank lines appear on platforms that translate line endings.
with open('rows.csv', 'w', newline='') as file:
    csv.writer(file).writerow(rows)

# Read the positions back from file; the first (and only) record holds them.
with open("rows.csv", "r", newline='') as file:
    rows = [int(value) for value in list(csv.reader(file, delimiter=","))[0]]

print("len =", len(rows),": ", rows)

### Add data to dataframe

In [None]:
# Metadata columns copied from the dossier table into the evaluation frame.
columns = ["dc_identifier", "dc_title", "dc_description", "dc_publisher_name", "dc_source", "foi_publishedDate"]

# Start from an empty frame that has exactly those columns.
df = pd.DataFrame(columns=columns)

# Copy every sampled row (looked up by position in `beslis`) into df,
# labelling the copies 0, 1, 2, ... in sampling order.
for position, row_number in enumerate(rows):
    df.loc[position] = beslis.iloc[row_number][columns]

df.head(5)

### Download files to folder

In [None]:
def download_pdf(woo, row_numbers, row_index, folder_path):
    """Download the document belonging to one sampled row.

    The document path is sliced out of the row's ``dc_source`` value and
    appended to the open.overheid.nl documents base URL. The ``/pdf``
    endpoint is tried first; if that download fails, ``/file`` is tried.

    Args:
        woo: DataFrame with a ``dc_source`` column, indexed by position.
        row_numbers: Sequence of positional row indices into ``woo``.
        row_index: Position in ``row_numbers`` of the row to download; also
            used to name the output file ``pdf{row_index}.pdf``.
        folder_path: Directory the file is written to. Created when missing.

    Returns:
        A ``(URL, file_path)`` tuple: the URL that was actually downloaded
        and the local path of the stored file.

    Raises:
        OSError: When the fallback download fails as well (this includes
            ``urllib.error.URLError`` and ``urllib.error.HTTPError``).
    """
    import os

    baseURL = "https://open.overheid.nl/documenten"
    source = woo.iloc[row_numbers[row_index]]['dc_source']
    # NOTE(review): the fixed slice assumes dc_source always has a 32-character
    # prefix and a 2-character suffix around the document path -- confirm
    # against the data.
    doi = source[32:-2]
    suffixURL = "/pdf"
    URL = baseURL + doi + suffixURL

    print(source)
    print(doi)
    print(URL)

    filename = f"pdf{row_index}.pdf"
    file_path = folder_path + "/" + filename

    # urlretrieve cannot write into a directory that does not exist yet.
    os.makedirs(folder_path, exist_ok=True)

    try:
        urlretrieve(URL, file_path)
    except OSError:
        # URLError/HTTPError are OSError subclasses. Catching OSError instead
        # of using a bare except keeps the fallback for download failures but
        # no longer swallows KeyboardInterrupt or programming errors.
        suffixURL = "/file"
        URL = baseURL + doi + suffixURL
        urlretrieve(URL, file_path)

    print(f"File downloaded to {file_path}")
    return URL, file_path

In [None]:
# Download the document for every sampled row and record both the URL it was
# fetched from and the local path it was stored at.
for position in range(len(rows)):
    fetched = download_pdf(beslis, rows, position, "pdfs")

    df.loc[position, "download_url"] = fetched[0]
    df.loc[position, "file_path"] = fetched[1]
    

### Save dataframe to csv

In [None]:
# Write the dataframe to df.csv; index=False leaves the row index out of the file.
df.to_csv("df.csv", index=False)

### Open dataframe from csv

In [None]:
# Replace the in-memory dataframe with the contents of df.csv.
df = pd.read_csv("df.csv")
# Last expression of the cell: display the reloaded frame.
df

### Add pdftotext to dataframe

In [None]:
# Extract the plain text of every downloaded PDF with the `pdftotext`
# command-line tool and store the text and its length in the dataframe.
import os
import subprocess

for x in range(len(rows)):
    file_path = f"pdfs/pdf{x}.pdf"
    txt_path = f"pdfs/pdf{x}.txt"

    # Arguments are passed as a list (no shell involved). check=True raises
    # when the conversion fails instead of silently continuing. The output
    # encoding is requested explicitly so it matches the decoding below.
    subprocess.run(["pdftotext", "-enc", "UTF-8", file_path, txt_path], check=True)

    # Decode with the same encoding that was requested above rather than the
    # platform's locale encoding; the context manager closes the file even
    # when reading fails.
    with open(txt_path, encoding="utf-8") as file:
        text = file.read()

    # The text file is only an intermediate artefact.
    os.remove(txt_path)

    df.loc[x, "pdftotext"] = text
    df.loc[x, "nCharacters"] = len(text)

In [None]:
from accessibleHTML import set_metadata, init_analyzer, build_html

In [None]:
# Convert every downloaded PDF to accessible HTML and store the markup in the
# dataframe. Every row is processed: the following cells read
# "accessible_html" for each row, so a skipped row would be left as NaN there.
for x in tqdm(range(len(rows))):
    file_path = f"pdfs/pdf{x}.pdf"
    metadata = set_metadata(df.iloc[x], file_path)
    document = init_analyzer(file_path)
    # Named `html_markup` rather than `html` so that the `html` module
    # imported at the top of the notebook is not shadowed.
    html_markup = build_html(
        doc=document,
        metadata=metadata
    )
    df.loc[x, "accessible_html"] = html_markup

df

In [None]:
# Strip the markup from each generated HTML document and keep only its text.
for position in range(len(rows)):
    markup = df.loc[position, "accessible_html"]
    parsed = BeautifulSoup(markup, 'html.parser')
    df.loc[position, "html_text"] = parsed.get_text()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

for position in range(len(rows)):
    # The two texts to compare: the pdftotext extraction and the text taken
    # from the generated HTML.
    documents = [
        df.loc[position, "pdftotext"],
        df.loc[position, "html_text"],
    ]

    # Turn both texts into rows of a token-count matrix.
    counts = CountVectorizer().fit_transform(documents)

    # Cosine similarity between the two count vectors.
    df.loc[position, "cosine_sim"] = cosine_similarity(counts[0], counts[1])[0][0]

    # Jaccard similarity works on binary presence/absence vectors.
    presence = (counts > 0).astype(int)
    df.loc[position, "jaccard_sim"] = jaccard_score(
        presence[0].toarray(),
        presence[1].toarray(),
        average='samples',
    )

df

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

for position in range(len(rows)):
    # The two texts to compare: the pdftotext extraction and the text taken
    # from the generated HTML.
    documents = [
        df.loc[position, "pdftotext"],
        df.loc[position, "html_text"],
    ]

    # Count every 1-, 2- and 3-gram of both texts in one matrix.
    ngram_counts = CountVectorizer(ngram_range=(1, 3)).fit_transform(documents)

    # Cosine similarity between the two n-gram count vectors.
    df.loc[position, "3gram_cosine"] = cosine_similarity(ngram_counts[0], ngram_counts[1])[0][0]

    # Jaccard similarity works on binary presence/absence vectors.
    ngram_presence = (ngram_counts > 0).astype(int)
    df.loc[position, "3gram_jaccard"] = jaccard_score(
        ngram_presence[0].toarray(),
        ngram_presence[1].toarray(),
        average='samples',
    )

df.head(25)

In [None]:
# Report the mean of every similarity metric, followed by the standard
# deviation of every similarity metric.
metrics = [
    ("Cosine similarity", "cosine_sim"),
    ("Jaccard similarity", "jaccard_sim"),
    ("3gram cosine similarity", "3gram_cosine"),
    ("3gram jaccard similarity", "3gram_jaccard"),
]

for label, column in metrics:
    print(label + ": " + str(df[column].mean()))

for label, column in metrics:
    print(label + " STD: " + str(df[column].std()))