In [104]:
import io
import requests
import PyPDF2


def split_pdf_pages(pdf_data):
    pages = []
    reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_data))

    for page_number in range(reader.getNumPages()):
        if reader.getPage(page_number):
            pages.append(reader.getPage(page_number))

    return pages

In [105]:
urls = [
    "https://file.notion.so/f/s/70234c16-e61d-48fc-a26e-a2a573afaee1/Woonzorgvisie_Gemeente_Oisterwijk_2022_-_2027_-_02-05-2022.pdf?id=387149cf-e0c7-4f5f-9b76-250ac04d8e9d&table=block&spaceId=ca274338-dfb2-4f65-823d-2acde6195fea&expirationTimestamp=1685090661894&signature=SZ8bKZJmrGVF9wLYb_eZkI7Ozgmssh7TmBFyTzw1T5E&downloadName=Woonzorgvisie+Gemeente+Oisterwijk+2022+-+2027+-+02-05-2022.pdf"]

In [106]:
import urllib.parse

def extract_title(url):
    parsed_url = urllib.parse.urlparse(url)
    query_params = urllib.parse.parse_qs(parsed_url.query)
    download_name = query_params.get("downloadName", [None])[0]
    return download_name

In [107]:
def count_keywords_in_string(string):
    keyword_count = 0
    for keyword in keywords:
        keyword_count += string.lower().count(keyword.lower())
    return keyword_count

In [108]:
anton = []
for url in urls:
    response = requests.get(url)
    pdf_data = response.content
    pdf_pages = split_pdf_pages(pdf_data)
    for page_number, page in enumerate(pdf_pages):
        contents = page.extract_text()
        anton.append(contents)

In [109]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import string

nlp = spacy.load("nl_core_news_sm")
punctuations = string.punctuation


def tokenize_text(text):
    tokens = nlp(str(text))
    keywords = []

    for token in tokens:
        if token.is_punct or token.is_currency:
            continue
        elif token.pos_ in ["NOUN", "PROPN", "ADJ"]:
            keywords.append(token.lemma_)
    return keywords


def preprocess_text(text):
    if isinstance(text, tuple):
        text = ' '.join(str(t) for t in text)
    return text.lower()


vectorizer = TfidfVectorizer(stop_words=nlp.Defaults.stop_words, tokenizer=tokenize_text, preprocessor=preprocess_text)
tfidf_matrix = vectorizer.fit_transform(anton)
feature_names = vectorizer.get_feature_names_out()

# Step 4: Determine the most important keywords from the TF-IDF analysis
num_keywords = 1  # Adjust the number of keywords you want to extract
useful_keywords = []
for i, document in enumerate(anton):
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    top_indices = tfidf_scores.argsort()[-num_keywords:][::-1]
    top_keywords = [feature_names[idx] for idx in top_indices]
    useful_keywords.extend(top_keywords)
    # top_weights = [tfidf_scores[idx] for idx in top_indices]
    #
    # for keyword, weight in zip(top_keywords, top_weights):
    #     print("Keyword:", keyword)
    #     print("Weight:", weight)
    #     print("---")

useful_keywords = list(set(useful_keywords))  # Remove duplicate keywords
keywords = useful_keywords
useful_keywords

['aanpak',
 'urgentie',
 'jaar',
 'landschap',
 'inschrijftijd',
 'matchingpunt',
 'karakter',
 'entameren',
 'omgevingsvisie',
 'centrum',
 'waterhoef',
 'plan',
 'koop',
 'inkomen',
 'vergunninghouders',
 'kerkhof',
 'werking',
 'beperking',
 'mantelzorg',
 'egw',
 'heukelom',
 'betaalbaar',
 'één',
 'lokaal',
 'bruto',
 'begrip',
 'tijdelijk',
 'bijlage',
 'dorp',
 'wijk',
 'standplaat',
 'woningtype',
 'groen',
 'overeenkomst',
 'oisterwijk',
 'eengezinswoning',
 'sociaal',
 'haaren',
 'creatief',
 'integraal',
 'woning',
 'ruimte',
 'titel',
 'visie',
 'oud',
 'regionaal',
 'voorontwerp']

In [None]:
import requests
from io import BytesIO
from PyPDF2 import PdfFileReader
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class PDFPageExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        extracted_pages = []
        for url in X:
            response = requests.get(url)
            pdf_data = response.content
            pdf_pages = self.split_pdf_pages(pdf_data)
            extracted_pages.extend(pdf_pages)
        return extracted_pages

    def split_pdf_pages(self, pdf_data):
        pdf = PdfFileReader(BytesIO(pdf_data))
        return [pdf.getPage(i) for i in range(pdf.getNumPages())]


class PageContentFilter(BaseEstimator, TransformerMixin):
    def __init__(self, keywords, max_keyword_count=20):
        self.keywords = keywords
        self.max_keyword_count = max_keyword_count

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        filtered_pages = []
        for page_number, page in enumerate(X):
            contents = page.extract_text()
            if contents == "" or contents == " ":
                print(f"BLANK {page_number}")
                continue
            if not any(keyword in contents for keyword in self.keywords):
                print(f"Not any keywords found {page_number}")
                continue
            if self.count_keywords_in_string(contents) <= self.max_keyword_count:
                print(f"Not enough keywords found {page_number}")
                continue
            filtered_pages.append((page_number, contents))
        return filtered_pages

    def count_keywords_in_string(self, text):
        count = 0
        for keyword in self.keywords:
            count += text.lower().count(keyword.lower())
        return count


# Create the pipeline
pipeline = Pipeline([
    ('pdf_extractor', PDFPageExtractor()),
    ('page_filter', PageContentFilter(keywords))

])

# Apply the pipeline to the URLs
filtered_pages2 = pipeline.transform(urls)

#Print the filtered pages
# for page_number, contents in filtered_pages2:
#     print("Page number:", page_number)

Not enough keywords found 0
Not enough keywords found 1
Not enough keywords found 2
Not enough keywords found 4
Not enough keywords found 9
