# 1. Import Necessary Libraries

Make sure Microsoft Visual C++ is installed on your PC.

Extracting text from pdf and converting to csv

In [None]:
"""
# PDF Text Extraction and Topic Modeling

This program extracts text from PDFs, cleans it, and performs topic modeling using BERTopic.

Process steps
1. Import necessary libraries
2. Extract text from either a single or multiple research study PDFs. Path is 'studies/papers'
3. Clean it using spaCy with language support for English, Dutch and French.
4. Perform topic modeling using BERTopic
5. Visualize the topics



## Requirements
python -m spacy download fr_core_news_lg
python -m spacy download nl_core_news_lg
python -m spacy download en_core_web_lg
Use Python version 3.12.0; otherwise the code will not work due to spaCy compatibility issues.
"""

In [None]:
import csv
import logging
import os
import re
import json
from collections import Counter
from typing import List, Dict, Any

import bertopic  # For topic modeling
import pandas as pd # For data manipulation
import fitz # For PDF text extraction
import spacy  # For Dutch and French tokenization
from langdetect import detect, LangDetectException  # For language detection
from tqdm import tqdm  # For progress tracking
from wordsegment import load, segment  # For word segmentation
from spacy.lang.en.stop_words import STOP_WORDS as EN_STOP_WORDS
from spacy.lang.nl.stop_words import STOP_WORDS as NL_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FR_STOP_WORDS

# 2. Extract text from either a single or multiple research study PDFs. Path is 'studies/papers'


In [None]:
# Load configuration from a JSON file.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# All three keys are required; a missing key raises KeyError right here.
PDF_DIRECTORY = config["pdf_directory"]  # Folder containing PDFs
CLEANED_CSV = config["cleaned_csv"]  # Cleaned CSV path
GDPR_CSV = config["gdpr_csv"]  # GDPR CSV path

# Configure logging: INFO and above, prefixed with timestamp and level
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# spaCy pipelines keyed by language code. Each pipeline is loaded exactly once
# here, so the later cells can reuse it instead of re-initializing it.
NLP_MODELS = {
    language_code: spacy.load(model_name)
    for language_code, model_name in (
        ('en', "en_core_web_lg"),
        ('nl', "nl_core_news_lg"),
        ('fr', "fr_core_news_lg"),
    )
}

# Ligature replacement dictionary.
# Maps a single-glyph typographic ligature to the plain letters it stands for.
# replace_ligatures() applies every entry with str.replace.
LIGATURES = {
    'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff'
}

# Common word split fixes.
# Maps a word broken by stray spaces to its repaired spelling.
# fix_common_word_splits() applies the entries in this order with str.replace,
# so the match is case-sensitive and not limited to whole words.
COMMON_FIXES = {
    'signi ficant': 'significant', 'di fferent': 'different',
    'e ffective': 'effective', 'e ffect': 'effect',
    'chil dren': 'children', 'e ff ective': 'effective',
    'con fi dence': 'confidence'
}

# Unwanted keywords for filtering.
# contains_doi_or_https() skips any line whose lower-cased text contains one of
# these fragments, so every entry must itself be lower-case.
# NOTE(review): broad entries such as 'article', 'received' or 'volume' also drop
# ordinary body lines that happen to contain them - confirm this is intended.
UNWANTED_KEYWORDS = {
    'doi', 'https', 'http', 'journal', 'university', 'copyrighted',
    'taylor & francis', 'elsevier', 'published by', 'received',
    'revised', 'author(s)', 'source:', 'history:', 'keywords',
    'volume', 'downloaded', 'article', 'creative commons use',
    'authors', 'all rights reserved'
}

# Reference and Acknowledgement markers.
# Lower-case markers that is_reference_or_acknowledgements_section() compares with
# the lower-cased line; process_pdf() stops reading a PDF at the first line that
# is recognised as such a marker.
# NOTE(review): 'method' / 'methods' are listed too, so extraction also stops at a
# methods section - confirm that this is the intended cut-off point.
REFERENCE_MARKERS = {'references', 'bibliography', 'acknowledgements', 'method', 'methods'}

def validate_pdf_path(pdf_path: str) -> bool:
    """Return True when *pdf_path* exists and the current process may read it.

    An inaccessible path is reported through ``logging.error`` and yields False.
    """
    accessible = os.path.exists(pdf_path) and os.access(pdf_path, os.R_OK)
    if accessible:
        return True

    logging.error(f"PDF file is not accessible: {pdf_path}")
    return False

def is_heading(line: str) -> bool:
    """Return True when *line* looks like a heading.

    A line counts as a heading if it begins with the literal 'CHAPTER' or if all
    of its cased characters are upper-case (``str.isupper``).
    """
    if line.startswith('CHAPTER'):
        return True
    return line.isupper()

def is_footnote(line: str) -> bool:
    """Return True when *line* starts like a footnote, note or table caption.

    Recognised openings: a bracketed number ("[12]"), a number followed by a dot
    ("3."), an asterisk, or the words 'Note' / 'Table'.
    """
    numbered_openings = (r'^\[\d+\]', r'^\d+\.')
    if any(re.match(pattern, line) for pattern in numbered_openings):
        return True
    return line.startswith(('*', 'Note', 'Table'))

def contains_doi_or_https(line: str) -> bool:
    """Return True when the lower-cased *line* contains any UNWANTED_KEYWORDS entry."""
    lowered = line.lower()
    for keyword in UNWANTED_KEYWORDS:
        if keyword in lowered:
            return True
    return False

def is_reference_or_acknowledgements_section(line: str) -> bool:
    """Return True when *line* looks like a section heading named in REFERENCE_MARKERS.

    The caller stops reading the whole PDF at the first line for which this
    returns True, so a false positive silently discards the rest of the document.
    A plain substring test fires on any body sentence that merely mentions
    "method" or "references". The check is therefore limited to heading-like
    lines: at most five whitespace-separated words, with the marker present as a
    whole word (so "methodology" no longer counts as "method").

    Every line accepted here was also accepted by a plain substring test; the
    change only removes matches.
    """
    heading = line.strip().lower()
    # Headings are short; longer lines are treated as body text.
    if not heading or len(heading.split()) > 5:
        return False
    return any(
        re.search(rf'(?<!\w){re.escape(marker)}(?!\w)', heading)
        for marker in REFERENCE_MARKERS
    )

def replace_ligatures(text: str) -> str:
    """Return *text* with every LIGATURES key replaced by its plain-letter value."""
    plain = text
    for ligature in LIGATURES:
        plain = plain.replace(ligature, LIGATURES[ligature])
    return plain

def fix_common_word_splits(text: str) -> str:
    """Return *text* with each broken spelling in COMMON_FIXES replaced by its repair.

    The replacements are applied one after another, in the dictionary's order.
    """
    repaired = text
    for broken_spelling in COMMON_FIXES:
        repaired = repaired.replace(broken_spelling, COMMON_FIXES[broken_spelling])
    return repaired

def detect_language(text: str) -> str:
    """Return the language code langdetect reports for *text*.

    Falls back to the string "unknown" when langdetect raises
    LangDetectException.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = "unknown"
    return language

def _append_paragraph(data: list, filename: str, page_number: int, lines: list) -> None:
    """Join *lines* into one paragraph and append a row to *data* if it has 10+ words."""
    full_text = " ".join(lines).strip()
    if len(full_text.split()) >= 10:
        data.append([filename, page_number, full_text, detect_language(full_text)])


def process_pdf(file_path: str, filename: str) -> list:
    """Extract paragraph rows from one PDF.

    Each text block of each page is read line by line. Lines are joined into a
    paragraph while their left x-coordinate stays within 10 units of the previous
    line; a larger jump starts a new paragraph. Headings, footnote-like lines,
    lines containing unwanted keywords and lines equal to the file title are
    skipped. Reading stops for the whole document at the first line recognised by
    is_reference_or_acknowledgements_section().

    Args:
        file_path: Path of the PDF to open.
        filename: File name; stored in every row and, without its extension,
            used as the document title to filter out.

    Returns:
        A list of ``[filename, page_number (1-based), paragraph_text, language]``
        rows. Only paragraphs of at least 10 words are kept. If an exception
        occurs, it is logged and the rows collected so far are returned.
    """
    data = []
    title = os.path.splitext(filename)[0].lower()  # Filename without extension

    try:
        with fitz.open(file_path) as pdf_document:
            section_reached = False  # Set once the references/marker line is seen

            for page_num in range(pdf_document.page_count):
                if section_reached:
                    break  # Nothing after the marker is read

                page = pdf_document.load_page(page_num)
                text_dict = page.get_text("dict")  # Extract text while keeping layout

                for block in text_dict["blocks"]:
                    # Also stop iterating blocks: otherwise the blocks that follow
                    # the marker on the same page would still be read and could
                    # be emitted by the mid-block paragraph flush below.
                    if section_reached:
                        break

                    if block["type"] != 0:  # Ignore non-text blocks
                        continue

                    paragraph = []
                    prev_x = None  # x-coordinate of the previous kept line

                    for line in block["lines"]:
                        spans = line["spans"]
                        if not spans:
                            # A line without spans has no text and no bbox to read;
                            # skipping it avoids an IndexError that would abort
                            # the rest of the file.
                            continue

                        line_text = " ".join(span["text"].replace(';', ',') for span in spans)
                        line_text = replace_ligatures(line_text)
                        line_text = fix_common_word_splits(line_text)

                        if is_reference_or_acknowledgements_section(line_text):
                            section_reached = True
                            break  # Stop further processing

                        if (is_heading(line_text) or is_footnote(line_text) or
                                contains_doi_or_https(line_text) or
                                line_text.strip().lower() == title):
                            continue

                        first_word_x = spans[0]["bbox"][0]

                        # A horizontal jump of 10 or more closes the current paragraph.
                        if prev_x is not None and abs(first_word_x - prev_x) >= 10:
                            _append_paragraph(data, filename, page_num + 1, paragraph)
                            paragraph = []

                        paragraph.append(line_text)
                        prev_x = first_word_x  # Update for next iteration

                    # The paragraph in progress when the marker was hit is dropped,
                    # as before; only blocks that ended normally are flushed.
                    if paragraph and not section_reached:
                        _append_paragraph(data, filename, page_num + 1, paragraph)

    except Exception as e:
        # Per-file boundary: one unreadable PDF must not stop the whole batch.
        logging.error(f"Failed to process {file_path}: {e}")

    return data


all_data = []

# sorted(): os.listdir returns names in arbitrary order, which would make the
# row order of the CSV differ between runs. The suffix test is case-insensitive
# so files named "*.PDF" are picked up as well.
pdf_files = sorted(f for f in os.listdir(PDF_DIRECTORY) if f.lower().endswith('.pdf'))

for filename in tqdm(pdf_files, desc="Processing PDFs"):
    file_path = os.path.join(PDF_DIRECTORY, filename)
    if validate_pdf_path(file_path):
        all_data.extend(process_pdf(file_path, filename))

if all_data:
    df_extracted = pd.DataFrame(all_data, columns=["File", "Page", "Text", "Language"])
    df_extracted.to_csv(CLEANED_CSV, index=False)
    logging.info(f"Data successfully exported to {CLEANED_CSV}")
else:
    # Make the "nothing written" case visible instead of failing later on a
    # missing or stale CSV.
    logging.warning(f"No text extracted from {PDF_DIRECTORY}; {CLEANED_CSV} was not written")


In [None]:
# Read the extracted paragraphs back from CLEANED_CSV and preview the first 30 rows
df_cleaned = pd.read_csv(filepath_or_buffer=CLEANED_CSV)
df_cleaned.head(n=30)

# 3. Clean it using spaCy with language support for english, dutch and french.
We clean up the text
- Remove the name of city, country, geography for better outcome
- Remove special characters (only letters)
- Convert to lower case
- Remove stop words
- Remove short words of three letters or fewer ('a', 'I', 'at', 'the', ...)
- Remove very short sentences
- Remove urls 
- Use lemmatization (reduce each word to its dictionary form; the code uses spaCy lemmas rather than stemming)
- remove duplicate sentences

In [None]:

# Define entity types to remove (Personal Information).
# PERSON / GPE / ORG are the labels the code originally targeted. PER and LOC are
# added because, per the spaCy model documentation, the French news pipeline tags
# people as PER and places as LOC, so French names were never redacted before.
# NOTE(review): LOC also redacts non-political locations in the other languages.
PERSONAL_ENTITIES = {"PERSON", "PER", "EMAIL", "PHONE", "GPE", "LOC", "ORG"}

def remove_personal_info(text: str, lang: str) -> str:
    """Replace personal information in *text* with the placeholder "[REDACTED]".

    Two passes are applied:

    1. If a spaCy pipeline exists for *lang* in NLP_MODELS, every entity whose
       label is in PERSONAL_ENTITIES is redacted wherever its text occurs as a
       whole word. Longer entity texts are handled first so that "John Smith" is
       replaced before "John".
    2. Email addresses and phone numbers are redacted by regex. This pass runs
       for every language, including unsupported ones, because it does not
       depend on a model.

    Args:
        text: The text to anonymize.
        lang: Language code used to pick the spaCy pipeline.

    Returns:
        The text with the detected items replaced by "[REDACTED]".
    """
    nlp = NLP_MODELS.get(lang)
    if nlp is not None:
        doc = nlp(text)
        entity_texts = {
            ent.text for ent in doc.ents
            if ent.label_ in PERSONAL_ENTITIES and ent.text.strip()
        }
        # Whole-word matching: a plain str.replace would also rewrite unrelated
        # words that merely contain the entity text as a substring.
        for entity_text in sorted(entity_texts, key=lambda value: (-len(value), value)):
            pattern = r"(?<!\w)" + re.escape(entity_text) + r"(?!\w)"
            text = re.sub(pattern, "[REDACTED]", text)

    # Remove emails and phone numbers using regex (fallback).
    # The top-level-domain class is [A-Za-z]; the earlier [A-Z|a-z] also accepted
    # a literal '|'.
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[REDACTED]", text)  # Email
    # Matches only 3-3-4 digit groupings (optionally separated by '-', '.' or space).
    text = re.sub(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b", "[REDACTED]", text)  # Phone Numbers

    return text

def anonymize_csv(input_csv: str, output_csv: str):
    """Anonymize the "Text" column of *input_csv* and write the result to *output_csv*.

    Every row's text is passed to remove_personal_info() together with the row's
    "Language" value. Any exception is logged and swallowed, in which case no
    output file is written by this call.
    """
    def _anonymize_row(row):
        # str() guards against non-string cells before the text is processed
        return remove_personal_info(str(row["Text"]), row["Language"])

    try:
        frame = pd.read_csv(input_csv)
        frame["Text"] = frame.apply(_anonymize_row, axis=1)
        frame.to_csv(output_csv, index=False)
        logging.info(f"Anonymized data saved to {output_csv}")
    except Exception as exc:
        logging.error(f"Error processing file: {exc}")

# Anonymize the extracted paragraphs: read CLEANED_CSV, write GDPR_CSV
anonymize_csv(input_csv=CLEANED_CSV, output_csv=GDPR_CSV)


## Using SpaCy NER to make the Model GDPR Compliant
- Remove any sensitive data and patterns. The GDPR-compliant data will be saved in a new CSV file.

In [None]:
# Language code -> (spaCy pipeline, stop-word set) for every supported language
LANGUAGE_MODELS = {
    language_code: (NLP_MODELS[language_code], stop_words)
    for language_code, stop_words in (
        ('en', EN_STOP_WORDS),
        ('nl', NL_STOP_WORDS),
        ('fr', FR_STOP_WORDS),
    )
}

# Normalise one text with the spaCy pipeline of its language
def clean_text(text, language):
    """Return *text* reduced to lower-cased lemmas joined by single spaces.

    Tokens flagged by spaCy as stop words, punctuation or whitespace are dropped,
    as are tokens of three characters or fewer. When *language* has no entry in
    LANGUAGE_MODELS the text is returned unchanged.

    The stop-word set stored in LANGUAGE_MODELS is not consulted here; filtering
    relies on the ``token.is_stop`` flag.
    """
    entry = LANGUAGE_MODELS.get(language)
    if entry is None:
        return text  # Unsupported language: hand the text back as-is

    nlp, _ = entry

    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token.text) <= 3:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return " ".join(kept_lemmas)

# Load the anonymized CSV file (GDPR_CSV, written by anonymize_csv). Reading
# CLEANED_CSV here would bypass the anonymization step and clean the raw text.
df_gdpr = pd.read_csv(GDPR_CSV)

# Clean the text in the DataFrame.
# The "[REDACTED]" placeholder is blanked first so that it does not survive
# cleaning as a spurious "redacted" token. str() guards against non-string cells.
df_gdpr["Cleaned_Text"] = df_gdpr.apply(
    lambda row: clean_text(str(row["Text"]).replace("[REDACTED]", " "), row["Language"]),
    axis=1,
)

# Drop the original "Text" column
df_gdpr.drop(columns=["Text"], inplace=True)

# Save the cleaned DataFrame; this overwrites the extraction output in CLEANED_CSV
df_gdpr.to_csv(CLEANED_CSV, index=False)
print(f"Cleaned data saved to {CLEANED_CSV}")
df_gdpr.head(30)

# 4. Initialize and Fit BERTopic
The good thing with BERTopic is that it does most of the work automatically (meaning I do not need to bore you to death with details about how it works behind the scenes).

We need to do 3 things
1. Initialize BERTopic model
2. 'Fit' the model -> this means: run the model, as you would run a simple linear regression
3. Look at the topics via 

To get started, let's just use the default settings.

# 5. Visualize Topics
- Visualize Topic Hierarchy
- Visualize documents
- Visualize topics full article