# 1. Import Necessary Libraries

Make sure Microsoft Visual C++ is installed on your PC.

Extracting text from pdf and converting to csv

In [None]:
"""
# PDF Text Extraction and Topic Modeling

This program extracts text from PDFs, cleans it, and performs topic modeling using BERTopic.

Process steps
1. Extract text from either a single or multiple research study PDFs. The folder is set by PDF_DIRECTORY ('papers').
2. Clean it using spaCy with language support for English, Dutch and French.
3. The cleaned data will be saved to the path in OUTPUT_CSV ('csv/cleanedpdfsV5.csv').
4. Take the data from the CSV and put it into a dataframe.
5. Remove any sensitive data and patterns
6. Initialize and fit BERTopic
7. Visualize Topics
8. Visualize Topic Hierarchy
9. Visualize documents
10. Visualize topics full article


## Requirements
python -m spacy download en_core_web_lg
python -m spacy download nl_core_news_lg
python -m spacy download fr_core_news_lg
python -m spacy download en_core_web_sm
Use the following Python version 3.12.0 otherwise the code will not work due to spacy.
"""

'\n# PDF Text Extraction and Topic Modeling\n\nThis program extracts text from PDFs, cleans it, and performs topic modeling using BERTopic.\n\n## Dependencies\n- pdfplumber\n- pandas\n- nltk\n- spacy\n- bertopic\n- langdetect\n- wordsegment\n\n## Usage\n1. Place your PDFs in the `papers` folder.\n2. Run the script to extract, clean, and analyze the text.\n3. The cleaned data will be saved to `csv/all_cleaned_pdf.csv`.\n4. Topic modeling results will be visualized using BERTopic.\n\n\npip install pdfplumber bertopic nltk pandas spacy numpy matplotlib wordcloud gensim scikit-learn scipy tqdm wordsegment langdetect\npython -m spacy download fr_core_web_sm\npython -m spacy download nl_core_web_md\npython -m spacy download en_core_web_md\nUse the following Python version 3.12.0 otherwise the code will not work due to spacy.\n'

In [None]:
import csv
import logging
import os
import re
import unicodedata  # For Unicode normalization and diacritic removal
from collections import Counter
from multiprocessing import Lock
from typing import List, Dict, Any

import bertopic  # For topic modeling
import fitz # For PDF text extraction
import pandas as pd # For data manipulation
import spacy  # For Dutch and French tokenization
from langdetect import detect, LangDetectException  # For language detection
from tqdm import tqdm  # For progress tracking
from wordsegment import load, segment  # For word segmentation

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Initialise wordsegment's data so that segment() can be used later on
load()

# Log INFO and above as "<timestamp> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Input folder holding the PDFs and the CSV file the sentences are written to
PDF_DIRECTORY = "papers"
OUTPUT_CSV = "csv/cleanedpdfsV5.csv"

# Stop right away when the input folder is missing
if not os.path.exists(PDF_DIRECTORY):
    logging.error(f"PDF directory does not exist: {PDF_DIRECTORY}")
    exit(1)

# One spaCy pipeline per supported language code, loaded once up front
NLP_MODELS = {
    language_code: spacy.load(model_name)
    for language_code, model_name in (
        ('en', "en_core_web_lg"),
        ('nl', "nl_core_news_lg"),
        ('fr', "fr_core_news_lg"),
    )
}

def validate_pdf_path(pdf_path: str) -> bool:
    """Return True when *pdf_path* exists and the current process may read it.

    An error is logged and False is returned otherwise.
    """
    is_readable = os.path.exists(pdf_path) and os.access(pdf_path, os.R_OK)
    if is_readable:
        return True

    logging.error(f"PDF file is not accessible: {pdf_path}")
    return False

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of *pdf_path*, joined by single spaces.

    The document is opened with PyMuPDF (``fitz``). Any exception raised while
    opening or reading it is logged and an empty string is returned instead.
    """
    try:
        with fitz.open(pdf_path) as document:
            page_texts = [page.get_text() for page in document]
            combined = " ".join(page_texts)
        return combined.strip()
    except Exception as error:
        logging.error(f"Error extracting text from {pdf_path}: {error}")
        return ""

def detect_language(text: str) -> str:
    """Return the language code langdetect reports for *text*.

    Codes without an entry in ``NLP_MODELS`` are mapped to ``'en'``. When
    langdetect raises ``LangDetectException`` the failure is logged and
    ``'en'`` is returned.
    """
    try:
        detected = detect(text)
    except LangDetectException:
        logging.error("Language detection failed. Defaulting to 'en'.")
        return 'en'

    if detected in NLP_MODELS:
        return detected
    return 'en'

def clean_text(text: str, language: str) -> str:
    """Lemmatize *text* and drop stop words and punctuation.

    The spaCy pipeline stored under *language* in ``NLP_MODELS`` is used;
    unknown language codes fall back to the English pipeline. The result is
    the lowercased lemmas of the remaining tokens joined by single spaces.
    """
    pipeline = NLP_MODELS.get(language, NLP_MODELS['en'])

    kept_lemmas = []
    for token in pipeline(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return " ".join(kept_lemmas)

def split_long_words(text: str) -> str:
    """Break up words longer than 10 characters with wordsegment.

    *text* is split on whitespace. Each word of more than 10 characters is
    replaced by the space-joined output of ``segment``; shorter words are kept
    unchanged. The pieces are re-joined with single spaces.
    """
    pieces = []
    for word in text.split():
        if len(word) > 10:
            pieces.append(" ".join(segment(word)))
        else:
            pieces.append(word)
    return " ".join(pieces)

def _page_lines(page_text: str) -> List[str]:
    """Split a page's text into lines, dropping blank lines at both ends."""
    lines = page_text.split("\n")
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1
    return lines[start:end]


def extract_clean_sentences(pdf_path: str) -> List[Dict[str, Any]]:
    """Extract sentence-level records from a PDF using PyMuPDF.

    Each page is read once. The first and last non-blank line of every page
    with more than two lines are tallied; a line is treated as a running
    header/footer, and stripped, only when it occupies that position on at
    least two pages, so unique first/last lines of real content are kept.
    Runs of whitespace are collapsed, "Page <n>" markers are removed, the text
    is split on '.', and long words in each piece are segmented with
    ``split_long_words``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One dict per non-empty sentence with the keys ``"filename"`` (base
        name of *pdf_path*), ``"Page"`` (1-based page number, int) and
        ``"sentence"`` (cleaned text).

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        file; nothing is caught here.
    """
    min_repeats = 2  # a header/footer must recur to count as one
    filename = os.path.basename(pdf_path)

    # Read every page exactly once; the lines feed both passes below.
    # Blank edge lines are dropped because the text of a page typically ends
    # with a newline, which would otherwise make "" the last line of every
    # page and hide the real footer.
    with fitz.open(pdf_path) as doc:
        pages = [_page_lines(page.get_text()) for page in doc]

    headers, footers = Counter(), Counter()
    for lines in pages:
        if len(lines) > 2:
            headers[lines[0]] += 1
            footers[lines[-1]] += 1

    common_header, header_hits = headers.most_common(1)[0] if headers else ("", 0)
    common_footer, footer_hits = footers.most_common(1)[0] if footers else ("", 0)
    strip_header = header_hits >= min_repeats
    strip_footer = footer_hits >= min_repeats

    text_data = []
    for page_num, lines in enumerate(pages, start=1):
        if len(lines) > 2:
            if strip_header and lines[0] == common_header:
                lines = lines[1:]
            if strip_footer and lines[-1] == common_footer:
                lines = lines[:-1]

        cleaned_text = re.sub(r"\s{2,}", " ", " ".join(lines))
        cleaned_text = re.sub(r"Page \d+", "", cleaned_text)

        for sentence in cleaned_text.strip().split('.'):
            cleaned_sentence = split_long_words(sentence.strip())
            if cleaned_sentence:
                text_data.append({
                    "filename": filename,
                    "Page": page_num,
                    "sentence": cleaned_sentence,
                })

    return text_data

def process_pdfs(pdf_paths: List[str]) -> List[Dict[str, Any]]:
    """Process multiple PDFs sequentially and collect their sentence records.

    Paths that fail ``validate_pdf_path`` are skipped. A PDF whose extraction
    raises is logged and skipped as well, so one unreadable file does not
    discard the records already gathered from the others.

    Args:
        pdf_paths: Paths of the PDFs to process, in the order to process them.

    Returns:
        The concatenated records returned by ``extract_clean_sentences`` for
        every PDF that could be processed.
    """
    results = []
    for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
        if not validate_pdf_path(pdf_path):
            continue
        try:
            records = extract_clean_sentences(pdf_path)
        except Exception as e:
            # Batch boundary: report the failure and carry on with the rest
            logging.error(f"Error processing {pdf_path}: {e}")
        else:
            results.extend(records)
    return results

def write_to_csv(data: List[Dict[str, Any]], output_csv: str) -> None:
    """Write sentence records to a CSV file.

    Args:
        data: Records with the keys ``"filename"``, ``"Page"`` and
            ``"sentence"``.
        output_csv: Destination path. Missing parent directories are created;
            a bare file name is written to the current working directory.
    """
    # os.makedirs("") raises, so only create a directory when the path has one
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["filename", "Page", "sentence"])
        writer.writeheader()
        writer.writerows(data)
    logging.info(f"Saved cleaned data to {output_csv}")

# Collect the PDF paths. The extension check is case-insensitive so files
# named "*.PDF" are included, and the list is sorted because os.listdir
# returns entries in arbitrary order (sorting keeps the CSV row order stable).
pdf_paths = sorted(
    os.path.join(PDF_DIRECTORY, file)
    for file in os.listdir(PDF_DIRECTORY)
    if file.lower().endswith(".pdf")
)

# Process PDFs and extract text
cleaned_texts = process_pdfs(pdf_paths)

# Write the records to CSV, then read the file back as a round-trip check
# and preview the first 30 rows
if cleaned_texts:
    write_to_csv(cleaned_texts, OUTPUT_CSV)
    df = pd.read_csv(OUTPUT_CSV)
    print(df.head(30))
else:
    logging.error("No cleaned text data to save.")

2025-02-24 01:58:47,364 - ERROR - The folder \papers does not exist.
2025-02-24 01:58:47,364 - ERROR - CSV file csv\all_cleaned_pdf.csv does not exist.


# 2.  Load Your Data

Load the articles from your CSV file using pandas. 

In [None]:
# Load the sentence records stored at OUTPUT_CSV into a DataFrame
df = pd.read_csv(OUTPUT_CSV)
# Preview the first 30 rows
df.head(30)

FileNotFoundError: [Errno 2] No such file or directory: 'csv/all_cleaned_pdf.csv'

# 3. Prepare Your Text Data
We clean up the text:
- Remove names of cities, countries and other geographic entities for a better outcome
- Remove special characters (keep only letters)
- Convert to lower case
- Remove stop words
- Remove words shorter than 4 letters ('a', 'I', 'at', 'the', ...)
- Remove very short sentences
- Remove URLs
- Use lemmatization
- Remove duplicate sentences

In [None]:
# Precompiled regex patterns for faster matching.
# Each value is either one compiled pattern or a list of compiled patterns.
PATTERNS = {
    # The top-level-domain class is letters only; a '|' inside [...] is a
    # literal pipe character, not an alternation.
    "EMAIL": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
    "URL": re.compile(r'\bhttps?://\S+\b'),
    "PHONE": re.compile(r'\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,3}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}\b'),
    "DATE": [
        re.compile(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b'),  # yyyy-mm-dd, dd-mm-yyyy, etc.
        re.compile(r'\b\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{2,4}\b'),  # 1 Jan 2023
    ],
}

# Named Entity categories to redact.
# People are labelled "PERSON" by spaCy's English pipelines and "PER" by the
# French ones, so both spellings are listed.
SENSITIVE_ENTITIES = {"PER", "PERSON", "DATE", "GPE", "LOC", "FAC", "ORG"}

def contains_sensitive_info(text, language):
    """Return True when *text* contains sensitive information.

    A text is flagged when any pattern in ``PATTERNS`` matches it or when the
    spaCy pipeline for *language* finds an entity whose label is in
    ``SENSITIVE_ENTITIES``.

    Args:
        text: The text to inspect. Empty or non-string values (e.g. the NaN
            pandas uses for missing cells) are reported as not sensitive.
        language: Key into ``NLP_MODELS``; unknown codes fall back to 'en'.

    Returns:
        bool: True if a sensitive pattern or entity was found.
    """
    if not isinstance(text, str) or not text:
        return False

    # Cheap regex checks first: a hit makes the NER pass unnecessary
    for pattern in PATTERNS.values():
        candidates = pattern if isinstance(pattern, list) else [pattern]
        if any(candidate.search(text) for candidate in candidates):
            return True

    # Check for sensitive entities using spaCy's NER
    nlp = NLP_MODELS.get(language, NLP_MODELS['en'])
    doc = nlp(text)
    return any(ent.label_ in SENSITIVE_ENTITIES for ent in doc.ents)

def remove_sensitive_info(text, language):
    """Replace sensitive information in *text* with "[REDACTED]".

    Every token that belongs to an entity whose label is in
    ``SENSITIVE_ENTITIES`` is replaced by "[REDACTED]"; the tokens are then
    re-joined with single spaces and every match of a pattern in ``PATTERNS``
    is replaced as well.

    Args:
        text: The text to redact. Empty or non-string values (e.g. the NaN
            pandas uses for missing cells) are returned unchanged.
        language: Key into ``NLP_MODELS``; unknown codes fall back to 'en'.

    Returns:
        The redacted text, or *text* itself when there is nothing to process.
    """
    if not isinstance(text, str) or not text:
        return text

    nlp = NLP_MODELS.get(language, NLP_MODELS['en'])
    doc = nlp(text)

    # Redact sensitive entities token by token
    redacted_text = ' '.join(
        "[REDACTED]" if token.ent_type_ in SENSITIVE_ENTITIES else token.text
        for token in doc
    )

    # Redact sensitive patterns using regex
    for pattern in PATTERNS.values():
        candidates = pattern if isinstance(pattern, list) else [pattern]
        for candidate in candidates:
            redacted_text = candidate.sub("[REDACTED]", redacted_text)

    return redacted_text

# Example usage with a DataFrame.
# The CSV written by write_to_csv has the columns filename, Page and
# sentence, so the text to inspect lives in 'sentence'.
df['Contains_Sensitive_Info'] = df['sentence'].apply(lambda x: contains_sensitive_info(x, 'en'))
df['Redacted_Text'] = df['sentence'].apply(lambda x: remove_sensitive_info(x, 'en'))
df.head(25)

In [None]:
# Load spaCy's small English pipeline; remove_geographical_entities reads
# the entity type it assigns to each token
nlp = spacy.load('en_core_web_sm')

# Function to remove geographic entities (cities, countries, locations)
def remove_geographical_entities(text):
    """Drop every token tagged as a GPE, LOC or FAC entity from *text*.

    Non-string input (such as a missing value) yields an empty string. The
    remaining tokens are joined with single spaces.
    """
    if not isinstance(text, str):
        return ""

    geo_labels = ["GPE", "LOC", "FAC"]
    kept_tokens = []
    for token in nlp(text):
        if token.ent_type_ in geo_labels:
            continue
        kept_tokens.append(token.text)

    return " ".join(kept_tokens)

# Build the 'sentence_clean' column from 'sentence' with all tokens tagged
# GPE, LOC or FAC (cities, countries, locations, facilities) removed
df['sentence_clean'] = df['sentence'].apply(remove_geographical_entities)

# Preview the first rows, including the new column
df.head()

In [None]:
# Load the small English spaCy pipeline used by preprocess_text for
# tokenization, stop-word flags and lemmas
nlp = spacy.load('en_core_web_sm')

# Minimum word length threshold: preprocess_text drops tokens whose lemma
# has fewer characters than this
MIN_WORD_SIZE = 4

# Preprocessing function to clean sentences
def preprocess_text(text):
    """Normalize one sentence for topic modeling.

    Steps, in order: strip URLs, remove diacritics, replace every character
    that is not an ASCII letter or whitespace with a space, lowercase,
    tokenize with spaCy, keep the lemmas of tokens that are not stop words,
    punctuation or whitespace and whose lemma has at least ``MIN_WORD_SIZE``
    characters, then drop repeated words while preserving first-seen order.

    Args:
        text: The sentence to clean. Non-string values (e.g. NaN for a
            missing cell) yield an empty string.

    Returns:
        str: The remaining lemmas joined by single spaces.
    """
    if not isinstance(text, str):
        return ""  # Handle missing or non-string values

    # Remove URLs: anything starting with "http" (which covers "https") or "www"
    text = re.sub(r'http\S+|www\S+', '', text)

    # Decompose accented characters (NFKD) and drop the combining marks so
    # that e.g. "é" becomes "e" instead of being erased by the next step
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(c for c in text if not unicodedata.combining(c))

    # Replace non-alphabetic characters with a space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Process the text with spaCy
    doc = nlp(text.lower())

    # Extract tokens, remove stopwords, and apply lemmatization
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop  # Remove stopwords
        and not token.is_punct  # Remove punctuation
        and not token.is_space  # Remove spaces
        and len(token.lemma_) >= MIN_WORD_SIZE  # Remove short words
    ]

    # Remove duplicate words within the sentence; dict keys keep insertion
    # order, so the first occurrence of each word stays in place
    return " ".join(dict.fromkeys(tokens))

# Run preprocess_text over 'sentence_clean' and store the result back in
# the same column
df['sentence_clean'] = df['sentence_clean'].apply(preprocess_text)

# Remove rows whose cleaned sentence duplicates an earlier row
# (drop_duplicates keeps the first occurrence by default)
df = df.drop_duplicates(subset=['sentence_clean'])

In [None]:
# Preview the first rows of the cleaned DataFrame
df.head()

In [None]:
# Sanity check: True when no value of 'sentence_clean' occurs more than once
df['sentence_clean'].is_unique

In [None]:
# Keep only rows whose cleaned sentence is at least 15 characters long
df = df[df['sentence_clean'].str.len() >= 15]

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

### To see how data cleaning looks 

In [None]:
# Define the output path (a bare file name, so it is resolved against the
# current working directory)
output_path = r"cleaned_data.csv"

# Save the cleaned DataFrame to CSV as UTF-8, without the index column
df.to_csv(output_path, index=False, encoding='utf-8')

# Report where the file was written
print(f" Cleaned DataFrame saved to: {output_path}")

# 4. Initialize and Fit BERTopic
The good thing about BERTopic is that it does most of the work automatically (meaning I do not need to bore you to death with details about how it works behind the scenes).

We need to do 3 things
1. Initialize BERTopic model
2. 'Fit' the model -> this  means: run the model, as you would run a simple linear regression
3. Look at the topics via 

To get started, let's just use the default settings.