# 1. Import Necessary Libraries

Make sure Microsoft Visual C++ is installed on your PC.

Extract text from the PDFs and save it to a CSV file.

In [None]:
"""
# PDF Text Extraction and Topic Modeling

This program extracts text from PDFs, cleans it, and performs topic modeling using BERTopic.

## Usage
1. Place your PDFs in the `papers` folder.
2. Run the script to extract, clean, and analyze the text.
3. The cleaned data will be saved to `csv/all_cleaned_pdf.csv`.
4. Topic modeling results will be visualized using BERTopic.


## Requirements
python -m spacy download fr_core_web_sm
python -m spacy download nl_core_web_md
python -m spacy download en_core_web_md
Use the following Python version 3.12.0 otherwise the code will not work due to spacy.
"""

'\n# PDF Text Extraction and Topic Modeling\n\nThis program extracts text from PDFs, cleans it, and performs topic modeling using BERTopic.\n\n## Dependencies\n- pdfplumber\n- pandas\n- nltk\n- spacy\n- bertopic\n- langdetect\n- wordsegment\n\n## Usage\n1. Place your PDFs in the `papers` folder.\n2. Run the script to extract, clean, and analyze the text.\n3. The cleaned data will be saved to `csv/all_cleaned_pdf.csv`.\n4. Topic modeling results will be visualized using BERTopic.\n\n\npip install pdfplumber bertopic nltk pandas spacy numpy matplotlib wordcloud gensim scikit-learn scipy tqdm wordsegment langdetect\npython -m spacy download fr_core_web_sm\npython -m spacy download nl_core_web_md\npython -m spacy download en_core_web_md\nUse the following Python version 3.12.0 otherwise the code will not work due to spacy.\n'

In [None]:
import csv
import logging
import os
import re
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Lock
from typing import List, Dict, Any

import bertopic  # For topic modeling
import nltk
import pandas as pd
import pdfplumber
import spacy  # For Dutch and French tokenization
from langdetect import detect, LangDetectException  # For language detection
from tqdm import tqdm  # For progress tracking
from wordsegment import load, segment

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import logging
import pdfplumber
import pandas as pd
import spacy
from langdetect import detect, LangDetectException
from tqdm import tqdm

# Configure logging: timestamped messages at INFO level and above
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Paths
pdf_directory = r'papers'  # Replace with the path to your PDF folder
output_csv = r'csv/cleaned.csv'  # Replace with the desired output CSV file name

# Abort early when the input folder is missing (or is not a folder at all).
# SystemExit is raised directly because `exit()` is an interactive helper
# injected by the `site` module and is not meant to be used in programs.
if not os.path.isdir(pdf_directory):
    logging.error(f"PDF directory does not exist: {pdf_directory}")
    raise SystemExit(1)

# Load every spaCy pipeline once, keyed by the language code that
# detect_language() looks up, so the models are reused for all PDFs.
nlp_models = {
    'en': spacy.load("en_core_web_lg"),
    'nl': spacy.load("nl_core_news_lg"),
    'fr': spacy.load("fr_core_news_lg"),
}

def validate_pdf_path(pdf_path):
    """Check that ``pdf_path`` exists and can be read.

    The first failing check is logged as an error.

    Args:
        pdf_path: Path of the PDF file to check.

    Returns:
        bool: True when both checks pass, False otherwise.
    """
    problem = None
    if not os.path.exists(pdf_path):
        problem = f"PDF file does not exist: {pdf_path}"
    elif not os.access(pdf_path, os.R_OK):
        problem = f"PDF file is not readable: {pdf_path}"

    if problem is None:
        return True
    logging.error(problem)
    return False

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Pages for which pdfplumber yields no text are skipped. Any failure while
    opening or reading the file is logged and reported as an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The stripped, concatenated page text, or "" on error.
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                content = page.extract_text()
                if not content:
                    continue
                page_texts.append(content)
        return " ".join(page_texts).strip()
    except Exception as e:
        logging.error(f"Error extracting {pdf_path}: {e}")
        return ""

def detect_language(text):
    """Guess the language of ``text`` and map it to a supported model key.

    Args:
        text: The text whose language should be detected.

    Returns:
        str: The detected language code when it is a key of ``nlp_models``;
        otherwise 'en' (also used when langdetect fails).
    """
    try:
        language = detect(text)
    except LangDetectException as e:
        logging.error(f"Language detection error: {e}")
        return 'en'

    if language in nlp_models:
        return language

    # No pipeline is loaded for this language, so fall back to English
    logging.warning(f"Unsupported language detected ({language}), defaulting to English.")
    return 'en'

def clean_text_with_spacy(text, language):
    """Lemmatise ``text`` and drop stop words and punctuation.

    Args:
        text: The raw text to clean.
        language: Key into ``nlp_models``; unknown keys use the English model.

    Returns:
        str: The kept lemmas joined by single spaces, or "" if anything
        goes wrong (the error is logged).
    """
    try:
        pipeline = nlp_models.get(language, nlp_models['en'])
        lemmas = []
        for token in pipeline(text):
            if token.is_stop or token.is_punct:
                continue
            lemmas.append(token.lemma_)
        return ' '.join(lemmas)
    except Exception as e:
        logging.error(f"Error cleaning text: {e}")
        return ""

def process_pdfs(pdf_paths):
    """Run ``process_pdf`` on each path, one after another, with a progress bar.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        list: The truthy results of ``process_pdf``, in input order. PDFs that
        raise or yield nothing are left out (exceptions are logged).
    """
    cleaned = []
    for path in tqdm(pdf_paths, desc="Processing PDFs"):
        try:
            outcome = process_pdf(path)
        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            continue
        if outcome:
            cleaned.append(outcome)
    return cleaned

def process_pdf(pdf_path):
    """Turn one PDF into cleaned text.

    The file is validated, its text extracted, the language detected and the
    text cleaned with the matching spaCy pipeline.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The cleaned text, or None when the path is invalid or no text could
        be extracted.
    """
    # Invalid path: nothing to do
    if not validate_pdf_path(pdf_path):
        return None

    raw_text = extract_text_from_pdf(pdf_path)
    # Empty extraction: skip the file
    if not raw_text:
        return None

    return clean_text_with_spacy(raw_text, detect_language(raw_text))

def write_to_csv(data, output_csv):
    """Write cleaned text data to a CSV file with a single ``Cleaned_Text`` column.

    Args:
        data: Sequence of cleaned texts, one per row.
        output_csv: Destination path. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.

    Returns:
        None. Failures are logged rather than raised.
    """
    try:
        df = pd.DataFrame(data, columns=['Cleaned_Text'])
        # os.path.dirname() returns "" for a bare file name, and
        # os.makedirs("") raises, so only create a directory when there is one.
        target_dir = os.path.dirname(output_csv)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        df.to_csv(output_csv, index=False, encoding='utf-8')
        logging.info(f"Saved cleaned data to {output_csv}")
    except Exception as e:
        logging.error(f"Error writing to CSV: {e}")

# Collect every PDF in the input folder. The extension check is
# case-insensitive so files named "*.PDF" are not silently skipped, and the
# list is sorted because os.listdir() returns entries in arbitrary order.
pdf_paths = sorted(
    os.path.join(pdf_directory, file)
    for file in os.listdir(pdf_directory)
    if file.lower().endswith(".pdf")
)

# Clean all PDFs and save the result; log an error when nothing was produced
cleaned_texts = process_pdfs(pdf_paths)
if cleaned_texts:
    write_to_csv(cleaned_texts, output_csv)
else:
    logging.error("No cleaned text data to save.")


2025-02-24 01:58:47,364 - ERROR - The folder \papers does not exist.
2025-02-24 01:58:47,364 - ERROR - CSV file csv\all_cleaned_pdf.csv does not exist.


# 2.  Load Your Data

Load the articles from your CSV file using pandas. 

In [None]:
# Load the cleaned texts from the CSV file into a DataFrame
df= pd.read_csv(r'csv/cleaned.csv')
# Preview up to the first 100 rows
df.head(100)

FileNotFoundError: [Errno 2] No such file or directory: 'csv/all_cleaned_pdf.csv'

### Removing any personal information to anonymize data  

In [None]:
# Load spaCy model for Named Entity Recognition (NER).
# But because we are using the large English model, it is already loaded in the previous cell.
# We will check again the language of the text and use the appropriate model for NER.

def remove_sensitive_info(text):
    """
    Remove personal information such as names, dates, and numbers from the text.
    Supports English, Dutch, and French languages.

    Args:
        text (str): The input text containing potential personal information.

    Returns:
        str: The cleaned text with personal information removed. Non-string
        input yields an empty string.
    """
    if not isinstance(text, str):
        return ""  # Handle non-string values

    # Detect the language of the text
    language = detect_language(text)

    # Reuse the pipelines already loaded in ``nlp_models``; the names
    # nlp_en / nlp_nl / nlp_fr are never defined in this notebook.
    # Unsupported languages fall back to English.
    nlp = nlp_models.get(language, nlp_models['en'])

    # Process the text with spaCy
    doc = nlp(text)

    # Depending on the pipeline's label scheme, people are tagged either
    # "PERSON" or "PER", so both are filtered out.
    person_labels = {"PER", "PERSON"}
    words = [token.text for token in doc if token.ent_type_ not in person_labels]

    # Join the remaining words into a single string
    cleaned_text = " ".join(words)

    # Remove dates and numbers using regex
    cleaned_text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '', cleaned_text)  # Remove dates (YYYY-MM-DD, DD/MM/YYYY)
    cleaned_text = re.sub(r'\b\d+\b', '', cleaned_text)  # Remove standalone numbers

    return cleaned_text.strip()


# 3. Prepare Your Text Data
We clean up the text:
- Remove names of cities, countries, and other geographic locations for a better outcome
- Remove special characters (keep letters only)
- Convert to lower case
- Remove stop words
- Remove words shorter than 4 letters ('a', 'I', 'at', 'the', ...)
- Remove very short sentences
- Remove URLs
- Use lemmatization
- Remove duplicate sentences

In [None]:
# Load spaCy's small English pipeline; remove_geographical_entities reads its
# named-entity tags through the module-level `nlp`.
# NOTE(review): requires `en_core_web_sm` to be downloaded — confirm it is installed.
nlp = spacy.load('en_core_web_sm')

# Drop tokens that belong to a place-like named entity
def remove_geographical_entities(text):
    """Return ``text`` without tokens tagged as GPE, LOC or FAC entities.

    Args:
        text: The sentence to filter.

    Returns:
        str: The remaining tokens joined by single spaces, or "" when
        ``text`` is not a string (e.g. a missing value).
    """
    if not isinstance(text, str):
        return ""

    place_labels = ("GPE", "LOC", "FAC")
    kept = []
    for token in nlp(text):
        if token.ent_type_ in place_labels:
            continue
        kept.append(token.text)

    return " ".join(kept)

# Apply function to remove cities, countries, and geography.
# The CSV produced by write_to_csv only has a 'Cleaned_Text' column, so fall
# back to it when the DataFrame has no 'sentence' column (avoids a KeyError).
source_column = 'sentence' if 'sentence' in df.columns else 'Cleaned_Text'
df['sentence_clean'] = df[source_column].apply(remove_geographical_entities)

# Display a few cleaned sentences
df.head()

In [None]:
# Load the small English spaCy pipeline that preprocess_text uses for
# tokenising, stop-word detection and lemmatisation
nlp = spacy.load('en_core_web_sm')

# Lemmas shorter than this many characters are discarded by preprocess_text
MIN_WORD_SIZE = 4

# Preprocessing function to clean sentences
def preprocess_text(text):
    """Normalise one sentence into a space-separated string of unique lemmas.

    Steps, in order: strip URLs, remove diacritics, replace every
    non-letter character with a space, lower-case, lemmatise with spaCy,
    drop stop words / punctuation / whitespace tokens / lemmas shorter than
    ``MIN_WORD_SIZE``, and finally remove repeated lemmas while keeping the
    first occurrence of each.

    Args:
        text: The sentence to clean.

    Returns:
        str: The cleaned sentence, or "" when ``text`` is not a string
        (e.g. a missing value).
    """
    # Imported here because the notebook's import cells never import
    # ``unicodedata``; without it the function fails with a NameError.
    import unicodedata

    if not isinstance(text, str):
        return ""  # Handle missing or non-string values

    # Remove URLs (http, https, www, etc.)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Decompose accented characters (NFKD) and drop the combining marks,
    # so that e.g. "é" becomes "e" instead of being replaced by a space below
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(c for c in text if not unicodedata.combining(c))

    # Replace non-alphabetic characters with a space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Process the text with spaCy
    doc = nlp(text.lower())

    # Extract tokens, remove stopwords, and apply lemmatization
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop  # Remove stopwords
        and not token.is_punct  # Remove punctuation
        and not token.is_space  # Remove spaces
        and len(token.lemma_) >= MIN_WORD_SIZE  # Remove short words
    ]

    # Remove duplicate words within each sentence; dict keys keep insertion
    # order, so the first occurrence of each lemma is preserved
    unique_tokens = dict.fromkeys(tokens)

    # Ensure proper spacing between words
    return " ".join(unique_tokens)

# Normalise the geography-filtered sentences and store them back in place
normalised_sentences = df['sentence_clean'].apply(preprocess_text)
df['sentence_clean'] = normalised_sentences

# Drop rows whose cleaned sentence duplicates an earlier row
df = df.drop_duplicates(subset=['sentence_clean'])

In [None]:
# Preview the first rows of the cleaned DataFrame
df.head()

In [None]:
# Sanity check: True when no cleaned sentence occurs more than once
df['sentence_clean'].is_unique

In [None]:
# Keep only rows whose cleaned sentence is at least 15 characters long
df = df[df['sentence_clean'].str.len() >= 15]

In [None]:
# Print a summary of the DataFrame that remains after filtering
df.info()

### To see how data cleaning looks 

In [None]:
# Destination file for the cleaned data (a relative path)
output_path = r"cleaned_data.csv"

# Write the cleaned DataFrame as UTF-8 CSV, without the index column
df.to_csv(output_path, index=False, encoding='utf-8')

# Report where the file was written
print(f" Cleaned DataFrame saved to: {output_path}")

# 4. Initialize and Fit BERTopic
The good thing about BERTopic is that it does most of the work automatically (meaning I do not need to bore you to death with details about how it works behind the scenes).

We need to do 3 things
1. Initialize BERTopic model
2. 'Fit' the model -> this means: run the model, as you would run a simple linear regression
3. Look at the topics via 

To get started, let's just use the default settings.