# 1. Import Necessary Libraries

Make sure Microsoft Visual C++ is installed on your PC.

Extract the text from the PDF files and convert it to CSV.

In [1]:
"""
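Needed dependencies:
    - spacy
    - pdfplumber
    - bertopic
    - pandas
    - numpy
    - matplotlib
    - wordcloud
    - nltk
    - gensim
    - scikit-learn
    - scipy
    - tqdm
    - umap-learn
    - hdbscan
    - sentence-transformers
    - transformers
    - torch
    - tensorflow
    - tensorflow_hub
    - tensorflow_text
    - wordsegment
    - langdetect
pip install spacy pdfplumber bertopic pandas nltk tqdm wordsegment langdetect
python -m spacy download nl_core_news_sm
python -m spacy download fr_core_news_sm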
"""

'\nNeeded dependancies:\n    - spacy\n    - pdfplumber\n    - bertopic\n    - pandas\n    - numpy\n    - matplotlib\n    - wordcloud\n    - nltk\n    - gensim\n    - scikit-learn\n    - scipy\n    - tqdm\n    - umap-learn\n    - hdbscan\n    - sentence-transformers\n    - transformers\n    - torch\n    - tensorflow\n    - tensorflow_hub\n    - tensorflow_text\npip install spacy pdfplumber bertopic\n'

In [2]:
import pdfplumber
import pandas as pd
import os
import re
from collections import Counter
import nltk
from wordsegment import load, segment
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm  # For progress tracking
from langdetect import detect  # For language detection
import spacy  # For Dutch and French tokenization
import bertopic # For topic modeling

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Ensure the nltk sentence tokenizer data is downloaded.
# Recent NLTK releases load the "punkt_tab" resource for sent_tokenize, while
# older ones load "punkt", so both are requested here.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# Load the wordsegment data (used by split_long_words for English text)
load()

# Load spaCy models for Dutch and French sentence splitting
nlp_nl = spacy.load("nl_core_news_sm")  # Dutch
nlp_fr = spacy.load("fr_core_news_sm")  # French

# Folder containing the PDFs, and the CSV file all sentences are written to
pdf_folder = r"Studies"
output_csv_path = r"csv/all_cleaned_pdf.csv"

# Regex patterns compiled once because they are applied to every page
EXTRA_SPACES_PATTERN = re.compile(r"\s{2,}")  # runs of two or more whitespace characters
PAGE_NUM_PATTERN = re.compile(r"Page \d+")  # literal "Page " followed by digits
LINE_BREAKS_PATTERN = re.compile(r"\n+")  # one or more consecutive newlines

# Function to detect language of a text
def detect_language(text):
    """Return the language code langdetect assigns to *text*.

    Detection is best-effort: if ``detect`` raises for any reason, the
    function falls back to ``"en"`` instead of propagating the error.
    """
    try:
        return detect(text)
    except Exception:
        # Not a bare ``except:`` -- KeyboardInterrupt and SystemExit must still
        # propagate so a long-running batch can be interrupted.
        return "en"  # Default to English if detection fails

# Function to split long merged words into meaningful words (English only)
def split_long_words(text, language="en"):
    """Break over-long tokens of an English string into separate words.

    Anything that is not a string, or whose language is not "en", is handed
    back untouched. Otherwise the text is split on whitespace, every token
    longer than 10 characters is run through wordsegment's ``segment``, and
    the pieces are re-joined with single spaces.
    """
    is_english_text = isinstance(text, str) and language == "en"
    if not is_english_text:
        return text

    return " ".join(
        " ".join(segment(token)) if len(token) > 10 else token
        for token in text.split()
    )

# Function to tokenize sentences based on language
def tokenize_sentences(text, language):
    """Split *text* into a list of sentence strings.

    Dutch ("nl") and French ("fr") go through the matching spaCy pipeline;
    English and every other language code use nltk's ``sent_tokenize``.
    """
    if language == "nl":
        pipeline = nlp_nl
    elif language == "fr":
        pipeline = nlp_fr
    else:
        # "en" and any unrecognised code share the nltk tokenizer
        return sent_tokenize(text)

    return [span.text for span in pipeline(text).sents]

def _most_repeated_line(counts):
    """Return the most frequent line in *counts* if it occurs more than once, else None."""
    if not counts:
        return None
    line, occurrences = counts.most_common(1)[0]
    return line if occurrences > 1 else None


# Function to extract and clean sentences from a PDF
def extract_clean_sentences_from_pdf(pdf_path):
    """Extract cleaned, sentence-level rows from one PDF.

    For every page with text, a repeated header/footer line is removed, page
    markers and redundant whitespace are stripped, the page language is
    detected, and the text is split into sentences.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts with the keys "filename" (base name of *pdf_path*),
        "Page" (1-based page number), "language" and "sentence".
    """
    filename = os.path.basename(pdf_path)  # Extract file name

    # Extract each page's text once; both the header/footer scan and the
    # cleaning pass below work from the same strings.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Count the first/last line of every page as header/footer candidates
    headers = Counter()
    footers = Counter()
    for text in page_texts:
        if not text:
            continue
        lines = text.split("\n")
        if len(lines) > 2:
            headers[lines[0]] += 1
            footers[lines[-1]] += 1

    # A line only counts as a header/footer when it repeats across pages.
    # Otherwise a single-page PDF (or one whose pages all start differently)
    # would lose a genuine first/last line of content.
    common_header = _most_repeated_line(headers)
    common_footer = _most_repeated_line(footers)

    text_data = []
    for page_num, text in enumerate(page_texts, start=1):
        if not text:
            continue

        lines = text.split("\n")

        # Remove detected headers and footers
        if len(lines) > 2:
            if lines[0] == common_header:
                lines.pop(0)
            if lines[-1] == common_footer:
                lines.pop(-1)

        cleaned_text = "\n".join(lines)

        # Drop page markers first, then flatten line breaks, and collapse
        # whitespace last so the gaps left by the removals are squeezed too.
        cleaned_text = PAGE_NUM_PATTERN.sub("", cleaned_text)
        cleaned_text = LINE_BREAKS_PATTERN.sub(" ", cleaned_text)
        cleaned_text = EXTRA_SPACES_PATTERN.sub(" ", cleaned_text).strip()

        if not cleaned_text:
            continue  # nothing left on this page after cleaning

        # Detect language of the page, then use the matching tokenizer
        language = detect_language(cleaned_text)
        sentences = tokenize_sentences(cleaned_text, language)

        # Save each sentence as a separate row with filename
        for sentence in sentences:
            text_data.append({
                "filename": filename,
                "Page": page_num,
                "language": language,
                # Word segmentation is applied to English sentences only
                "sentence": split_long_words(sentence, language),
            })

    return text_data

# Function to process a single PDF and return its data
def process_pdf(pdf_file, folder=None):
    """Extract the sentence rows of one PDF, returning [] if it cannot be processed.

    Args:
        pdf_file: File name of the PDF, relative to the folder.
        folder: Folder that contains *pdf_file*. Defaults to the module-level
            ``pdf_folder`` when omitted.

    Returns:
        The list returned by ``extract_clean_sentences_from_pdf``, or an empty
        list if that call raised (the error is printed, not re-raised, so one
        bad file does not abort the whole batch).
    """
    base_folder = pdf_folder if folder is None else folder
    pdf_path = os.path.join(base_folder, pdf_file)
    try:
        return extract_clean_sentences_from_pdf(pdf_path)
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        return []

# Function to process all PDFs in the folder and save to CSV
def process_all_pdfs(pdf_folder, output_csv_path):
    """Process every PDF in *pdf_folder* in parallel and write one combined CSV.

    Args:
        pdf_folder: Folder whose PDF files are processed.
        output_csv_path: Path of the CSV file to write; its parent directory
            is created if needed.
    """
    # Get list of PDF files (extension matched case-insensitively, so ".PDF" counts)
    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]

    all_text_data = []

    # Use multiprocessing to process PDFs in parallel. The folder is passed
    # explicitly so workers read from the folder that was listed above rather
    # than from the module-level default.
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_pdf, pdf_file, pdf_folder) for pdf_file in pdf_files]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing PDFs"):
            all_text_data.extend(future.result())

    # Fix the column set so the CSV gets a header row even when no sentence
    # was extracted; a header-less empty file cannot be read back by pandas.
    df = pd.DataFrame(all_text_data, columns=["filename", "Page", "language", "sentence"])

    # Ensure output directory exists. dirname is "" for a bare file name, and
    # os.makedirs("") raises, so only create it when there is one.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"All PDFs processed! CSV saved at: {output_csv_path}")

# Run the pipeline only when this file is the entry point. Worker processes
# started by ProcessPoolExecutor may re-import the main module (spawn start
# method); without this guard each worker would launch the pipeline again.
# In a notebook __name__ is "__main__", so the cell behaves as before.
if __name__ == "__main__":
    # Run the function to process all PDFs
    process_all_pdfs(pdf_folder, output_csv_path)

    # Load CSV and display first few rows
    df = pd.read_csv(output_csv_path)

    # Display the cleaned DataFrame
    print(" Extracted and Cleaned Sentences DataFrame:")
    print(df.head())  # Display the first few rows of cleaned sentences

# 2. Load Your Data

Load the articles from your CSV file using pandas. 

In [None]:
# Read the sentence-level CSV back into a DataFrame
df = pd.read_csv(r'csv/all_cleaned_pdf.csv')
# Preview the first rows
df.head()