In [None]:
import csv
import logging
import os
import re
from collections import Counter
from multiprocessing import Lock
from typing import List, Dict, Any
import argparse

import bertopic  # For topic modeling
import pandas as pd # For data manipulation
import fitz # For PDF text extraction
import spacy  # For Dutch and French tokenization
from langdetect import detect, LangDetectException  # For language detection
from tqdm import tqdm  # For progress tracking
from wordsegment import load, segment  # For word segmentation

# Directory that is scanned for input PDF files
doc_dir = "./papers"

# Accumulator for the extracted text records (one list entry per stored block of text)
data: List[list] = []

def is_heading(line):
    """Return True when *line* looks like a heading.

    A line counts as a heading when it begins with the literal 'CHAPTER', or
    when ``str.isupper()`` holds for it (it has at least one cased character
    and none of them is lowercase).
    """
    if line.startswith('CHAPTER'):
        return True
    return line.isupper()

# Matches a leading bracketed number ("[12]") or a leading number followed by a dot ("3.")
_FOOTNOTE_NUMBER_RE = re.compile(r'^(?:\[\d+\]|\d+\.)')


def is_footnote(line):
    """Return True when *line* looks like a footnote, note or table caption.

    A line qualifies when it starts with:
      * a number in square brackets, e.g. "[3]",
      * a number followed by a period, e.g. "12.",
      * an asterisk, or
      * the literal prefixes 'Note' or 'Table' (case-sensitive).

    The result is always a plain ``bool``; callers never receive a
    ``re.Match`` object or ``None``.
    """
    if line.startswith(('*', 'Note', 'Table')):
        return True
    return _FOOTNOTE_NUMBER_RE.match(line) is not None

def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)

# Lower-case substrings that mark a line as publisher/metadata boilerplate.
# Every entry MUST be lower-case because it is compared against the
# lower-cased line. Some entries are already covered by a shorter one
# (e.g. 'https' by 'http', 'authors' by 'author'); they are kept so that
# removing the shorter entry later does not silently drop the longer one.
_BOILERPLATE_MARKERS = (
    'doi',
    'https',
    'http',
    'journal',
    'university',
    'brookville',
    'to cite this article',
    'full terms & conditions',
    'taylor & francis',
    'elsevier',
    'published by',
    'received',
    'revised',
    'author(s)',
    'source:',
    'history:',
    'keywords',
    'vol.',
    'volume',
    'downloaded',
    'article',
    'creative commons use',
    'author',
    'copyrighted',
    'quarterly',
    'purtell',
    'resources:',
    'publisher',
    'ying',
    'cincinnati',
    'issn',
    'all rights reserved',
    'authors',
)


def contains_doi_or_https(line):
    """Return True when *line* contains a DOI, URL or other boilerplate marker.

    The check is a case-insensitive substring test of *line* against every
    entry in ``_BOILERPLATE_MARKERS``. Despite the function name, the marker
    list also covers publisher names, citation phrases and similar metadata.

    The markers 'issn' and 'all rights reserved' used to be written with
    capital letters and therefore never matched the lower-cased line; they
    are lower-case here so that they take effect.
    """
    lowered = line.lower()  # lower-case once instead of once per marker
    return any(marker in lowered for marker in _BOILERPLATE_MARKERS)

def is_reference_or_acknowledgements_section(line):
    """Return True when *line* contains one of the section-stop markers.

    The test is a case-insensitive substring match, so a marker is found
    anywhere inside the line, including inside longer words.
    """
    lowered = line.lower()
    for marker in ('references', 'bibliography', 'acknowledgements', 'nederlandse', 'method', "methods"):
        if marker in lowered:
            return True
    return False

# Translation table for the Latin ligatures of the Unicode "Alphabetic
# Presentation Forms" block (U+FB00..U+FB06). Each ligature is a single code
# point, so one str.translate pass replaces all of them.
_LIGATURE_TABLE = str.maketrans({
    '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
    '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
    '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
    '\ufb05': 'st',   # LATIN SMALL LIGATURE LONG S T
    '\ufb06': 'st',   # LATIN SMALL LIGATURE ST
})


def replace_ligatures(text):
    """Return *text* with typographic ligatures expanded to plain letters.

    For example the single character U+FB01 becomes the two letters 'fi'.
    All other characters are returned unchanged.
    """
    return text.translate(_LIGATURE_TABLE)

def fix_common_word_splits(text):
    """Return *text* with a fixed set of known broken words re-joined.

    Each (broken, whole) pair below is applied in order as a plain substring
    replacement. Afterwards, any whitespace run between two adjacent words of
    at least three word characters each is rewritten as a single space.
    """
    repairs = (
        ('signi ficant', 'significant'),
        ('di fferent', 'different'),
        ('e ffective', 'effective'),
        ('e ffect', 'effect'),
        ('chil dren', 'children'),
        ('e ff ective', 'effective'),
        ('con fi dence', 'confidence'),
    )
    repaired = text
    for broken, whole in repairs:
        repaired = repaired.replace(broken, whole)

    # Normalise the gap between two neighbouring words (3+ characters each)
    return re.sub(r'\b(\w{3,})\s+(\w{3,})\b', r'\1 \2', repaired)

def _store_paragraph(lines, source_file, page_number):
    """Join *lines* into one paragraph and append it to ``data`` if long enough.

    The record has the form ``[source_file, page_number, paragraph_text]``.
    Paragraphs with fewer than 10 whitespace-separated words are dropped.
    """
    full_paragraph_text = " ".join(lines).strip()
    if count_words(full_paragraph_text) >= 10:
        data.append([source_file, page_number, full_paragraph_text])


# Loop through each file in the directory and collect its paragraphs in `data`
for filename in os.listdir(doc_dir):
    if not filename.endswith('.pdf'):  # Only process PDF files
        continue

    file_path = os.path.join(doc_dir, filename)

    # The title of the PDF is taken to be the filename without its extension;
    # lines that consist only of this title are skipped below.
    title_lower = os.path.splitext(filename)[0].lower()

    # Set once a reference/acknowledgements marker is seen; ends processing of this file
    section_reached = False

    # Open the PDF with PyMuPDF; the context manager closes the document
    # even when an exception is raised while processing it.
    with fitz.open(file_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            if section_reached:
                break  # Stop processing further pages once the marker was reached

            page = pdf_document.load_page(page_num)
            text_dict = page.get_text("dict")  # Dictionary format preserves layout info

            # Process each block of text on the page
            for block in text_dict["blocks"]:
                if block["type"] != 0:  # Type 0 is a text block
                    continue

                prev_x = None   # x-coordinate of the previously kept line
                paragraph = []  # Lines that belong to the current paragraph

                for line in block["lines"]:
                    # Join the spans of the line, substituting semicolons with commas
                    line_text = " ".join(
                        span["text"].replace(';', ',') for span in line["spans"]
                    )

                    # Apply ligature replacement and common word fixes
                    line_text = replace_ligatures(line_text)
                    line_text = fix_common_word_splits(line_text)

                    # Stop processing this file as soon as a section marker is detected
                    if is_reference_or_acknowledgements_section(line_text):
                        section_reached = True
                        break

                    # Skip headings, footnotes, boilerplate lines and the PDF's own title
                    if (is_heading(line_text)
                            or is_footnote(line_text)
                            or contains_doi_or_https(line_text)
                            or line_text.strip().lower() == title_lower):
                        continue

                    # Horizontal position of the first span in the line
                    first_word_x = line["spans"][0]["bbox"][0]

                    if prev_x is None or first_word_x - prev_x < 10:
                        # Not indented by 10 or more units relative to the
                        # previous kept line: same paragraph
                        paragraph.append(line_text)
                    else:
                        # A significant indent starts a new paragraph; store the old one
                        if paragraph:
                            _store_paragraph(paragraph, filename, page_num + 1)
                        paragraph = [line_text]

                    prev_x = first_word_x

                # NOTE: when the marker is hit, the lines accumulated so far in
                # this block are discarded rather than stored.
                if section_reached:
                    break

                # Store whatever paragraph is still pending at the end of the block
                if paragraph:
                    _store_paragraph(paragraph, filename, page_num + 1)

# Build a DataFrame with one row per extracted text record
record_columns = ["File", "Page", "text"]
df = pd.DataFrame(data, columns=record_columns)

# Show the first few records
preview = df.head()
print(preview)


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3044457717.py, line 19)