In [None]:
# This Python script converts PDF to text for all files in a folder. 
# It truncates content from the "References" section and saves the text as a DOCX file.
# It helps users extract content from PDF documents, eliminate unwanted sections, and save the processed text for analysis or documentation.

# Install required libraries 
!pip install PyPDF2 python-docx

# Import needed libraries
import os
import re
import PyPDF2
from docx import Document
from docx.shared import Inches

# Function to read text from a PDF file
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, joined with a
        newline between consecutive pages.
    """
    with open(file_path, 'rb') as file:
        # The reader is built on the open handle, so extract inside the block.
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe should a page yield no text at all.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next (which would hide a "References" heading that
    # happens to start a page from word-boundary searches).
    return '\n'.join(page_texts)

# Function to truncate text from the "References" section onwards
def truncate_text_from_references(text):
    """Return *text* with the trailing "References" section removed.

    The cut point is chosen as follows:

    1. The last line that consists only of a "References" heading
       (optionally numbered, e.g. ``7. References``, optionally followed by
       a colon), matched case-insensitively.
    2. If there is no such line, the last occurrence of the whole word
       "References" anywhere in the text (case-insensitive).
    3. If the word does not occur at all, the text is returned unchanged.

    Preferring a stand-alone heading line avoids cutting inside the
    reference list when a cited title itself contains the word
    "references".

    Args:
        text: Full document text.

    Returns:
        The part of *text* that precedes the chosen cut point.
    """
    heading_matches = list(re.finditer(
        r'^[ \t]*(?:\d+\.?[ \t]+)?References[ \t]*:?[ \t]*\r?$',
        text,
        re.IGNORECASE | re.MULTILINE,
    ))
    if heading_matches:
        return text[:heading_matches[-1].start()]

    # Fallback: no heading on its own line, use the last word occurrence.
    word_matches = list(re.finditer(r'\bReferences\b', text, re.IGNORECASE))
    if word_matches:
        return text[:word_matches[-1].start()]

    return text


# Function to strip formulas delimited by '$' signs
def remove_formulas(text):
    """Return *text* with every ``$...$`` span deleted.

    Matching is non-greedy, so each span runs from a '$' to the nearest
    following '$'. A span cannot cross a newline because '.' does not
    match one. Text without a closing '$' on the same line is left as is.
    """
    dollar_span = re.compile(r'\$.*?\$')
    return dollar_span.sub('', text)

# Function to preprocess text and remove non-XML compatible characters
def preprocess_text(text):
    """Replace every character that is not valid in XML 1.0 with a space.

    Kept as-is: tab, line feed, carriage return, and the ranges
    U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF. Everything else
    (other control characters, lone surrogates, U+FFFE/U+FFFF) becomes a
    single space, so the length of the text is unchanged.

    Unlike an ASCII-only filter, this keeps accented letters, non-Latin
    scripts and line breaks intact.

    Args:
        text: Text to sanitise.

    Returns:
        The sanitised text.
    """
    return re.sub(
        r'[^\t\n\r\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]',
        ' ',
        text,
    )

# Function to save text as a DOCX file
def save_as_docx(text, output_file, heading='Truncated Text'):
    """Write *text* to a new DOCX file under a heading.

    The document contains one level-0 heading followed by a single
    paragraph holding the whole of *text*.

    Args:
        text: Body text to store in the document.
        output_file: Path the DOCX file is saved to.
        heading: Heading placed at the top of the document. Defaults to
            'Truncated Text', the value that was previously hard-coded.
    """
    doc = Document()
    doc.add_heading(heading, 0)
    doc.add_paragraph(text)
    doc.save(output_file)

# Input and output folder paths
input_folder = r'D:\other\knowing\amoozesh\ML\2-Proj\Pdf to Text'  # Replace with your input folder path
output_folder = os.path.join(input_folder, 'output')  # Define output folder in the input folder. Replace with your output folder path

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process each PDF file in the input folder
failed_files = []
for filename in os.listdir(input_folder):
    base_name, extension = os.path.splitext(filename)
    input_file_path = os.path.join(input_folder, filename)

    # Compare the extension case-insensitively so '.PDF' files are picked up,
    # and skip anything that is not a regular file.
    if extension.lower() != '.pdf' or not os.path.isfile(input_file_path):
        continue

    # Swap only the final extension; a '.pdf' elsewhere in the name is kept.
    output_file_path = os.path.join(output_folder, base_name + '.docx')

    try:
        pdf_text = read_pdf(input_file_path)
        truncated_text = truncate_text_from_references(pdf_text)
        cleaned_text = remove_formulas(truncated_text)
        cleaned_text = preprocess_text(cleaned_text)

        save_as_docx(cleaned_text, output_file_path)
    except Exception as error:
        # Top-level batch boundary: report the failure and carry on so one
        # unreadable PDF does not abort the remaining files.
        failed_files.append(filename)
        print(f"Failed to process {filename}: {error}")
        continue

    print(f"Processed {filename} and saved as {output_file_path}")

if failed_files:
    print(f"Finished, but {len(failed_files)} file(s) could not be processed: {', '.join(failed_files)}")
else:
    print("All files processed and saved successfully.")