In [3]:
import logging
import os
from datetime import datetime

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self):
        try:
            file = open('./test.pdf', 'rb')
            input_stream = file.read()
            file.close()

            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
                client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)

            # Creates a new job instance
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(export_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExportPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Creates an output stream and copy stream asset's content to it
            output_file_path = './test.docx'
            with open(output_file_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while executing operation: {e}')

if __name__ == "__main__":
    ExportPDFToDOCX()


INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
ERROR:root:Exception encountered while executing operation: description =Thread interrupted while waiting for operation execution status!!, requestTrackingId=None
Traceback (most recent call last):
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\adobe\pdfservices\operation\internal\pdf_services_helper.py", line 145, in get_job_result
    time.sleep(retry_after)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\austi\AppData\Local\Temp\ipykernel_23684\3861851520.py", line 46, in __init__
    pdf_services_response = pdf_servic

In [8]:
import gradio as gr
import logging
import os
import tempfile
from datetime import datetime
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertTokenizerFast, BertForSequenceClassification

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self):
        try:
            file = open('./test.pdf', 'rb')
            input_stream = file.read()
            file.close()

            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
                client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)

            # Creates a new job instance
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(export_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExportPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Creates an output stream and copy stream asset's content to it
            output_file_path = './test.docx'
            with open(output_file_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while executing operation: {e}')

    def convert_pdf_to_docx(self):
        try:
            # Existing code for opening the PDF file, reading content, and creating credentials
            # ...

            # Create parameters for the job, ensuring target format is DOCX
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)

            # Create a new job instance with the input asset and export parameters
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            # Submit the job and retrieve the result
            location = pdf_services.submit(export_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExportPDFResult)

            # Get content from the resulting asset and write it to a temporary DOCX file
            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Create a temporary filename for the DOCX (ensure valid and accessible path)
            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            # Return the temporary DOCX file path
            return temp_file_path

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None  # Indicate failure by returning None

def clean_text_with_bert(text):
    # Load pre-trained BERT model and tokenizer
    model_name = "bert-base-uncased"  # You can change this to a different BERT model
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Prepare the input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Get model output
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Based on the predicted label, perform appropriate cleaning
    if predicted_label == 0:  # Assuming label 0 corresponds to "clean" text
        cleaned_text = text
    else:
        # Perform cleaning based on the predicted label (e.g., noise removal, spelling correction)
        # You can customize this logic based on your specific requirements
        cleaned_text = perform_cleaning(text)  # Replace with your cleaning function

    return cleaned_text

def perform_cleaning(text):
    # Example cleaning logic: remove stop words and punctuation
    stop_words = set(["the", "and", "in", "of", "to", "for", "with", "that", "it", "is", "as", "this", "be", "have", "not", "but", "by", "from", "at", "on", "or", "are"])
    cleaned_text = " ".join([word for word in text.split() if word not in stop_words and word.isalnum()])
    return cleaned_text

def process_pdf(pdf_file):
    # Convert PDF to DOCX
    exporter = ExportPDFToDOCX()
    docx_file = exporter.process()

    if docx_file is None:
        return "Error occurred during PDF to DOCX conversion."

    # Read DOCX content
    import docx2txt
    text = docx2txt.process(docx_file)

    # Clean text using BERT
    cleaned_text = clean_text_with_bert(text)

    # Clean up the temporary DOCX file (optional)
    # os.remove(docx_file)  # Uncomment if you want to delete the temporary file

    return cleaned_text

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Cleaned Document"),
    title="PDF Cleaner",
    description="Upload a PDF file to convert it to DOCX and clean the text using BERT."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7867/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7867/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Traceback (most recent call last):
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\gradio\queueing.py", line 622, in process_events
    response = await route_utils.call_process_api(
  

In [9]:
pip install gradio torch transformers pdfservices-sdk docx2txt


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            return temp_file_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def clean_text_with_bert(text):
    # Load pre-trained BERT model and tokenizer
    model_name = "bert-base-uncased"  # You can change this to a different BERT model
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Prepare the input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Get model output
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Based on the predicted label, perform appropriate cleaning
    if predicted_label == 0:  # Assuming label 0 corresponds to "clean" text
        cleaned_text = text
    else:
        # Perform cleaning based on the predicted label
        cleaned_text = perform_cleaning(text)

    return cleaned_text

def perform_cleaning(text):
    # Example cleaning logic: remove stop words and punctuation
    stop_words = set(["the", "and", "in", "of", "to", "for", "with", "that", "it", "is", "as", "this", "be", "have", "not", "but", "by", "from", "at", "on", "or", "are"])
    cleaned_text = " ".join([word for word in text.split() if word not in stop_words and word.isalnum()])
    return cleaned_text

def process_pdf(pdf_file):
    try:
        # Create a temporary file to store the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            # Write the content of the uploaded file to the temporary file
            if isinstance(pdf_file, str):
                # If pdf_file is a string (file path), write its contents
                with open(pdf_file, 'rb') as f:
                    temp_pdf.write(f.read())
            else:
                # If pdf_file is bytes or a file-like object, write it directly
                temp_pdf.write(pdf_file)
        
        # Convert PDF to DOCX
        exporter = ExportPDFToDOCX(temp_pdf_path)
        docx_file = exporter.process()

        if docx_file is None:
            return "An error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Read DOCX content
        import docx2txt
        text = docx2txt.process(docx_file)

        # Clean text using BERT
        cleaned_text = clean_text_with_bert(text)

        # Clean up temporary files
        os.remove(temp_pdf_path)
        os.remove(docx_file)

        return cleaned_text

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Cleaned Document"),
    title="PDF Cleaner",
    description="Upload a PDF file to convert it to DOCX and clean the text using BERT."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7871/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7871/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_rel

In [14]:
#### testing version
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            return temp_file_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def clean_text_with_bert(text):
    model_name = "bert-base-uncased"
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Split text into smaller chunks to fit BERT's max token limit
    max_length = tokenizer.model_max_length
    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]

    cleaned_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()

        if predicted_label == 0:  # Assuming label 0 corresponds to "clean" text
            cleaned_chunks.append(chunk)
        else:
            cleaned_chunks.append(perform_cleaning(chunk))

    return " ".join(cleaned_chunks)

def perform_cleaning(text):
    # Basic cleaning: remove extra whitespace and normalize punctuation
    cleaned_text = " ".join(text.split())
    cleaned_text = cleaned_text.replace(" ,", ",").replace(" .", ".").replace(" :", ":")
    return cleaned_text

def detect_formatting(text):
    lines = text.split('\n')
    formatted_lines = []
    for line in lines:
        if line.isupper():
            formatted_lines.append(('heading', line))
        elif line.strip().startswith(('•', '-', '*')):
            formatted_lines.append(('bullet', line))
        elif len(line.strip()) > 0:
            formatted_lines.append(('paragraph', line))
    return formatted_lines

def create_formatted_docx(text, output_path):
    doc = Document()
    
    # Define styles
    styles = doc.styles
    heading_style = styles.add_style('CustomHeading', WD_STYLE_TYPE.PARAGRAPH)
    heading_style.font.size = Pt(14)
    heading_style.font.bold = True
    
    paragraph_style = styles.add_style('CustomParagraph', WD_STYLE_TYPE.PARAGRAPH)
    paragraph_style.font.size = Pt(11)
    
    bullet_style = styles.add_style('CustomBullet', WD_STYLE_TYPE.PARAGRAPH)
    bullet_style.font.size = Pt(11)
    
    formatted_lines = detect_formatting(text)
    
    for format_type, content in formatted_lines:
        if format_type == 'heading':
            doc.add_paragraph(content, style='CustomHeading')
        elif format_type == 'bullet':
            doc.add_paragraph(content, style='CustomBullet')
        else:
            doc.add_paragraph(content, style='CustomParagraph')
    
    doc.save(output_path)

def process_pdf(pdf_file):
    try:
        # Create a temporary file to store the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())
        
        # Convert PDF to DOCX
        exporter = ExportPDFToDOCX(temp_pdf_path)
        docx_file = exporter.process()

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Read DOCX content
        import docx2txt
        text = docx2txt.process(docx_file)

        # Clean text using BERT
        cleaned_text = clean_text_with_bert(text)

        # Create formatted DOCX
        output_docx_path = temp_pdf_path.replace('.pdf', '_formatted.docx')
        create_formatted_docx(cleaned_text, output_docx_path)

        # Clean up temporary files
        os.remove(temp_pdf_path)
        os.remove(docx_file)

        return output_docx_path

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.File(label="Download Formatted DOCX"),
    title="PDF Cleaner and Formatter",
    description="Upload a PDF file to convert it to a formatted DOCX document."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7872/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7872/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_rel

In [16]:
### multiple bert pass throughs

import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertForMaskedLM, pipeline
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import language_tool_python

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            return temp_file_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    # Load pre-trained BERT model for masked language modeling
    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Create a fill-mask pipeline
    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    # Split text into sentences
    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        
        # Randomly mask some tokens (e.g., 15% of tokens)
        masked_indices = torch.randint(0, len(tokens), (max(1, int(0.15 * len(tokens))),))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        # Convert back to a string
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        # Use the fill-mask pipeline to predict masked tokens
        results = fill_mask(masked_sentence)
        
        # Replace masked tokens with the most likely predictions
        for result in results:
            masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    # Join improved sentences
    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def detect_formatting(text):
    lines = text.split('\n')
    formatted_lines = []
    for line in lines:
        if line.isupper():
            formatted_lines.append(('heading', line))
        elif line.strip().startswith(('•', '-', '*')):
            formatted_lines.append(('bullet', line))
        elif len(line.strip()) > 0:
            formatted_lines.append(('paragraph', line))
    return formatted_lines

def create_formatted_docx(text, output_path):
    doc = Document()
    
    # Define styles
    styles = doc.styles
    heading_style = styles.add_style('CustomHeading', WD_STYLE_TYPE.PARAGRAPH)
    heading_style.font.size = Pt(14)
    heading_style.font.bold = True
    
    paragraph_style = styles.add_style('CustomParagraph', WD_STYLE_TYPE.PARAGRAPH)
    paragraph_style.font.size = Pt(11)
    
    bullet_style = styles.add_style('CustomBullet', WD_STYLE_TYPE.PARAGRAPH)
    bullet_style.font.size = Pt(11)
    
    formatted_lines = detect_formatting(text)
    
    for format_type, content in formatted_lines:
        if format_type == 'heading':
            doc.add_paragraph(content, style='CustomHeading')
        elif format_type == 'bullet':
            doc.add_paragraph(content, style='CustomBullet')
        else:
            doc.add_paragraph(content, style='CustomParagraph')
    
    doc.save(output_path)

def process_pdf(pdf_file):
    try:
        # Create a temporary file to store the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())
        
        # Convert PDF to DOCX
        exporter = ExportPDFToDOCX(temp_pdf_path)
        docx_file = exporter.process()

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Read DOCX content
        import docx2txt
        text = docx2txt.process(docx_file)

        # Improve text using BERT
        improved_text = improve_text_with_bert(text)

        # Fix spelling and grammar
        corrected_text = fix_spelling_and_grammar(improved_text)

        # Create formatted DOCX
        output_docx_path = temp_pdf_path.replace('.pdf', '_formatted.docx')
        create_formatted_docx(corrected_text, output_docx_path)

        # Clean up temporary files
        os.remove(temp_pdf_path)
        os.remove(docx_file)

        return output_docx_path

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.File(label="Download Formatted DOCX"),
    title="PDF Cleaner, Improver, and Formatter",
    description="Upload a PDF file to convert it to a formatted DOCX document with improved readability and corrected spelling/grammar."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7873/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7873/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ

In [20]:
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import language_tool_python

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            return temp_file_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    if not text.strip():
        return "The extracted text is empty. Please check the input PDF file."

    # Load pre-trained BERT model for masked language modeling
    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Create a fill-mask pipeline
    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    # Split text into sentences
    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        
        if not tokens:
            continue
        
        # Randomly mask some tokens (e.g., 15% of tokens)
        num_masks = max(1, int(0.15 * len(tokens)))
        masked_indices = torch.randint(0, len(tokens), (num_masks,))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        # Convert back to a string
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        # Use the fill-mask pipeline to predict masked tokens
        results = fill_mask(masked_sentence)
        
        # Replace masked tokens with the most likely predictions
        for result in results:
            if isinstance(result, dict) and 'token_str' in result:
                masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
            elif isinstance(result, list) and len(result) > 0 and 'token_str' in result[0]:
                masked_sentence = masked_sentence.replace("[MASK]", result[0]['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    # Join improved sentences
    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    if not text.strip():
        return "The input text is empty. Unable to perform spelling and grammar check."

    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def detect_formatting(text):
    lines = text.split('\n')
    formatted_lines = []
    for line in lines:
        if line.isupper():
            formatted_lines.append(('heading', line))
        elif line.strip().startswith(('•', '-', '*')):
            formatted_lines.append(('bullet', line))
        elif len(line.strip()) > 0:
            formatted_lines.append(('paragraph', line))
    return formatted_lines

def create_formatted_docx(text, output_path):
    doc = Document()
    
    # Define styles
    styles = doc.styles
    heading_style = styles.add_style('CustomHeading', WD_STYLE_TYPE.PARAGRAPH)
    heading_style.font.size = Pt(14)
    heading_style.font.bold = True
    
    paragraph_style = styles.add_style('CustomParagraph', WD_STYLE_TYPE.PARAGRAPH)
    paragraph_style.font.size = Pt(11)
    
    bullet_style = styles.add_style('CustomBullet', WD_STYLE_TYPE.PARAGRAPH)
    bullet_style.font.size = Pt(11)
    
    formatted_lines = detect_formatting(text)
    
    for format_type, content in formatted_lines:
        if format_type == 'heading':
            doc.add_paragraph(content, style='CustomHeading')
        elif format_type == 'bullet':
            doc.add_paragraph(content, style='CustomBullet')
        else:
            doc.add_paragraph(content, style='CustomParagraph')
    
    doc.save(output_path)

def process_pdf(pdf_file):
    try:
        # Create a temporary file to store the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())
        
        # Convert PDF to DOCX
        exporter = ExportPDFToDOCX(temp_pdf_path)
        docx_file = exporter.process()

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Read DOCX content
        import docx2txt
        text = docx2txt.process(docx_file)

        if not text.strip():
            return "The extracted text is empty. Please check the input PDF file."

        try:
            # Improve text using BERT
            improved_text = improve_text_with_bert(text)
        except Exception as e:
            logging.exception(f"Error in BERT processing: {e}")
            improved_text = text  # Fall back to original text if BERT processing fails

        # Fix spelling and grammar
        corrected_text = fix_spelling_and_grammar(improved_text)

        # Create formatted DOCX
        output_docx_path = temp_pdf_path.replace('.pdf', '_formatted.docx')
        create_formatted_docx(corrected_text, output_docx_path)

        # Clean up temporary files
        os.remove(temp_pdf_path)
        os.remove(docx_file)

        return output_docx_path

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.File(label="Download Formatted DOCX"),
    title="PDF Cleaner, Improver, and Formatter",
    description="Upload a PDF file to convert it to a formatted DOCX document with improved readability and corrected spelling/grammar."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7875/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7875/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7875

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ

In [22]:
#### revised app
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import language_tool_python
import re

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_file_path = temp_file.name
                with open(temp_file_path, "wb") as file:
                    file.write(stream_asset.get_input_stream())

            return temp_file_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    if not text.strip():
        return "The extracted text is empty. Please check the input PDF file."

    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        
        if not tokens:
            continue
        
        num_masks = max(1, int(0.15 * len(tokens)))
        masked_indices = torch.randint(0, len(tokens), (num_masks,))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        results = fill_mask(masked_sentence)
        
        for result in results:
            if isinstance(result, dict) and 'token_str' in result:
                masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
            elif isinstance(result, list) and len(result) > 0 and 'token_str' in result[0]:
                masked_sentence = masked_sentence.replace("[MASK]", result[0]['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    if not text.strip():
        return "The input text is empty. Unable to perform spelling and grammar check."

    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def format_document(text):
    # Remove page header information
    text = re.sub(r'WORCESTER COUNTY CIRCUIT COURT.*\n', '', text)
    
    # Format title
    title = "SUPPLEMENTARY DECLARATION, CONDITIONS AND RESTRICTIONS\nTIME SHARING OWNERSHIP - BAY CLUB CONDOMINIUM"
    if "TIME SHARING OWNERSHIP - BAY CLUB CONDOMINIUM" in text:
        text = title + "\n\n" + text.split("TIME SHARING OWNERSHIP - BAY CLUB CONDOMINIUM", 1)[1]
    else:
        text = title + "\n\n" + text
    
    # Format sections
    text = re.sub(r'(ARTICLE [IVX]+:? .*)', r'\n\n## \1\n', text)
    text = re.sub(r'([1-9]\. .*)', r'\n\1', text)
    
    # Clean up extra spaces and newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    
    # Add markdown formatting
    text = "# " + text
    
    return text

def create_formatted_docx(text, output_path):
    doc = Document()
    
    styles = doc.styles
    title_style = styles.add_style('CustomTitle', WD_STYLE_TYPE.PARAGRAPH)
    title_style.font.size = Pt(16)
    title_style.font.bold = True
    
    heading_style = styles.add_style('CustomHeading', WD_STYLE_TYPE.PARAGRAPH)
    heading_style.font.size = Pt(14)
    heading_style.font.bold = True
    
    subheading_style = styles.add_style('CustomSubheading', WD_STYLE_TYPE.PARAGRAPH)
    subheading_style.font.size = Pt(12)
    subheading_style.font.bold = True
    
    paragraph_style = styles.add_style('CustomParagraph', WD_STYLE_TYPE.PARAGRAPH)
    paragraph_style.font.size = Pt(11)
    
    lines = text.split('\n')
    for line in lines:
        if line.startswith('# '):
            doc.add_paragraph(line[2:], style='CustomTitle')
        elif line.startswith('## '):
            doc.add_paragraph(line[3:], style='CustomHeading')
        elif line.startswith('### '):
            doc.add_paragraph(line[4:], style='CustomSubheading')
        else:
            doc.add_paragraph(line, style='CustomParagraph')
    
    doc.save(output_path)

def process_pdf(pdf_file):
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())
        
        exporter = ExportPDFToDOCX(temp_pdf_path)
        docx_file = exporter.process()

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        import docx2txt
        text = docx2txt.process(docx_file)

        if not text.strip():
            return "The extracted text is empty. Please check the input PDF file."

        try:
            improved_text = improve_text_with_bert(text)
        except Exception as e:
            logging.exception(f"Error in BERT processing: {e}")
            improved_text = text

        corrected_text = fix_spelling_and_grammar(improved_text)
        
        formatted_text = format_document(corrected_text)

        output_docx_path = temp_pdf_path.replace('.pdf', '_formatted.docx')
        create_formatted_docx(formatted_text, output_docx_path)

        os.remove(temp_pdf_path)
        os.remove(docx_file)

        return output_docx_path

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.File(label="Download Formatted DOCX"),
    title="PDF Cleaner, Improver, and Formatter",
    description="Upload a PDF file to convert it to a formatted DOCX document with improved readability and corrected spelling/grammar."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7877/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7877/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7877

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ

In [23]:
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
import language_tool_python
import docx2txt

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult
from openai import OpenAI


# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self, output_path):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with open(output_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

            return output_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    if not text.strip():
        return "The extracted text is empty. Please check the input PDF file."

    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        
        if not tokens:
            continue
        
        num_masks = max(1, int(0.15 * len(tokens)))
        masked_indices = torch.randint(0, len(tokens), (num_masks,))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        results = fill_mask(masked_sentence)
        
        for result in results:
            if isinstance(result, dict) and 'token_str' in result:
                masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
            elif isinstance(result, list) and len(result) > 0 and 'token_str' in result[0]:
                masked_sentence = masked_sentence.replace("[MASK]", result[0]['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    if not text.strip():
        return "The input text is empty. Unable to perform spelling and grammar check."

    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def process_pdf(pdf_file):
    try:
        # Create necessary directories
        os.makedirs('adobe_output', exist_ok=True)
        os.makedirs('final_output', exist_ok=True)

        # Save uploaded PDF
        pdf_path = os.path.join('adobe_output', 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())

        # Convert PDF to DOCX using Adobe
        exporter = ExportPDFToDOCX(pdf_path)
        docx_path = os.path.join('adobe_output', 'output.docx')
        docx_file = exporter.process(docx_path)

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Extract text from DOCX
        text = docx2txt.process(docx_file)

        if not text.strip():
            return "The extracted text is empty. Please check the input PDF file."

        # Improve text using BERT
        try:
            improved_text = improve_text_with_bert(text)
        except Exception as e:
            logging.exception(f"Error in BERT processing: {e}")
            improved_text = text

        # Fix spelling and grammar
        corrected_text = fix_spelling_and_grammar(improved_text)

        # Save final output
        final_output_path = os.path.join('final_output', 'processed_output.txt')
        with open(final_output_path, 'w', encoding='utf-8') as f:
            f.write(corrected_text)

        return corrected_text

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Processed Text"),
    title="PDF Cleaner, Improver, and Formatter",
    description="Upload a PDF file to convert it to a formatted text document with improved readability and corrected spelling/grammar."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7878/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7878/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7878

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ

In [24]:
####gpt2 local model###
import gradio as gr
import logging
import os
import tempfile
import torch
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline, GPT2LMHeadModel, GPT2Tokenizer
import language_tool_python
import docx2txt

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self, output_path):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with open(output_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

            return output_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    if not text.strip():
        return "The extracted text is empty. Please check the input PDF file."

    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        
        if not tokens:
            continue
        
        num_masks = max(1, int(0.15 * len(tokens)))
        masked_indices = torch.randint(0, len(tokens), (num_masks,))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        results = fill_mask(masked_sentence)
        
        for result in results:
            if isinstance(result, dict) and 'token_str' in result:
                masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
            elif isinstance(result, list) and len(result) > 0 and 'token_str' in result[0]:
                masked_sentence = masked_sentence.replace("[MASK]", result[0]['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    if not text.strip():
        return "The input text is empty. Unable to perform spelling and grammar check."

    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def generate_few_shot_prompt(examples, task_description):
    prompt = f"{task_description}\n\nExamples:\n"
    for i, example in enumerate(examples, 1):
        prompt += f"Example {i}:\nInput: {example['input']}\nOutput: {example['output']}\n\n"
    prompt += "Now, please process the following text:\n"
    return prompt

def process_with_gpt2(text, prompt):
    model_name = "gpt2-medium"  # You can change this to a larger model if needed
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    full_prompt = prompt + text
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt")
    
    # Generate output
    output = model.generate(input_ids, max_length=len(input_ids[0]) + 100, num_return_sequences=1, no_repeat_ngram_size=2)
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract only the newly generated part
    return generated_text[len(full_prompt):].strip()

def process_pdf(pdf_file, examples, task_description):
    try:
        # Create necessary directories
        os.makedirs('adobe_output', exist_ok=True)
        os.makedirs('final_output', exist_ok=True)

        # Save uploaded PDF
        pdf_path = os.path.join('adobe_output', 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())

        # Convert PDF to DOCX using Adobe
        exporter = ExportPDFToDOCX(pdf_path)
        docx_path = os.path.join('adobe_output', 'output.docx')
        docx_file = exporter.process(docx_path)

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Extract text from DOCX
        text = docx2txt.process(docx_file)

        if not text.strip():
            return "The extracted text is empty. Please check the input PDF file."

        # Improve text using BERT
        try:
            improved_text = improve_text_with_bert(text)
        except Exception as e:
            logging.exception(f"Error in BERT processing: {e}")
            improved_text = text

        # Fix spelling and grammar
        corrected_text = fix_spelling_and_grammar(improved_text)

        # Generate few-shot prompt
        prompt = generate_few_shot_prompt(examples, task_description)

        # Process with GPT-2
        final_text = process_with_gpt2(corrected_text, prompt)

        # Save final output
        final_output_path = os.path.join('final_output', 'processed_output.txt')
        with open(final_output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)

        return final_text

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

def process_with_examples(pdf_file, example1_input, example1_output, example2_input, example2_output, task_description):
    examples = [
        {"input": example1_input, "output": example1_output},
        {"input": example2_input, "output": example2_output}
    ]
    return process_pdf(pdf_file, examples, task_description)

# Create Gradio interface
iface = gr.Interface(
    fn=process_with_examples,
    inputs=[
        gr.File(label="Upload PDF", type="binary"),
        gr.Textbox(label="Example 1 Input"),
        gr.Textbox(label="Example 1 Output"),
        gr.Textbox(label="Example 2 Input"),
        gr.Textbox(label="Example 2 Output"),
        gr.Textbox(label="Task Description", 
                   placeholder="e.g., 'Improve the formatting and clarity of the following legal document:'")
    ],
    outputs=gr.Textbox(label="Processed Text"),
    title="PDF Cleaner, Improver, and Formatter with Few-Shot Learning",
    description="Upload a PDF file and provide examples to guide the processing. The document will be converted, improved, and formatted based on your inputs."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7879/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7879/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7879

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (15409 > 1024). Running this sequence through the model will result in indexing errors
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
ERROR:root:Error processing PDF: index out of range in self
Traceback (most recent call last):
  File "C:\Users\austi\AppData\Local\Temp\ipykernel_26020\3347937394.py", line 177, in process_pdf
    final_text = process_with_gpt2(corrected_text, prompt)
  File "C:\Users\austi\AppData\Local\Temp\ipykernel_26020\3347937394.py", line 129, in process_with_gpt2
    output = model.generate(input_ids, max_length=len(input_ids[0]) + 100, num_return_sequences=1, no_repeat_ngram_size=2)
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\torch\autogra

In [27]:
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\huggingface_hub\file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:openai:error_code=rate_limit_exceeded error_message='Request too large for gpt-4 in organization org-StNcD0mXFRdWnl1IFhLonLQi on tokens per min (TPM): Limit 10000, Requested 15571. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.' error_param=None error_type=tokens message='OpenAI API error received' stream_error=False
ERROR:root:Error in GPT-4 processing: Request too large for gpt-4 in organization org-StNcD0mXFRdWnl1IFhLonLQi on tokens per min (TPM): Limit 10000, Requested 15571. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.
Traceback (most recent call last):
  File "C:\Users\austi\AppData\Local\Temp\ipykernel_26020\2701643328.py", line 125, in process_with_gpt4
    response = openai.ChatCompletion.create(
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\openai\api_resources\chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\openai\api_resources\abstract\engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\openai\api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\openai\api_requestor.py", line 619, in _interpret_response
    self._interpret_response_line(
  File "C:\Users\austi\AppData\Roaming\Python\Python310\site-packages\openai\api_requestor.py", line 679, in _interpret_response_line
    raise self.handle_error_response(
openai.error.RateLimitError: Request too large for gpt-4 in organization org-StNcD0mXFRdWnl1IFhLonLQi on tokens per min (TPM): Limit 10000, Requested 15571. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.


INFO:httpx:HTTP Request: GET http://127.0.0.1:7882/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7882/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7882

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with

In [32]:
import gradio as gr
import logging
import os
import tempfile
import torch
import time
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
import language_tool_python
import docx2txt
import openai

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

# Set up OpenAI API key
openai.api_key = "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"
class ExportPDFToDOCX:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.credentials = ServicePrincipalCredentials(
            client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
            client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
        )
        self.pdf_services = PDFServices(credentials=self.credentials)

    def process(self, output_path):
        try:
            with open(self.pdf_path, 'rb') as file:
                input_stream = file.read()

            input_asset = self.pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)
            export_pdf_params = ExportPDFParams(target_format=ExportPDFTargetFormat.DOCX)
            export_pdf_job = ExportPDFJob(input_asset=input_asset, export_pdf_params=export_pdf_params)

            location = self.pdf_services.submit(export_pdf_job)
            pdf_services_response = self.pdf_services.get_job_result(location, ExportPDFResult)

            result_asset: CloudAsset = pdf_services_response.get_result().get_asset()
            stream_asset: StreamAsset = self.pdf_services.get_content(result_asset)

            with open(output_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

            return output_path

        except ServiceApiException as e:
            if "CORRUPT_DOCUMENT" in str(e):
                logging.error(f"The input PDF file appears to be corrupted: {e}")
                return "CORRUPT_DOCUMENT"
            else:
                logging.exception(f'Service API Exception encountered while converting PDF: {e}')
                return None
        except (ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while converting PDF: {e}')
            return None

def improve_text_with_bert(text):
    if not text.strip():
        return "The extracted text is empty. Please check the input PDF file."

    model_name = "bert-base-uncased"
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    sentences = text.split('.')
    improved_sentences = []

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        
        if not tokens:
            continue
        
        num_masks = max(1, int(0.15 * len(tokens)))
        masked_indices = torch.randint(0, len(tokens), (num_masks,))
        for idx in masked_indices:
            tokens[idx] = "[MASK]"
        
        masked_sentence = tokenizer.convert_tokens_to_string(tokens)
        
        results = fill_mask(masked_sentence)
        
        for result in results:
            if isinstance(result, dict) and 'token_str' in result:
                masked_sentence = masked_sentence.replace("[MASK]", result['token_str'], 1)
            elif isinstance(result, list) and len(result) > 0 and 'token_str' in result[0]:
                masked_sentence = masked_sentence.replace("[MASK]", result[0]['token_str'], 1)
        
        improved_sentences.append(masked_sentence)

    improved_text = '. '.join(improved_sentences)

    return improved_text

def fix_spelling_and_grammar(text):
    if not text.strip():
        return "The input text is empty. Unable to perform spelling and grammar check."

    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text

def generate_few_shot_prompt(examples, task_description):
    prompt = f"{task_description}\n\nExamples:\n"
    for i, example in enumerate(examples, 1):
        prompt += f"Example {i}:\nInput: {example['input']}\nOutput: {example['output']}\n\n"
    prompt += "Now, please process the following text:\n"
    return prompt

def estimate_tokens(text):
    return max(1, len(text.split()))  # Ensure we never return 0

def process_with_gpt4(text, prompt):
    try:
        max_tokens = 4000  # Maximum tokens allowed per request
        estimated_tokens = estimate_tokens(text)
        text_length = max(1, len(text))  # Ensure we never divide by zero
        
        # Calculate chunk size, ensuring it's at least 1 and at most the text length
        chunk_size = max(1, min(text_length, max_tokens // max(1, (estimated_tokens // text_length))))
        
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        
        processed_chunks = []
        
        for i, chunk in enumerate(chunks):
            chunk_prompt = f"{prompt}\n\nPart {i+1} of {len(chunks)}:\n{chunk}"
            
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that processes and improves documents based on given examples and instructions."},
                    {"role": "user", "content": chunk_prompt}
                ],
                max_tokens=1000,  # Adjust as needed
                n=1,
                temperature=0.7,
            )
            
            processed_chunk = response.choices[0].message['content'].strip()
            processed_chunks.append(processed_chunk)
            
            # Sleep for a short time to avoid hitting rate limits
            time.sleep(20)  # Adjust this value based on your API rate limits
        
        return " ".join(processed_chunks)
    except Exception as e:
        logging.exception(f"Error in GPT-4 processing: {e}")
        return f"Error occurred during GPT-4 processing: {str(e)}"

def process_pdf(pdf_file, examples, task_description):
    try:
        # Create necessary directories
        os.makedirs('adobe_output', exist_ok=True)
        os.makedirs('final_output', exist_ok=True)

        # Save uploaded PDF
        pdf_path = os.path.join('adobe_output', 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(pdf_file if isinstance(pdf_file, bytes) else pdf_file.read())

        # Convert PDF to DOCX using Adobe
        exporter = ExportPDFToDOCX(pdf_path)
        docx_path = os.path.join('adobe_output', 'output.docx')
        docx_file = exporter.process(docx_path)

        if docx_file is None:
            return "Error occurred during PDF to DOCX conversion."
        elif docx_file == "CORRUPT_DOCUMENT":
            return "The uploaded PDF file appears to be corrupted. Please check the file and try again."

        # Extract text from DOCX
        text = docx2txt.process(docx_file)

        if not text.strip():
            return "The extracted text is empty. Please check the input PDF file."

        # Improve text using BERT
        try:
            improved_text = improve_text_with_bert(text)
        except Exception as e:
            logging.exception(f"Error in BERT processing: {e}")
            improved_text = text

        # Fix spelling and grammar
        corrected_text = fix_spelling_and_grammar(improved_text)

        # Generate few-shot prompt
        prompt = generate_few_shot_prompt(examples, task_description)

        # Process with GPT-4
        final_text = process_with_gpt4(corrected_text, prompt)

        # Save final output
        final_output_path = os.path.join('final_output', 'processed_output.txt')
        with open(final_output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)

        return final_text

    except Exception as e:
        logging.exception(f"Error processing PDF: {e}")
        return f"An error occurred while processing the PDF: {str(e)}"

def process_with_examples(pdf_file, example1_input, example1_output, example2_input, example2_output, task_description):
    examples = [
        {"input": example1_input, "output": example1_output},
        {"input": example2_input, "output": example2_output}
    ]
    return process_pdf(pdf_file, examples, task_description)

# Create Gradio interface
iface = gr.Interface(
    fn=process_with_examples,
    inputs=[
        gr.File(label="Upload PDF", type="binary"),
        gr.Textbox(label="Example 1 Input"),
        gr.Textbox(label="Example 1 Output"),
        gr.Textbox(label="Example 2 Input"),
        gr.Textbox(label="Example 2 Output"),
        gr.Textbox(label="Task Description", 
                   placeholder="e.g., 'Improve the formatting and clarity of the following legal document:'")
    ],
    outputs=gr.Textbox(label="Processed Text"),
    title="PDF Cleaner, Improver, and Formatter with GPT-4",
    description="Upload a PDF file and provide examples to guide the processing. The document will be converted, improved, and formatted using GPT-4 based on your inputs."
)

# Launch the app
iface.launch()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7886/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7886/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7886

To create a public link, set `share=True` in `launch()`.




INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXPORT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializ