In [18]:
import fitz  # PyMuPDF
import requests
import uuid
import os
import time

# Azure Translator API settings
key = ""  # Replace with your actual Azure Translator key
endpoint = "https://api.cognitive.microsofttranslator.com"  # Correct endpoint
location = ""  # e.g., "eastus"

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            text += page.get_text()
    return text

# Function to translate text to Portuguese using Azure Translator
def translate_to_portuguese(text):
    url = f"{endpoint}/translate"
    
    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    translated_text = ""
    max_chunk_size = 5000
    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

    for chunk in text_chunks:
        body = [{'text': chunk}]
        
        params = {
            'api-version': '3.0',
            'to': 'pt-br'  # Translate to Brazilian Portuguese
        }

        retries = 5
        for attempt in range(retries):
            response = requests.post(url, headers=headers, params=params, json=body)
            
            if response.status_code == 200:
                translation = response.json()[0]['translations'][0]['text']
                translated_text += translation + " "  # Append translated chunk
                break  # Exit the retry loop if successful
            elif response.status_code == 429:  # Rate limit exceeded
                print("Rate limit exceeded, retrying...")
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                print(f"Translation failed for chunk: {response.status_code} - {response.text}")
                break  # Exit on other errors

    return translated_text.strip()  # Return the complete translated text

# Function to extract images from a PDF
def extract_images_from_pdf(pdf_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            images = page.get_images(full=True)

            for img_index, img in enumerate(images):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = f"page_{page_number + 1}_img_{img_index + 1}.{image_ext}"

                with open(os.path.join(output_folder, image_filename), "wb") as image_file:
                    image_file.write(image_bytes)

# Main function
def main():
    pdf_path = r''  # Update your PDF path here
    output_folder = r''  # Output folder for images

    # Extract and translate text
    text = extract_text_from_pdf(pdf_path)
    
    if text:
        translated_text = translate_to_portuguese(text)
        if translated_text:
            print("Translated Text:")
            print(translated_text)
        else:
            print("Translation failed or returned empty text.")
    else:
        print("No text found in the PDF.")

    # Extract images
    extract_images_from_pdf(pdf_path, output_folder)
    print(f"Images extracted to {output_folder}.")

if __name__ == "__main__":
    main()


Rate limit exceeded, retrying...
Rate limit exceeded, retrying...
Rate limit exceeded, retrying...


KeyboardInterrupt: 

In [1]:
import os
import json
import requests
import pdfplumber

# Function to extract text from a PDF and save it to a text file
def extract_text_from_pdf_and_save(pdf_path, output_text_file):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()

    with open(output_text_file, 'w', encoding='utf-8') as file:
        file.write(text)

# Function to read text from a file
def read_text_from_file(input_text_file):
    with open(input_text_file, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Function to translate text to Portuguese using the local Llama model API
def translate_to_portuguese(text):
    translated_chunks = []
    max_chunk_size = 5000  # Adjust based on the model's capabilities
    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

    for chunk in text_chunks:
        api_request_json = {
            "model": "llama3.2",
            "messages": [
                {"role": "user", "content": f"Translate the following text to Portuguese: {chunk}"},
            ],
        }

        print(f"Sending request: {json.dumps(api_request_json, indent=2)}")

        response = requests.post("http://localhost:11434/api/chat", json=api_request_json)
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.content}")

        if response.status_code == 200:
            try:
                # Attempt to parse the JSON response
                response_json = response.json()
                print(f"Response JSON: {json.dumps(response_json, indent=2)}")
                if "choices" in response_json and len(response_json["choices"]) > 0:
                    translation = response_json["choices"][0]["message"]["content"]
                    translated_chunks.append(translation)  # Store the translated chunk
                else:
                    print("Unexpected response structure:", response_json)
                    return None
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}")
                return None
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None

    return ' '.join(translated_chunks).strip()  # Concatenate and return the complete translated text

# Function to save translated text to a file
def save_translated_text(translated_text, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(translated_text)

# Function to extract images from a PDF
def extract_images_from_pdf(pdf_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    with pdfplumber.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document.pages):
            images = page.images

            for img_index, img in enumerate(images):
                image_bytes = img["stream"].get_data()
                image_ext = img["ext"]
                image_filename = f"page_{page_number + 1}_img_{img_index + 1}.{image_ext}"

                with open(os.path.join(output_folder, image_filename), "wb") as image_file:
                    image_file.write(image_bytes)

# Main function
def main():
    pdf_path = r'C:\Users\carli\OneDrive\Documents\VA.pdf'  # Update your PDF path here
    output_folder = r'C:\Users\carli\OneDrive\Documents\ExtractedImages'  # Output folder for images
    extracted_text_file = r'C:\Users\carli\OneDrive\Documents\ExtractedText.txt'  # Output file for extracted text
    translated_text_file = r'C:\Users\carli\OneDrive\Documents\TranslatedText.txt'  # Output file for translated text

    # Extract text from PDF and save to a text file
    extract_text_from_pdf_and_save(pdf_path, extracted_text_file)
    print(f"Text extracted and saved to {extracted_text_file}")

    # Read the extracted text from the file
    text = read_text_from_file(extracted_text_file)

    if text:
        # Translate the text to Portuguese
        translated_text = translate_to_portuguese(text)
        if translated_text:
            # Save the translated text to a file
            save_translated_text(translated_text, translated_text_file)
            print(f"Translated text saved to {translated_text_file}")
        else:
            print("Translation failed or returned empty text.")
    else:
        print("No text found in the extracted text file.")

    # Extract images from PDF
    extract_images_from_pdf(pdf_path, output_folder)
    print(f"Images extracted to {output_folder}.")

if __name__ == "__main__":
    main()



Text extracted and saved to C:\Users\carli\OneDrive\Documents\ExtractedText.txt
Sending request: {
  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
      "content": "Translate the following text to Portuguese: 2 Vatican AssassinsTitle Page, Copyright, Assassins & Maxims 3\nVatican Assassins:\n\u201cWounded In The House Of My Friends\u201d\nThe Diabolical History of\nThe Society of Jesus\nIncluding:\nIts Second Thirty Years\u2019 War (1914 - 1945),\nIts Cold War (1945 - 1989),\nAnd Its\nAssassination of America\u2019s First\nRoman Catholic President,\nKnight of Columbus\nJohn Fitzgerald Kennedy (1963)\nBy\nEric Jon Phelps\nWhite Separatist American Freeman\nDispensational, Fifth Monarchy, Seventh-Day Baptist-Calvinist4 Vatican Assassins\nVatican Assassins:\n\u201cWounded In The House Of My Friends\u201d\nEric Jon Phelps\nCopyright \u00a9 2001\nSecond Edition: CD/PDF March 2004\nAll rights reserved. Permission is hereby granted to anyone to quote\nFrom VATICAN ASSASSINS

KeyError: 'ext'

In [1]:
import requests
import json
import time

# Replace with your own subscription key and endpoint
subscription_key = 'f0ef490f36bb47e09016d37d204526e6'
endpoint = 'https://api.cognitive.microsofttranslator.com'
location = 'brazilsouth'  # e.g., 'global'

# Read the text file
with open('ExtractedText.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Function to split text into chunks of a given size
def split_text_into_chunks(text, chunk_size):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Split the text into chunks of 10,000 characters
chunks = split_text_into_chunks(text, 10000)

# Set up the URL for the translation API
url = f'{endpoint}/translate?api-version=3.0&to=pt'  # Translate to Portuguese (Brazil)

# Set up the headers
headers = {
    'Ocp-Apim-Subscription-Key': subscription_key,
    'Ocp-Apim-Subscription-Region': location,
    'Content-Type': 'application/json'
}

# Initialize a list to store translated chunks in order
translated_chunks = [None] * len(chunks)

# Function to handle the translation with exponential backoff
def translate_chunk_with_backoff(chunk, max_retries=5, initial_delay=1):
    delay = initial_delay
    for attempt in range(max_retries):
        body = [{
            'text': chunk
        }]

        # Make the request to the translation API
        response = requests.post(url, headers=headers, json=body)

        # Check if the request was successful
        if response.status_code == 200:
            try:
                translations = response.json()
                translated_text = translations[0]['translations'][0]['text']
                return translated_text
            except json.JSONDecodeError:
                print("Error: The response is not in JSON format.")
                print(response.text)
                return None
        elif response.status_code == 429:
            print(f"Rate limit exceeded. Waiting {delay} seconds before retrying...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None
    return None

# Translate each chunk with exponential backoff and retry failed chunks
for index, chunk in enumerate(chunks):
    translated_text = translate_chunk_with_backoff(chunk)
    if translated_text:
        translated_chunks[index] = translated_text
        print(f"Chunk {index + 1}/{len(chunks)} translated successfully.")
    else:
        print(f"Failed to translate chunk {index + 1}/{len(chunks)}. Retrying...")
        retry_attempts = 0
        while translated_text is None and retry_attempts < 5:
            translated_text = translate_chunk_with_backoff(chunk)
            retry_attempts += 1
            if translated_text:
                translated_chunks[index] = translated_text
                print(f"Chunk {index + 1}/{len(chunks)} translated successfully after {retry_attempts} retries.")
            else:
                print(f"Retry {retry_attempts} failed for chunk {index + 1}/{len(chunks)}.")

# Check if any chunks failed to translate
failed_chunks = [index for index, chunk in enumerate(translated_chunks) if chunk is None]
if failed_chunks:
    print(f"Failed to translate chunks: {failed_chunks}")
else:
    # Combine the translated chunks
    translated_text = ''.join(translated_chunks)

    # Save the translated text to a file
    with open('TranslatedText.txt', 'w', encoding='utf-8') as file:
        file.write(translated_text)

    print("Translation complete. The translated text has been saved to 'TranslatedText.txt'.")

Chunk 1/161 translated successfully.
Chunk 2/161 translated successfully.
Chunk 3/161 translated successfully.
Rate limit exceeded. Waiting 1 seconds before retrying...
Rate limit exceeded. Waiting 2 seconds before retrying...
Rate limit exceeded. Waiting 4 seconds before retrying...
Rate limit exceeded. Waiting 8 seconds before retrying...
Rate limit exceeded. Waiting 16 seconds before retrying...
Failed to translate chunk 4/161. Retrying...
Rate limit exceeded. Waiting 1 seconds before retrying...
Rate limit exceeded. Waiting 2 seconds before retrying...
Rate limit exceeded. Waiting 4 seconds before retrying...
Rate limit exceeded. Waiting 8 seconds before retrying...
Chunk 4/161 translated successfully after 1 retries.
Chunk 5/161 translated successfully.
Chunk 6/161 translated successfully.
Rate limit exceeded. Waiting 1 seconds before retrying...
Rate limit exceeded. Waiting 2 seconds before retrying...
Rate limit exceeded. Waiting 4 seconds before retrying...
Rate limit exceeded.