# This notebook is to Split the PDF, Extraction of Data from PDF and Convert the Data into English Language

### 1. Split the PDF

Installation required:-
    pip install PyMuPDF

In [27]:
import fitz  # PyMuPDF
import os

def split_and_delete_pdf(input_pdf_path, output_folder):
    try:
        # Ensure the output folder exists
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        # Open the input PDF
        pdf_document = fitz.open(input_pdf_path)

        for page_number in range(len(pdf_document)):
            # Extract each page
            page = pdf_document[page_number]
            
            # Create a new PDF document with just this page
            new_pdf_document = fitz.open()
            new_pdf_document.insert_pdf(pdf_document, from_page=page_number, to_page=page_number)

            # Save the extracted page to the output folder
            output_filename = os.path.join(output_folder, f"page_{page_number + 1}.pdf")
            new_pdf_document.save(output_filename)
            new_pdf_document.close()

        # Close the input PDF
        pdf_document.close()

        # Delete the second file in the output folder (page 2)
        file_to_delete = os.path.join(output_folder, "page_2.pdf")
        if os.path.exists(file_to_delete):
            os.remove(file_to_delete)

        print("successfully split the PDF and deleted the second PDF")
    except Exception as e:
        print(f"Error: {str(e)}")




In [28]:
# Example usage:
input_pdf_path = "F:\Election Commission\Input Folder\PDF Data\Jotta.pdf"  # Replace with the path to your input PDF file
output_folder = "F:/Election Commission/Input Folder/Split_PDF_Data"      # Replace with the path to your output folder

split_and_delete_pdf(input_pdf_path, output_folder)

successfully split the PDF and deleted the second PDF


### 2 Extract the PDF Data and provide the output in txt files

Installation required:- pip install Pytessract ,PopplePoppler-22.04.0

In [29]:
import os
import pytesseract
from pdf2image import convert_from_path

def extract_text_from_pdfs(input_folder, output_folder):
    try:
        # Ensure the output folder exists, create it if necessary
        os.makedirs(output_folder, exist_ok=True)

        # List all PDF files in the input folder
        pdf_files = [f for f in os.listdir(input_folder) if f.endswith('.pdf')]

        for pdf_file in pdf_files:
            input_pdf_path = os.path.join(input_folder, pdf_file)
            output_text_file = os.path.splitext(pdf_file)[0] + '.txt'
            output_text_path = os.path.join(output_folder, output_text_file)

            # Create a folder to store temporary image files
            temp_image_folder = os.path.join(output_folder, 'temp_images')
            os.makedirs(temp_image_folder, exist_ok=True)

            # Convert PDF pages to images
            images = convert_from_path(input_pdf_path, output_folder=temp_image_folder, dpi=300)

            # Initialize a variable to store extracted text
            extracted_text = ''

            # Define Tesseract configuration for multiple languages (Hindi, English, and digits)
            tesseract_config = '--psm 6 --oem 1 -l hin+eng+osd'

            # Iterate through each image
            for image in images:
                # Perform OCR on the image to extract text
                text = pytesseract.image_to_string(image, config=tesseract_config)

                # Append the extracted text to the result
                extracted_text += text

            # Clean up temporary image files
            for image_file in os.listdir(temp_image_folder):
                image_path = os.path.join(temp_image_folder, image_file)
                os.remove(image_path)

            # Save the extracted text to the specified output text file
            with open(output_text_path, 'w', encoding='utf-8') as output_text_file:
                output_text_file.write(extracted_text)

            print(f"Text extracted from '{pdf_file}' and saved to '{output_text_file}'")

        return "Text extraction completed successfully for all files in the folder"

    except Exception as e:
        return f"Error: {str(e)}"


In [30]:
input_folder = 'F:\Election Commission\Input Folder\Split_PDF_Data'
output_folder = 'F:\Election Commission\Input Folder\Hindi_Text_Data'

# Call the function to process PDFs in the input folder
result = extract_text_from_pdfs(input_folder, output_folder)
print(result)

Text extracted from 'page_1.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_1.txt' mode='w' encoding='utf-8'>'
Text extracted from 'page_10.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_10.txt' mode='w' encoding='utf-8'>'
Text extracted from 'page_11.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_11.txt' mode='w' encoding='utf-8'>'
Text extracted from 'page_12.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_12.txt' mode='w' encoding='utf-8'>'
Text extracted from 'page_13.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_13.txt' mode='w' encoding='utf-8'>'
Text extracted from 'page_14.pdf' and saved to '<_io.TextIOWrapper name='F:\\Election Commission\\Input Folder\\Hindi_Text_Data\\page_14.txt' mode='w' e

### 3. Translate all files data from Hindi to English

In [31]:
from googletrans import Translator
import os

def translate_hindi_files_to_english(input_folder, output_folder):
    # Ensure the output folder exists, create it if necessary
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Initialize the Google Translator
    translator = Translator()

    # Iterate through all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            # Construct the full paths for input and output files
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, filename)

            # Read the input file with Hindi text
            with open(input_file_path, 'r', encoding='utf-8') as input_file:
                hindi_text = input_file.read()

            # Translate the Hindi text to English
            translated = translator.translate(hindi_text, src='hi', dest='en')

            # Get the translated English text
            english_text = translated.text

            # Save the translated text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(english_text)

            print(f"Translation completed for '{filename}' and saved to: {output_file_path}")

In [32]:
input_folder = 'F:/Election Commission/Input Folder/Hindi_Text_Data'
output_folder = 'F:/Election Commission/Input Folder/English_Text_Data'

translate_hindi_files_to_english(input_folder, output_folder)

Translation completed for 'page_1.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_1.txt
Translation completed for 'page_10.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_10.txt
Translation completed for 'page_11.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_11.txt
Translation completed for 'page_12.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_12.txt
Translation completed for 'page_13.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_13.txt
Translation completed for 'page_14.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_14.txt
Translation completed for 'page_15.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_15.txt
Translation completed for 'page_16.txt' and saved to: F:/Election Commission/Input Folder/English_Text_Data\page_16.txt
Translation completed for 'page_17.txt' an