In [None]:
# System packages for the PDF / OCR tooling used in the cells below.
# NOTE(review): poppler-utils is presumably the backend pdf2image relies on — confirm.
!apt-get install -y poppler-utils
# NOTE(review): presumably the headers needed to build the `pdftotext` pip package — confirm.
!apt-get install -y libpoppler-cpp-dev
# Tesseract OCR engine; pytesseract is pointed at /usr/bin/tesseract further down.
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
# Tesseract language data for 'fas', the default `lang` of pdf_to_word.
!apt-get install -y tesseract-ocr-fas

In [None]:
# Python packages imported by the cells below.
!pip install -q tqdm
!pip install -q pdf2image
!pip install -q pytesseract
!pip install -q Pillow
!pip install -q python-docx
# NOTE(review): the next two installs omit -q, unlike the ones above.
!pip install PyPDF2
# NOTE(review): pdftotext is not imported in the visible import cell — confirm it is still needed.
!pip install pdftotext

In [None]:
from tqdm import tqdm
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re
import os
import tempfile
from PyPDF2 import PdfReader, PdfWriter

In [None]:
# Split the combined PDF into one single-page PDF per page. Each output is
# written relative to the current working directory as part_page_<n>.pdf,
# where <n> is the 1-based page number.
pdf = PdfReader('/content/drive/MyDrive/total.pdf')
for page, source_page in enumerate(pdf.pages):
    output_filename = '{}_page_{}.pdf'.format('part', page+1)

    # A fresh writer per iteration so every output holds exactly one page.
    pdf_writer = PdfWriter()
    pdf_writer.add_page(source_page)
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)

    print('Created: {}'.format(output_filename))

In [None]:
# Tell pytesseract explicitly which Tesseract executable to run.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Characters that must not reach the XML of the .docx file:
#   - C0 control characters other than tab (\x09), LF (\x0a) and CR (\x0d)
#   - DEL and the C1 control range \x7f-\x9f (legal in XML 1.0 but discouraged)
#   - UTF-16 surrogates and the non-characters U+FFFE / U+FFFF
# Printable Latin-1 characters (\xa0-\xff, e.g. the guillemets used as
# quotation marks in Persian text) are valid XML and are deliberately kept.
_NON_XML_CHARS_RE = re.compile(
    r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f\ud800-\udfff\ufffe\uffff]'
)


def remove_non_xml_chars(text):
    """Return ``text`` with characters that are unsafe in XML 1.0 removed.

    Tab, line feed and carriage return are preserved, as is every printable
    character (including non-ASCII text). Control characters, lone
    surrogates and U+FFFE / U+FFFF are dropped.

    Args:
        text: The string to clean.

    Returns:
        A new string without the offending characters; an empty string is
        returned unchanged.
    """
    return _NON_XML_CHARS_RE.sub('', text)
def pdf_to_word(pdf_path, output_dir, lang='fas', **kwargs):
    """OCR a PDF and write the recognised text to a Word document.

    Every page is rendered to an image with ``convert_from_path``, read with
    Tesseract, cleaned with ``remove_non_xml_chars`` and added to the
    document as a level-1 page heading followed by one paragraph holding
    that page's text. Both are right-aligned and use an RTL Arial font.
    The result is saved as ``<output_dir>/<PDF base name>.docx``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory the .docx file is written to.
        lang: Tesseract language code passed to the OCR call
            (default ``'fas'``).
        **kwargs: Forwarded unchanged to ``convert_from_path``.

    Returns:
        None. The output location is printed.
    """
    # basename/splitext keep dots inside the file name ("a.b.pdf" -> "a.b").
    # Cutting at the first '.' would shorten it to "a" and let differently
    # named PDFs overwrite each other's output.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pages = convert_from_path(pdf_path, **kwargs)

    print(f'PDF is preparing to convert into document [#{len(pages)} pages]')
    # The rendered pages are already PIL images, so they are handed to
    # Tesseract directly: no lossy JPEG round trip through a temp directory
    # and no image file handle left open. Wrapping the list itself (rather
    # than an enumerate object) lets tqdm show the total page count.
    texts = [
        remove_non_xml_chars(pytesseract.image_to_string(page, lang=lang))
        for page in tqdm(pages, position=0)
    ]

    document = Document()
    # Same font settings for body text and page headings.
    # NOTE(review): font.name may not cover complex-script (Persian) runs in
    # Word — verify the rendered font.
    for style_name in ('Normal', 'Heading 1'):
        font = document.styles[style_name].font
        font.name = 'Arial'
        font.rtl = True

    for page_number, text in enumerate(tqdm(texts, position=0), start=1):
        # add_heading(level=1) already applies the 'Heading 1' style.
        heading = document.add_heading(f'صفحه: {page_number}', level=1)
        heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        # New paragraphs use the document's default ('Normal') style.
        paragraph = document.add_paragraph(text)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

    output_path = os.path.join(output_dir, f'{pdf_name}.docx')
    document.save(output_path)
    print(f'Done! Your document can be find here "{output_path}"')

In [None]:
# OCR a single-page PDF and save the resulting .docx under /content/drive/MyDrive/.
# NOTE(review): the input is presumably page 105 written by the split cell — this
# assumes that cell ran with /content as the working directory; confirm.
pdf_to_word('/content/part_page_105.pdf', '/content/drive/MyDrive/')