# PDF to DOCX Converter

## This script converts a PDF into a DOCX. Each page of the PDF is rendered to a JPG and then inserted into a new Word document.

Libraries used:
- python-docx [https://pypi.org/project/python-docx/]
- PyPDF2 [https://pypi.org/project/PyPDF2/]
- pdf2image [https://pypi.org/project/pdf2image/]

Binaries required:
- Install Poppler (Windows builds: https://github.com/oschwartz10612/poppler-windows/releases/) and make sure it is on the PATH
- or install it via conda: `conda install -c conda-forge poppler`

In [None]:
# Install the Python packages used by this notebook (python-docx, PyPDF2, pdf2image).
# NOTE(review): the code below calls PyPDF2's PdfFileReader/getPage API; newer
# PyPDF2 releases may no longer provide it -- confirm the installed version.
!!pip install python-docx
!!pip install PyPDF2
!!pip install pdf2image
# Poppler (PDF tools) is a required binary. Either download and install it manually
# from https://github.com/oschwartz10612/poppler-windows/releases/
# or install it via conda, as done below.
!!conda install -c conda-forge poppler

In [None]:
def pdf2jpeg_poppler(pdf_input_path, dpi=200):
    """Render every page of a PDF to a JPEG file in the system temp directory.

    Args:
        pdf_input_path: Path of the PDF file to render.
        dpi: Rendering resolution forwarded to pdf2image's convert_from_path.

    Returns:
        List of paths of the JPEG files written, in page order. The files are
        not deleted here; the caller is responsible for removing them.
    """
    from pdf2image import convert_from_path
    import os
    import tempfile

    num_pages = get_PDF_pagecount(pdf_input_path)
    print("Processing PDF with %d pages..." % num_pages)

    # NOTE(review): the file names are fixed, so two conversions running at the
    # same time would overwrite each other's pages in the shared temp directory.
    fname_jpg_pattern = 'tmp_pdf_to_jpg_page_%d.jpg'
    temp_dir = tempfile.gettempdir()
    temp_fn_lst = []

    # Forward the requested resolution; it used to be accepted but ignored.
    pages = convert_from_path(pdf_input_path, dpi=dpi)

    tracked_pages = log_progress(pages, every=1, name="Pages")
    for page_no, page in enumerate(tracked_pages, start=1):
        path = os.path.join(temp_dir, fname_jpg_pattern % page_no)
        temp_fn_lst.append(path)
        page.save(path, 'JPEG')

    return temp_fn_lst

def is_pdf_landscape(input_pdf_fname):
    """Return True when the first page of the PDF is wider than it is tall.

    Only the media box of page 1 is inspected.

    Args:
        input_pdf_fname: Path of the PDF file to inspect.

    Returns:
        True if the first page's media box width exceeds its height, else False.
    """
    # PyPDF2 3.0 removed PdfFileReader/getPage/mediaBox; prefer the current
    # names and fall back to the legacy class on old installations.
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        from PyPDF2 import PdfFileReader as PdfReader

    with open(input_pdf_fname, 'rb') as f:
        first_page = PdfReader(f).pages[0]

        media_box = getattr(first_page, 'mediabox', None)
        if media_box is None:
            media_box = first_page.mediaBox  # legacy attribute name

        # A PDF rectangle is [lower-left x, lower-left y, upper-right x, upper-right y].
        # NOTE(review): a page-level /Rotate entry is not considered here -- confirm
        # whether rotated pages need handling.
        left, bottom, right, top = (float(coord) for coord in media_box)
        return (right - left) > (top - bottom)
    
def get_PDF_pagecount(input_pdf_fname):
    """Return the number of pages in the PDF at ``input_pdf_fname``.

    Errors opening or parsing the file propagate to the caller; no sentinel
    value is returned (the old trailing ``return -1`` could never execute).
    """
    # PyPDF2 3.0 removed PdfFileReader/getNumPages; prefer the current class
    # and fall back to the legacy one on old installations.
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        from PyPDF2 import PdfFileReader as PdfReader

    with open(input_pdf_fname, 'rb') as f:
        return len(PdfReader(f).pages)

def create_docx_from_jpgs(fn_lst, output_docx_fname, fLandscapeMode = False):
    """Build a DOCX whose content is the given images, one picture per file.

    Args:
        fn_lst: Paths of the image files to insert, in order.
        output_docx_fname: Path the resulting document is saved to.
        fLandscapeMode: When True, switch every section to landscape and
            insert pictures 10 inches wide instead of 7.5 inches.

    Raises:
        Any exception raised while inserting a picture or saving. If inserting
        fails, the pictures added so far are still saved before re-raising.
    """
    from docx import Document
    from docx.shared import Inches
    from docx.enum.section import WD_ORIENT

    margin = Inches(0.5)
    picture_width = Inches(10) if fLandscapeMode else Inches(7.5)

    document = Document()

    for section in document.sections:
        if fLandscapeMode:
            section.orientation = WD_ORIENT.LANDSCAPE
            # Setting the orientation alone does not resize the page, so the
            # two page dimensions are swapped explicitly.
            section.page_width, section.page_height = (
                section.page_height,
                section.page_width,
            )

        section.top_margin = margin
        section.bottom_margin = margin
        section.left_margin = margin
        section.right_margin = margin

    try:
        for jpg_fname in fn_lst:
            document.add_picture(jpg_fname, width=picture_width)
    finally:
        # Save exactly once: on success this writes the full document, on
        # failure it keeps whatever was added before the error propagates.
        document.save(output_docx_fname)

    print("DOCX saved to: %s" % output_docx_fname)

def convert_pdf_to_DOCX(input_pdf_fname, output_docx_fname):
    """Convert a PDF into a DOCX containing one rendered image per page.

    Renders the pages to temporary JPEGs, detects the page orientation, builds
    the document, and removes the temporary images afterwards.

    Args:
        input_pdf_fname: Path of the PDF file to convert.
        output_docx_fname: Path of the DOCX file to create.
    """
    import os

    temp_fn_lst = pdf2jpeg_poppler(input_pdf_fname)

    try:
        fLandscapeMode = is_pdf_landscape(input_pdf_fname)

        print("Preparing DOCX...")
        create_docx_from_jpgs(temp_fn_lst, output_docx_fname, fLandscapeMode)
    finally:
        # Clean up the page screenshots even when the conversion failed, so
        # an error does not leave them behind in the temp directory.
        for fname in temp_fn_lst:
            try:
                os.remove(fname)
            except FileNotFoundError:
                # Already gone; nothing left to clean up for this page.
                pass

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    Args:
        sequence: Iterable to walk through.
        every: Refresh the widget on every ``every``-th item. When omitted and
            the size is known, it is 1 for up to 200 items, otherwise
            ``int(size / 200)``. Must be given if the size cannot be determined.
        size: Total number of items; taken from ``len(sequence)`` when omitted.
        name: Label shown in front of the counter.

    Yields:
        Each item of ``sequence``, unchanged and in order.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # The length is "unknown" only when no size was given and len() is unsupported.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # Without a total there is nothing to fill, so show a full "info" bar.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_length:
                    caption.value = '{name}: {index} / ?'.format(
                        name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size)
            yield item
    except BaseException:
        # Anything that interrupts the iteration turns the bar red before
        # the exception continues on its way.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(
            name=name, index=str(count or '?'))

## Convert a PDF from a URL and create a download link for the converted DOCX

In [None]:
# NOTE(review): module-level placeholder. The converter class below keeps its own
# `self.local_file_link` and does not read this name -- confirm whether it is needed.
local_file_link = None
class converter():
    """Notebook widget that downloads a PDF from a URL and converts it to DOCX.

    Constructing an instance displays a URL combobox and a "Convert" button.
    Clicking the button downloads the PDF, converts it via
    ``convert_pdf_to_DOCX`` and shows a link to the resulting DOCX file.
    """

    def __init__(self):
        """Create the input widgets, wire up the button and display them."""
        import ipywidgets as widgets

        # Path of the most recently produced DOCX; None until a conversion ran.
        self.local_file_link = None
        layout = widgets.Layout(width='auto', height='40px') #set width and height
        self.input_box = widgets.Combobox(
            placeholder='Choose or input URL',
            options=['https://www.engageny.org/file/128466/download/precalculus-m3-topic-a-lesson-3-teacher.pdf',
                     'https://datacamp-community-prod.s3.amazonaws.com/48093c40-5303-45f4-bbf9-0c96c0133c40',
                     'https://education.github.com/git-cheat-sheet-education.pdf',
                     'https://computing.cs.cmu.edu/desktop/resources/w10-shortcuts.pdf',
                     'https://tutorial.math.lamar.edu/pdf/Trig_Cheat_Sheet.pdf',
                     'https://bg.battletech.com/download/CAT3500A_BoxSet_RecordSheets_with_Counters.pdf'],
            description='Location:',
            ensure_option=False,
            display='flex', layout=layout
        )

        self.button_convert = widgets.Button(description = 'Convert')
        self.button_convert.on_click(self.clicked)
        self.render()

    def render(self):
        """Clear the cell output and show an empty input box plus the button."""
        from IPython.display import display, clear_output
        clear_output()
        self.input_box.value = ''
        display(self.input_box)
        display(self.button_convert)

    def render_link(self):
        """Show a link to the last converted DOCX, if there is one."""
        # Imported here because this method previously relied on names that
        # were only imported locally inside render().
        from IPython.display import display, FileLink
        if self.local_file_link is not None:
            print('Download link below')
            display(FileLink(self.local_file_link))

    @staticmethod
    def _docx_name_from_url(url):
        """Derive the output DOCX file name from the last path segment of ``url``.

        A trailing ``.pdf`` (any case) is replaced by ``.docx``; names without
        it get ``.docx`` appended. A URL with no file name in its path yields
        ``converted.docx``.
        """
        import os
        from urllib.parse import urlparse, quote

        base = os.path.basename(urlparse(url).path)
        stem, ext = os.path.splitext(base)
        # Only touch the real extension, never a ".pdf" elsewhere in the name.
        if ext.lower() in ('.pdf', '.docx'):
            base = stem
        return quote(base or 'converted') + '.docx'

    def clicked(self, arg):
        """Button callback: download the PDF, convert it and show the link.

        Args:
            arg: The object passed by the button's click event (unused).
        """
        import urllib.request

        url_to_pdf = self.input_box.value.strip()
        if not url_to_pdf:
            # Nothing to download; avoid an obscure urllib error on empty input.
            print('Please choose or enter the URL of a PDF first')
            return

        output_docx_fname = self._docx_name_from_url(url_to_pdf)

        # download PDF and convert it
        input_pdf_fname = 'local-download-pdf.pdf'

        urllib.request.urlretrieve(url_to_pdf, input_pdf_fname)

        convert_pdf_to_DOCX(input_pdf_fname, output_docx_fname)
        self.local_file_link = output_docx_fname
        self.render()
        self.render_link()
        
# Build the converter UI; the constructor itself displays the input box and button.
c = converter()