<a href="https://colab.research.google.com/github/M-110/automate-the-boring-stuff/blob/main/15_Working_With_PDF_and_Word_Documents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pinned to the 1.x release this notebook was written against: the cells below
# use PdfFileReader/PdfFileWriter and PyPDF2.utils, which newer PyPDF2
# releases renamed or removed.
%pip install -q PyPDF2==1.26.0

[?25l[K     |████▎                           | 10 kB 17.5 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 21.5 MB/s eta 0:00:01[K     |████████████▊                   | 30 kB 25.9 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 27.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 51 kB 29.6 MB/s eta 0:00:01[K     |█████████████████████████▍      | 61 kB 32.5 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71 kB 33.9 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 5.0 MB/s 
[?25h  Building wheel for PyPDF2 (setup.py) ... [?25l[?25hdone


In [None]:
!wget https://nostarch.com/download/Automate_the_Boring_Stuff_onlinematerials_v.2.zip -q
!unzip -q Automate_the_Boring_Stuff_onlinematerials_v.2.zip
!rm Automate_the_Boring_Stuff_onlinematerials_v.2.zip
!mv automate_online-materials materials

# PDF

In [None]:
import PyPDF2 as pdf

In [None]:
# Print the text extracted from the first page of the sample meeting minutes.
with open('materials/meetingminutes.pdf', 'rb') as pdf_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  first_page = pdf_reader.getPage(0)
  first_page_text = first_page.extractText()
  print(first_page_text)

OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of 
March 7
, 2014
        
     The Board of Elementary and Secondary Education shall provide leadership and 
create policies for education that expand opportunities for children, empower 
families and communities, and advance Louisiana in an increasingly 
competitive glob
al market.
 BOARD 
 of ELEMENTARY
 and 
 SECONDARY
 EDUCATION
  


## Decrypt

In [None]:
# Report whether the sample PDF is flagged as encrypted.
with open('materials/encrypted.pdf', 'rb') as pdf_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  is_encrypted = pdf_reader.isEncrypted
  print(is_encrypted)

True


In [None]:
# Unlock the encrypted sample with its password, then print the text of its
# first page.
# NOTE(review): the saved output of this cell is a PdfReadError, and the value
# returned by decrypt() is not checked here -- confirm the password was
# accepted before relying on this cell.
with open('materials/encrypted.pdf', 'rb') as pdf_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  pdf_reader.decrypt('rosebud')
  print(pdf_reader.getPage(0).extractText())

PdfReadError: ignored

## Merge Two PDFs


In [None]:
# Append every page of both minutes files, in order, to a single writer and
# save the result as combined_minutes.pdf.
with open('materials/meetingminutes.pdf', 'rb') as pdf_1, \
     open('materials/meetingminutes2.pdf', 'rb') as pdf_2:
  reader_1 = pdf.PdfFileReader(pdf_1)
  reader_2 = pdf.PdfFileReader(pdf_2)
  pdf_writer = pdf.PdfFileWriter()
  for reader in (reader_1, reader_2):
    for page in range(reader.numPages):
      pdf_writer.addPage(reader.getPage(page))

  with open('combined_minutes.pdf', 'wb') as output:
    pdf_writer.write(output)

## Rotate

In [None]:
# Turn the first page of the minutes 90 degrees clockwise and save that single
# page as rotated_page.pdf.
with open('materials/meetingminutes.pdf', 'rb') as pdf_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  pdf_writer = pdf.PdfFileWriter()

  page = pdf_reader.getPage(0)
  page.rotateClockwise(90)
  pdf_writer.addPage(page)

  with open('rotated_page.pdf', 'wb') as output:
    pdf_writer.write(output)

## Overlaying pages

In [None]:
# Merge the first page of watermark.pdf onto every page of the minutes and
# save the stamped pages as watermarked_minutes.pdf.
with open('materials/meetingminutes.pdf', 'rb') as pdf_file, \
     open('materials/watermark.pdf', 'rb') as watermark_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  watermark_reader = pdf.PdfFileReader(watermark_file)
  watermark = watermark_reader.getPage(0)
  pdf_writer = pdf.PdfFileWriter()

  page_count = pdf_reader.numPages
  for i in range(page_count):
    page = pdf_reader.getPage(i)
    page.mergePage(watermark)
    pdf_writer.addPage(page)

  with open('watermarked_minutes.pdf', 'wb') as output:
    pdf_writer.write(output)

## Encrypting Pages

In [None]:
# Copy every page of the minutes into a new writer, protect it with a
# password, and save it as encrypted_minutes.pdf.
with open('materials/meetingminutes.pdf', 'rb') as pdf_file:
  pdf_reader = pdf.PdfFileReader(pdf_file)
  pdf_writer = pdf.PdfFileWriter()

  page_count = pdf_reader.numPages
  for i in range(page_count):
    page = pdf_reader.getPage(i)
    pdf_writer.addPage(page)

  pdf_writer.encrypt('swordfish')
  with open('encrypted_minutes.pdf', 'wb') as output:
    pdf_writer.write(output)

# Project: Combining Select Pages from Many PDFs

In [None]:
!echo 'a' > a.txt

In [None]:
!mkdir pdf_dir
!cp materials/meetingminutes.pdf pdf_dir/meetingminutes1.pdf
!cp materials/meetingminutes.pdf pdf_dir/meetingminutes2.pdf
!cp materials/meetingminutes.pdf pdf_dir/meetingminutes3.pdf
!cp materials/meetingminutes.pdf pdf_dir/meetingminutes4.pdf

In [None]:
%%writefile combine_pdfs.py
#!/usr/bin/env python
"""Combine all PDFs in a directory into one file."""
import argparse
from pathlib import Path

import PyPDF2 as pdf
from PyPDF2.utils import PdfReadError


def main():
  """Combine the PDFs found in the requested directory into one output file.

  Reads the command-line options, opens every ``*.pdf`` in the directory,
  copies the selected pages into a single writer and saves the result to the
  file given with ``-o/--output``.

  Raises:
    SystemExit: if no output file was given on the command line.
  """
  args = get_args()
  if args.output is None:
    # Fail early with a readable message instead of crashing inside write().
    raise SystemExit('combine_pdfs.py: an output file is required (-o/--output)')

  # argparse has already created the output file; if it lives in the input
  # directory it must not be read back in as one of the sources.
  output_path = Path(args.output.name).resolve()
  pdf_filenames = [filename
                   for filename in get_pdf_filenames(args.directory)
                   if filename.resolve() != output_path]

  file_readers = []
  try:
    # Open the inputs one by one so that a failure part-way through still
    # leaves every already-opened handle tracked for the cleanup below.
    for filename in pdf_filenames:
      file_readers.append(open(filename, 'rb'))
    pdf_writer = pdf.PdfFileWriter()
    combine_pdfs(file_readers, pdf_writer, args.exclude, args.skip)
    # The inputs stay open until the combined document has been written.
    pdf_writer.write(args.output)
  finally:
    for file_reader in file_readers:
      file_reader.close()
  print(f'Saved to {args.output.name}')


def get_args():
  """Parse the command line for combine_pdfs.py.

  Returns:
    argparse.Namespace with:
      output: binary file object opened for writing (required).
      directory: directory to search for PDFs (default ``'.'``).
      exclude: list of 1-based page numbers to drop from every PDF.
      skip: list of 1-based page numbers to drop from every PDF but the first.
  """
  parser = argparse.ArgumentParser(
      description='Combine PDFs into one file'
  )
  # Required: without it the combined document has nowhere to be written.
  parser.add_argument('-o',
                      '--output',
                      type=argparse.FileType('wb'),
                      required=True,
                      help='Filename to save the combined PDF as')
  parser.add_argument('-d',
                      '--directory',
                      help='Directory to find PDFs in',
                      default='.')
  parser.add_argument('-x',
                      '--exclude',
                      nargs='+',
                      type=int,
                      help='Exclude these pages from all PDFs',
                      default=[])
  parser.add_argument('-s',
                      '--skip',
                      nargs='+',
                      type=int,
                      help='Skip these pages after the first PDF',
                      default=[])
  return parser.parse_args()


def get_pdf_filenames(directory):
  """Return a sorted list of paths for the ``*.pdf`` files directly inside
  the given directory."""
  pdf_paths = list(Path(directory).glob('*.pdf'))
  pdf_paths.sort()
  return pdf_paths


def combine_pdfs(file_readers, writer, exclude, skip):
  """Copy pages from every open PDF in file_readers into writer.

  Page numbers are 1-based. Pages listed in exclude are dropped from every
  PDF; pages listed in skip are dropped from every PDF except the first one.
  A PDF that raises PdfReadError is reported and the loop moves on to the
  next file.
  """
  for file_num, pdf_file in enumerate(file_readers):
    is_first_file = file_num == 0
    try:
      pdf_reader = pdf.PdfFileReader(pdf_file)
      for page_index in range(pdf_reader.numPages):
        page_number = page_index + 1
        dropped = page_number in exclude or (
            not is_first_file and page_number in skip)
        if not dropped:
          writer.addPage(pdf_reader.getPage(page_index))
    except PdfReadError as e:
      print(f'Failed to read {pdf_file.name}: {e}')


if __name__ == '__main__':
  main()


Overwriting combine_pdfs.py


In [None]:
!chmod +x combine_pdfs.py

In [None]:
!./combine_pdfs.py -o combined_file.pdf -d pdf_dir/ -s 1

Saved to combined_file.pdf


# Word

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 21.2 MB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184508 sha256=bb7cb18f93511867a235325c3850aa2ca1a2f8ccbd0b4a2b0f20e75f6c569e11
  Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
import docx

In [None]:
doc = docx.Document('materials/demo.docx')

In [None]:
for paragraph in doc.paragraphs:
  print(paragraph.text)

Document Title
A plain paragraph with some bold and some italic
Heading, level 1
Intense quote
first item in unordered list
first item in ordered list




In [None]:
doc.paragraphs[2].style

_ParagraphStyle('Heading 1') id: 140607321220880

## Create Word Documents

In [None]:
doc = docx.Document()
doc.add_paragraph('Hello World!')
doc.save('hello.docx')

### Adding headings

In [None]:
doc.add_heading('Header 1', 1)

<docx.text.paragraph.Paragraph at 0x7fe1b1164490>

In [None]:
doc.save('hello.docx')

### Adding line/page breaks

In [None]:
doc.add_page_break()

<docx.text.paragraph.Paragraph at 0x7fe1b1186d10>

In [None]:
doc.add_heading('Page 2', 0)

<docx.text.paragraph.Paragraph at 0x7fe1b1106690>

In [None]:
doc.save('hello.docx')

### Add pictures

In [None]:
doc.add_picture('materials/zophie.png', width=docx.shared.Inches(1),
                height=docx.shared.Cm(4))

<docx.shape.InlineShape at 0x7fe1b1164090>

In [None]:
doc.save('hello.docx')

# Practice Projects

## PDF Paranoia

In [None]:
%%writefile pdf_encryption.py
#!/usr/bin/env python
"""Encrypt or decrypt all files in a directory."""
import argparse
from pathlib import Path

import PyPDF2 as pdf
from PyPDF2.utils import PdfReadError


def main():
  """Encrypt (default) or decrypt every PDF in the chosen directory.

  Results are written to a directory whose path is the input directory's path
  with ``_encrypted`` or ``_decrypted`` appended.
  """
  args = get_args()
  pdf_filenames = get_pdf_filenames(args.directory)

  if args.decrypt:
    suffix, process_pdfs = '_decrypted', decrypt_pdfs
  else:
    suffix, process_pdfs = '_encrypted', encrypt_pdfs

  # Path() drops any trailing separator, so '-d pdfs/' yields 'pdfs_encrypted'
  # rather than a directory called '_encrypted' inside 'pdfs/'.
  source_dir = Path(args.directory)
  output_dir = Path(f'{source_dir}{suffix}')
  output_dir.mkdir(exist_ok=True)

  file_readers = []
  try:
    # Open the inputs one by one so that a failure part-way through still
    # leaves every already-opened handle tracked for the cleanup below.
    for filename in pdf_filenames:
      file_readers.append(open(filename, 'rb'))
    process_pdfs(output_dir, file_readers, args.password)
  finally:
    for file_reader in file_readers:
      file_reader.close()
  print(f'Saved to {output_dir.name}')


def get_args():
  """Parse the command line for pdf_encryption.py.

  Returns:
    argparse.Namespace with:
      password: password to encrypt or decrypt with (required).
      directory: directory to search for PDFs (default ``'.'``).
      decrypt: True when ``--decrypt`` was given, otherwise False.
  """
  parser = argparse.ArgumentParser(
      description='Encrypt or decrypt all PDFs in a directory'
  )
  parser.add_argument('-p',
                      '--password',
                      help='Password to encrypt or decrypt with',
                      required=True)
  parser.add_argument('-d',
                      '--directory',
                      help='Directory to find PDFs in',
                      default='.')
  parser.add_argument('--decrypt',
                      help='Decrypt the files in the directory',
                      action='store_true')
  return parser.parse_args()


def get_pdf_filenames(directory):
  """Return the ``*.pdf`` paths found directly inside the given directory,
  as a sorted list."""
  matches = Path(directory).glob('*.pdf')
  ordered = sorted(matches)
  return ordered


def encrypt_pdfs(output_dir, file_readers, password):
  """Write a password-protected copy of each open PDF into output_dir.

  Every copy keeps the base name of its source file. A source that raises
  PdfReadError is reported and the loop moves on to the next file.
  """
  for pdf_file in file_readers:
    try:
      pdf_reader = pdf.PdfFileReader(pdf_file)
      writer = pdf.PdfFileWriter()
      for page_num in range(pdf_reader.numPages):
        source_page = pdf_reader.getPage(page_num)
        writer.addPage(source_page)
      writer.encrypt(password)
      destination = output_dir / Path(pdf_file.name).name
      with open(destination, 'wb') as output_file:
        writer.write(output_file)
    except PdfReadError as e:
      print(f'Failed to encrypt {pdf_file.name}: {e}')


def decrypt_pdfs(output_dir, file_readers, password):
  """Write a decrypted copy of each encrypted PDF into output_dir.

  Every copy keeps the base name of its source file. Files that are not
  encrypted are skipped, files the password does not unlock are reported,
  and a file that raises PdfReadError is reported; in each case the loop
  moves on to the next file.
  """
  for pdf_file in file_readers:
    try:
      pdf_reader = pdf.PdfFileReader(pdf_file)
      # Only encrypted files need (or accept) a password.
      if not pdf_reader.isEncrypted:
        print(f'Skipping {pdf_file.name}: file is not encrypted')
        continue
      # decrypt() returns 0 when the password matches neither the user nor
      # the owner password; say so instead of failing later on a page read.
      if not pdf_reader.decrypt(password):
        print(f'Failed to decrypt {pdf_file.name}: incorrect password')
        continue
      writer = pdf.PdfFileWriter()
      for page_num in range(pdf_reader.numPages):
        writer.addPage(pdf_reader.getPage(page_num))
      with open(output_dir / Path(pdf_file.name).name, 'wb') as output_file:
        writer.write(output_file)
    except PdfReadError as e:
      print(f'Failed to decrypt {pdf_file.name}: {e}')


if __name__ == '__main__':
  main()

Overwriting pdf_encryption.py


In [None]:
!chmod +x pdf_encryption.py

In [None]:
!./pdf_encryption.py -d materials -p swordfish

Failed to encrypt materials/encrypted.pdf: File has not been decrypted
Failed to encrypt materials/encryptedminutes.pdf: File has not been decrypted
Saved to materials_encrypted


In [None]:
!ls materials_encrypted

combinedminutes.pdf  meetingminutes2.pdf  meetingminutes.pdf  watermark.pdf


In [None]:
!./pdf_encryption.py -d materials_encrypted -p swordfish --decrypt

Saved to materials_encrypted_decrypted


In [None]:
!ls materials_encrypted_decrypted

combinedminutes.pdf  meetingminutes2.pdf  meetingminutes.pdf  watermark.pdf


## Custom Invitations as Word Documents

In [None]:
!cat materials/guests.txt

Prof. Plum
Miss Scarlet
Col. Mustard
Al Sweigart
Robocop

In [None]:
%%writefile invitations.py
#!/usr/bin/env python
"""Create invitations using a guest list."""
import argparse

import docx

def main():
  """Create invitation.docx with one invitation page per guest.

  Guest names are read one per line from the file given on the command line;
  surrounding whitespace is stripped and blank lines are ignored.
  """
  args = get_args()
  doc = docx.Document()

  # argparse opened the guest list; close it as soon as it has been read.
  with args.guests as guest_file:
    guests = [line.strip() for line in guest_file]

  for guest in guests:
    # A blank line would otherwise produce an invitation with no name on it.
    if guest:
      create_page(doc, guest)

  doc.save('invitation.docx')
  print('saved as invitation.docx')
  

def get_args():
  """Parse the command line for invitations.py.

  Returns:
    argparse.Namespace with:
      guests: text file object, opened for reading, containing the guest
        names.
  """
  parser = argparse.ArgumentParser(
      description='Create invitations from a guest list'
  )
  parser.add_argument('guests',
                      type=argparse.FileType('r'),
                      help='Text file containing the guest names')
  return parser.parse_args()


def create_page(doc, guest):
  """Append one invitation for guest to doc, followed by a page break.

  The guest name and the date are added as level-0 headings; the connecting
  phrases are added as level-1 headings.
  """
  invitation_lines = (
      ('It would be a pleasure to have the company of', 1),
      (guest, 0),
      ('at 11010 Memory Lane on the Evening of', 1),
      ('April 1st', 0),
      ('at 7 o\'clock', 1),
  )
  for text, level in invitation_lines:
    doc.add_heading(text, level)
  doc.add_page_break()


if __name__ == '__main__':
  main()

Overwriting invitations.py


In [None]:
!python invitations.py materials/guests.txt

saved as invitation.docx


## Brute-Force Password Breaker

In [None]:
%%writefile pdf_decrypter.py
#!/usr/bin/env python
"""Attempt to decrypt a file using words in the dictionary."""
import argparse

import nltk
from nltk.corpus import words
import PyPDF2 as pdf
from PyPDF2.utils import PdfReadError

# Fetch the NLTK 'words' corpus; this call runs every time the script starts.
nltk.download('words')

# Candidate passwords: the distinct entries of the corpus, used exactly as
# NLTK provides them (no case normalisation is applied here).
DICTIONARY = set(words.words())

def main():
  """Try to crack the PDF named on the command line and report the result."""
  args = get_args()
  # argparse opened the PDF; make sure it is closed once the search ends.
  with args.file:
    key = find_key(args.file)
  if key is None:
    print('Failed to find encryption key.')
  else:
    print(f'Found encryption key: {key}')


def get_args():
  """Parse the command line: a single positional argument naming the PDF,
  which argparse opens for binary reading."""
  arg_parser = argparse.ArgumentParser(
      description='Attempt to crack an encrypted PDF')
  arg_parser.add_argument(
      'file',
      help='PDF file to be cracked',
      type=argparse.FileType('rb'),
  )
  parsed = arg_parser.parse_args()
  return parsed


def find_key(file):
  """Try every word in DICTIONARY as the password for an encrypted PDF.

  Args:
    file: binary file object positioned on the PDF to crack.

  Returns:
    The first word that unlocks the PDF, or None when no word works or the
    file cannot be read as a PDF.
  """
  # Parse the PDF once; rebuilding the reader for every candidate word is the
  # dominant cost of the search.
  try:
    pdf_reader = pdf.PdfFileReader(file)
  except PdfReadError:
    return None

  for word in DICTIONARY:
    try:
      # decrypt() returns 0 for a wrong password, so pages are only touched
      # once a candidate has actually been accepted.
      if pdf_reader.decrypt(word):
        # Reading the first page confirms the document really is readable.
        pdf_reader.getPage(0).extractText()
        return word
    except PdfReadError:
      continue
  return None


if __name__ == '__main__':
  main()


Overwriting pdf_decrypter.py


In [None]:
!chmod +x pdf_decrypter.py

In [None]:
# This takes like 30 minutes
!./pdf_decrypter.py materials/encrypted.pdf

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
Found encryption key: rosebud
