# About

*Purpose:* split a concatenated PDF of applications, as downloaded from DTU, into individual application PDFs. Also creates a table of applicant details in tab-separated values format (.tsv).

**Usage:** run all cells in order, after replacing "my_concatenated.pdf" in the last cell with the path to your concatenated PDF.

*Prerequisites:*
* The string "Latest Submission Medium" must appear on the first page of each application in your concatenated PDF, and only there.
* Linux utilities: `pdftotext` (part of the poppler-utils package; available by default in Ubuntu).
* Python packages: PyPDF2, pdfrw, tqdm

In [None]:
from itertools import starmap
import pathlib
from functools import partial
import sys
import os
import subprocess

from tqdm.notebook import tqdm
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfrw import PdfReader, PdfWriter

In [None]:
def get_num_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``."""
    reader = PdfFileReader(pdf_path)
    page_count = reader.getNumPages()
    return page_count

In [None]:
def get_page_text(pdf_path, page_num):
    """Extract the text of a single page with the ``pdftotext`` utility.

    Args:
        pdf_path: Path to the PDF file to read.
        page_num: Page number, passed to ``pdftotext`` as both the first
            (``-f``) and last (``-l``) page to convert.

    Returns:
        A ``(page_num, text)`` tuple where ``text`` is the raw ``bytes``
        that ``pdftotext`` wrote to standard output.

    Raises:
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status.
    """
    # Build the argument list explicitly instead of splitting a formatted
    # string on spaces, so that paths containing spaces stay one argument.
    command = [
        'pdftotext',
        '-f', '%d' % page_num,
        '-l', '%d' % page_num,
        '%s' % pdf_path,
        '-',  # write the extracted text to standard output
    ]
    return page_num, subprocess.check_output(command)

In [None]:
def find_pages_with_application_start(pdf_path):
    """Return the page numbers on which a new application starts.

    A page counts as the start of an application when its text contains
    the marker string "Latest Submission Medium".

    Args:
        pdf_path: Path to the concatenated PDF.

    Returns:
        A list of 1-based page numbers, in ascending order.
    """
    def is_new_app_start(page_text):
        # Line breaks are stripped before searching for the marker.
        flattened = page_text.decode('utf-8').replace('\r', '').replace('\n', '')
        return 'Latest Submission Medium' in flattened

    num_pages = get_num_pages(pdf_path)
    # pdftotext numbers pages from 1, so scan pages 1..num_pages. Asking for
    # page 0 does not address a real page (that request is what used to put
    # a spurious 0 at the head of the result), and stopping at num_pages - 1
    # would leave the final page unchecked.
    page_numbers = range(1, num_pages + 1)
    pages_with_text = tqdm(
        map(partial(get_page_text, pdf_path), page_numbers),
        desc='Page',
        total=num_pages)
    return [
        page_num
        for page_num, page_text in pages_with_text
        if is_new_app_start(page_text)
    ]


In [None]:
def fields_to_extract_for_each_candidate():
    """Return the field labels looked up for every candidate, in output order."""
    field_labels = [
        'Last Name',
        'First Name',
        'Year of Birth',
        'City',
        'Citizenship of',
        'Have you been previously employed by DTU?',
    ]
    return field_labels

In [None]:
def extract_one_application(output_dir, pdf_path, page_from, page_to):
    """Write one application to its own PDF and return its summary row.

    The output file is named after the candidate's extracted field values
    joined with underscores, and is placed in ``output_dir`` (created if
    it does not exist).

    Args:
        output_dir: Directory that receives the individual PDF.
        pdf_path: Path to the concatenated PDF.
        page_from: 1-based number of the application's first page.
        page_to: 1-based number of the page just after the application's
            last page (exclusive end).

    Returns:
        The candidate's field values joined by tabs and terminated by a
        newline. A field whose label is not found yields an empty string.
    """
    def extract(out_path, in_path, first_index, stop_index):
        # pdfrw pages are indexed from 0; stop_index is exclusive.
        writer = PdfWriter()
        reader = PdfReader(in_path)
        for page_index in range(first_index, stop_index):
            writer.addpage(reader.pages[page_index])
        writer.write(out_path)

    # Extract each page's text once instead of once per field per use.
    lines_per_page = [
        get_page_text(pdf_path, page)[1].decode('utf-8').split('\n')
        for page in range(page_from, page_to)
    ]

    def get_field(field):
        # The value is the line directly after the first line equal to the label.
        for lines in lines_per_page:
            if field in lines:
                value_index = lines.index(field) + 1
                if value_index < len(lines):
                    return lines[value_index]
        # Label missing (or nothing follows it): keep the column, leave it empty,
        # rather than failing later when the values are joined.
        return ''

    values = [get_field(field) for field in fields_to_extract_for_each_candidate()]

    # A path separator inside a value would otherwise be read as a directory.
    file_stem = '_'.join(values).replace('/', '-').replace(os.sep, '-')

    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    extract(
        out_path=os.path.join(output_dir, file_stem + '.pdf'),
        in_path=pdf_path,
        first_index=page_from - 1,
        stop_index=page_to - 1)
    return '\t'.join(values) + '\n'


# Example usage
# extract_one_application(output_dir='.', pdf_path='./print-4.pdf', page_from=163, page_to=168)

In [None]:
def extract_all_applications(output_dir, pdf_path, summary_tsv_path):
    """Split a concatenated PDF into one PDF per application and write a summary.

    Args:
        output_dir: Directory that receives the individual application PDFs.
        pdf_path: Path to the concatenated PDF.
        summary_tsv_path: Path of the tab-separated summary file to write.
            It gets a header row followed by one row per application.
    """
    # extract_one_application treats its end page as exclusive (1-based), so
    # the sentinel closing the last application must be one past the final
    # page; using the page count itself would drop the document's last page.
    end_sentinel = get_num_pages(pdf_path) + 1
    boundaries = find_pages_with_application_start(pdf_path) + [end_sentinel]
    tasks = [
        (output_dir, pdf_path, start_page, next_start_page)
        for start_page, next_start_page in zip(boundaries, boundaries[1:])
    ]
    # The extracted text is decoded as UTF-8, so write the summary as UTF-8
    # too instead of relying on the platform's default encoding.
    with open(summary_tsv_path, 'w', encoding='utf-8') as summary_f:
        summary_f.write('\t'.join(fields_to_extract_for_each_candidate()) + '\n')
        for candidate_fields in tqdm(starmap(extract_one_application, tasks),
                                     desc='Candidate',
                                     total=len(tasks)):
            summary_f.write(candidate_fields)

In [None]:
# Split ./my_concatenated.pdf into per-application PDFs under ./out and
# write the applicant table to ./summary.tsv.
extract_all_applications(output_dir='./out', pdf_path='./my_concatenated.pdf', summary_tsv_path='./summary.tsv')