In [1]:
from PyPDF2 import PdfReader
import re
import math
# Open the source PDF once and record how many pages it has
PDF_PATH = "authors.pdf"
reader = PdfReader(PDF_PATH)
number_of_pages = len(reader.pages)

**Approach 1**

In [2]:
def reform(input_string):
    """Clean raw page text and return it starting at the first initial.

    Steps, applied in this order:
      1. delete a fixed set of strange characters (stray accent marks,
         hyphen, backtick, the control character 0x03);
      2. delete every digit;
      3. turn newlines into spaces;
      4. replace each parenthesised group with a comma;
      5. rewrite 'JR.' as 'JR';
      6. replace the standalone word 'and' (any letter case) with a space;
      7. drop everything before the first "letter followed by a period".

    If no letter-plus-period is found, the cleaned text is returned whole.
    """
    cleaned = input_string
    for junk in ('¨', '-', '´', '˙', '˜', '`', '\x03', 'ˇ'): # Strange characters
        cleaned = cleaned.replace(junk, '')
    cleaned = ''.join(ch for ch in cleaned if not ch.isdigit()) # Remove numbers
    cleaned = cleaned.replace('\n', ' ') # Remove line breaks
    cleaned = re.sub(r'\([^)]*\)', ',', cleaned) # Parenthesised group -> comma
    cleaned = cleaned.replace('JR.', 'JR')
    cleaned = re.sub(r'\b[aA][nN][dD]\b', ' ', cleaned) # Remove 'and'
    first_initial = re.search(r'[A-Za-z]\.', cleaned) # First "X." in the text
    if first_initial is None:
        return cleaned
    return cleaned[first_initial.start():]

def period(input_string): # Normalise periods in a comma-separated list of names
    """Add or strip periods on each name of a comma-separated string.

    For every comma-separated entry: the first word gets a trailing period if
    it lacks one, and one trailing period is removed from the last word.
    Entries are re-joined with ', '; empty entries stay as empty strings.
    """
    def fix_name(raw_name):
        words = raw_name.split()
        if not words:
            return ''
        if not words[0].endswith('.'): # Put period after first name if not present
            words[0] = words[0] + '.'
        if words[-1].endswith('.'): # Last word must not end with a period
            words[-1] = words[-1][:-1]
        return ' '.join(words)
    return ', '.join(fix_name(chunk.strip()) for chunk in input_string.split(','))

def no_spaces(input_string): # Strip every space character
    """Return input_string with all ' ' characters deleted; other whitespace is kept."""
    return ''.join(input_string.split(' '))

def extract_sort(names_string): # Split on commas, then order the names by last name
    """Return the non-empty names of a comma-separated string, sorted.

    Each name is split on '.'; the final piece is treated as the last name
    and the preceding pieces as the first names. Names are ordered by last
    name, then by the list of first names, then by the full name string.
    """
    def sort_key(full_name):
        *initials, surname = full_name.split('.')
        return (surname, initials, full_name)
    entries = []
    for chunk in names_string.split(','):
        chunk = chunk.strip()
        if chunk: # Skip blank entries
            entries.append(chunk)
    entries.sort(key=sort_key)
    return entries

def stop_string(input_string, substring):
    """Cut input_string right after the first occurrence of substring.

    The returned text includes the substring itself. When the substring
    does not occur, input_string is returned unchanged.
    """
    try:
        cut = input_string.index(substring)
    except ValueError:
        # Substring absent: nothing to cut
        return input_string
    return input_string[:cut + len(substring)]

In [3]:
#names_only = reform(stop_string(text,'OUDT')) # Just get names
#period_names = period(names_only) # Add punctuation to unabbreviated first names and remove punctuation from last names
#no_space_names = no_spaces(period_names) # Remove spaces
#sorted_names = extract_sort(no_space_names) # Sorted names and add to list

In [4]:
# Collect the author names from the leading pages of the PDF, clean them page
# by page, and sort the combined list by last name.
AUTHOR_PAGE_COUNT = 11       # Pages 0..10 are processed (presumably the pages holding the author list)
LAST_AUTHOR_MARKER = 'OUDT'  # On the last processed page, text is cut right after this substring

page_chunks = []  # One cleaned, comma-separated chunk of names per page

for i in range(AUTHOR_PAGE_COUNT):
    page = reader.pages[i]
    text = page.extract_text()

    if i == AUTHOR_PAGE_COUNT - 1: # Check if it's the last page
        names_only = reform(stop_string(text, LAST_AUTHOR_MARKER)) # Get names, ignoring what follows the marker
    else:
        names_only = reform(text)

    period_names = period(names_only) # Add punctuation to unabbreviated first names and remove punctuation from last names
    no_space_names = no_spaces(period_names) # Remove spaces
    page_chunks.append(no_space_names)

# Join the pages with ',' so the last name of one page can never fuse with the
# first name of the next page when a page does not end in a comma. Any empty
# entries this creates are discarded by extract_sort, which skips blank names.
all_authors = extract_sort(','.join(page_chunks))

In [5]:
# Drop repeated names, keeping the first occurrence of each so the order of
# all_authors is preserved. dict.fromkeys keeps insertion order and needs a
# single pass, instead of rescanning a slice of the list for every element.
all_authors_unique = list(dict.fromkeys(all_authors))

In [11]:
# Number of entries in the sorted author list; duplicates are still included here
len(all_authors)

3611

In [10]:
# Middle position of the de-duplicated list. With an even number of names this
# is a half-integer, so the upper and lower middle names are both shown
# (upper first); with an odd number the same name is printed twice.
Index = (len(all_authors_unique) - 1) / 2
upper_middle = all_authors_unique[math.ceil(Index)]
lower_middle = all_authors_unique[math.floor(Index)]
print(upper_middle, lower_middle, sep='\n')

T.S.LI
T.P.LI


**Approach 2**