In [93]:
from PyPDF2 import PdfReader
import re
import pandas as pd
import math
import numpy as np
reader = PdfReader("authors.pdf")
number_of_pages = len(reader.pages)

**Approach 1**

In [119]:
def reform(input_string):
    string = input_string
    special_char = ['´','`','∗','˜','¨','˘','¸','ˇ','’','-','ˆ','˙']
    for char in special_char:
        string = string.replace(char,' ')
    string = ''.join(filter(lambda x: not x.isdigit(), string)) # Remove numbers
    string = string.replace('\n', '') # Remove line shifts
    string = string.replace('\x03','')
    string = re.sub(r'\([^)]*\)', ',', string) # Remove parantheses
    string = string.replace('JR.','JR')
    string = re.sub(r'\b[aA][nN][dD]\b','', string) # Remove ' and '
    string = string.replace('Ł','L')
    pattern = r'[A-Za-z]\.' # Set the pattern (from A to z)
    match = re.search(pattern, string) # Check
    if match:
        new_string = string[match.start():]
    else:
        new_string = string
    return new_string

def period(input_string): # Input periods where necessary
    def process_name(name):
        parts = name.split()
        if parts and not parts[0].endswith('.') and len(parts[0]) > 0: # Put period after first name if not present
            parts[0] += '.'
        if parts and parts[-1].endswith('.'):
            parts[-1] = parts[-1][:-1]
        return ' '.join(parts)
    names = [name.strip() for name in input_string.split(',')]
    processed_names = [process_name(name) for name in names]
    return ', '.join(processed_names)

def no_spaces(input_string): # Remove all spaces
    no_spaces = input_string.replace(' ', '')
    return no_spaces

def extract_sort(names_string): # Extract names to list and then sort them by last name
    names_list = [name.strip() for name in names_string.split(',') if name.strip()]
    def extract_names_for_sorting(name):
        parts = name.split('.')
        last_name = parts[-1]
        first_names = parts[:-1]
        return (last_name, first_names, name)
    sorted_names = sorted(names_list, key=extract_names_for_sorting)
    return sorted_names

def stop_string(input_string, substring):
    pos = input_string.find(substring)
    if pos != -1:
        return input_string[:pos + len(substring)]
    else:
        return input_string

In [120]:
all_authors = ''

for i in range(0,11):
    page = reader.pages[i]
    text = page.extract_text()
    if i == 10: # Check if it's the last page
        names_only = reform(stop_string(text,'OUDT')) # Get names
    else:
        names_only = reform(text)

    period_names = period(names_only) # Add punctation to unabbreviated first names and remove punctuation from last names
    no_space_names = no_spaces(period_names) # Remove spaces
    all_authors = all_authors + no_space_names
    
all_authors = extract_sort(all_authors)

In [121]:
all_authors_unique = [i for n, i in enumerate(all_authors) if i not in all_authors[:n]]
print(len(all_authors))
print(len(all_authors_unique))

3611
3511


In [122]:
all_authors_unique[-1]

'ANDN.ZYWUCKA'

In [123]:
print(all_authors_unique[int(len(all_authors_unique)/2)])
print(all_authors_unique[int(len(all_authors_unique)/2)-1])

T.S.LI
T.P.LI


**Approach 2**

In [106]:
with open('names.txt', 'r') as file:
    contents = file.read()
names = [name.strip() for name in contents.split(' and ')]
df = pd.DataFrame(names, columns=['Name'])
df = df.drop_duplicates()
df = df.sort_values(by=['Name'])
print(len(df['Name']))

3529


In [109]:
middle_index = len(df) // 2
if len(df) % 2 == 0:
    middle_values = df.iloc[middle_index - 1: middle_index + 1]
else:
    middle_values = df.iloc[middle_index]
print("Middle-most value(s):")
print(middle_values)

'Cardillo, M.'

In [118]:
df.iloc[len(df) // 2]

Name    Liu, J.
Name: 577, dtype: object