In [290]:
import pandas as pd # For data manipulation
import unicodedata # For ensuring letter uniformity
import numpy as np 
import re # For string manipulation
from PyPDF2 import PdfReader # For reading in and handling PDFs

#### **Approach 1 (BibTex)**

In [291]:
def normalize_name(input_string):
    """ Tiny function to remove any non-standard like Á to A """
    norm_name = unicodedata.normalize('NFKD', input_string)
    return ''.join([letter for letter in norm_name if not unicodedata.combining(letter)])

def capitalize(input_string): # Capitalize first letter (like 'van' in Dutch names)
    """ Capitalize the first letter like 'van' to 'Van' to sort properly """
    return input_string[0].upper()+input_string[1:]

**Read file and get names**

In [292]:
with open('BibTeX_Names.txt', 'r') as file: # Get BibTeX names
    contents = file.read()
names = [name.strip() for name in contents.split(' and ')] # Separate by 'and'

df = pd.DataFrame(names, columns=['Name'])

print(f'There are {len(df)} names in total, some of which are non-unique')

There are 3614 names in total, some of which are non-unique


**Remove duplicates and sort (no letter conversion nor capitalization)**

In [293]:
df_copy = df.copy()
df_copy = df_copy.drop_duplicates() # Remove duplicate
df_copy = df_copy.sort_values(by=['Name']) # Sort alphabetically
print(f'There are {len(df_copy)} unique authors, meaning there were {3614-3529} non-unique names')

There are 3529 unique authors, meaning there were 85 non-unique names


In [294]:
middle_name = df_copy.iloc[len(df_copy) // 2]['Name'] # Find middle name

print(f'Given the odd number of authors, the middle-most author is {middle_name} with no changes to the names')

Given the odd number of authors, the middle-most author is Liu, J. with no changes to the names


**Remove duplicates and sort (with letter conversion and capitalization)**

In [295]:
df_copy = df.copy()
df_copy['Name'] = df_copy['Name'].apply(normalize_name) # Remove funky letters
df_copy['Name'] = df_copy['Name'].apply(capitalize) # Capitalize
df_copy = df_copy.drop_duplicates() # Remove duplicate
df_copy = df_copy.sort_values(by=['Name']) # Sort alphabetically
print(f'There are {len(df_copy)} unique authors, meaning there were {3614-3529} non-unique names')

There are 3529 unique authors, meaning there were 85 non-unique names


In [296]:
middle_name = df_copy.iloc[len(df_copy) // 2]['Name'] # Find middle name

print(f'Given the odd number of authors, the middle-most author is {middle_name} with capitalization and letter conversion')

Given the odd number of authors, the middle-most author is Link, K. with capitalization and letter conversion


#### **Approach 2 (PDF)**

In [297]:
reader = PdfReader("authors-acknowledgements-v5.pdf")

In [298]:
def reform(input_string):
    
    """ Function taking in the page and then reform it. Remove funny characters slipping through,
        remove numbers, remove stuff in parantheses, remove non-standard letters, remove 'and' 
        and reform some names that break upon reading. Names break often during reading the file, 
        so a name like 'Gustav' often becomes 'G ustav'. """
    
    string = input_string
    special_char = ['´','`','∗','˜','¨','˘','¸','ˇ','’','-','ˆ','˙']
    for char in special_char:
        string = string.replace(char,'')
    string = string.replace('\n',' ') # Remove line shift
    string = string.replace('\x03',' ') # Remove this funny thing
    string = ''.join(filter(lambda x: not x.isdigit(), string)) # Remove numbers
    string = re.sub(r"\([^()]*\)",',', string) # Remove parantheses
    string = string.replace('JR.','JR') # Special case
    string = normalize_name(string) # Remove funny letters
    string = re.sub(r'\bs\b\s*[AND]\b','SAND',string,flags=re.IGNORECASE) # Second special case of 'S AND' breaking
    string = re.sub(r'\b[AND]\b','',string) # Remove 'and'
    string = re.sub(r'(?<=\b[A-Za-z])\s(?![.])','', string) # Fixes the name-breaking by patching space
    pattern = r'[A-Za-z]\.'
    match = re.search(pattern, string) # Check for the pattern
    return string[match.start():]

def no_spaces(input_string): # Remove unnecessary space
    no_spaces = input_string.replace(' ','')
    return no_spaces

def period(input_string): # Put in punctuation after every name
    names = input_string.split() # Split
    names_ = [name+'.' if name[-1] not in ['.'] else name for name in names]
    return ''.join(names_) # Rejoin the splitted names

def stop_string(input_string, stop): 
    """ Function for stopping PDF reading at the last name """
    pos = input_string.find(stop) # Find the last name if it there
    if pos != -1: # Check if it is at the back
        return input_string[:pos+len(stop)] # Take only the page uptill the last name
    else:
        return input_string 
    
def last_names(name):
    names = list(name.split('.')) # Split names
    last_names = [last_name for last_name in names if len(last_name) > 1]
    return '.'.join(last_names) # Rejoin and get the last names only

**Loop over pages, reformat text and get dataframe**

In [299]:
dataframes = []

for i in range(0,11):
    text = reform(stop_string(reader.pages[i].extract_text(),'W OUDT')) # Read page and stop at 'W OUDT' (last page)
    names = [name.strip() for name in text.split(',')] # Separate names at ,
    df = pd.DataFrame(names, columns=['Name'])

    df.replace({'':np.nan,' ':np.nan,'  ':np.nan}, inplace=True) # Convert 'empty' cells to proper empty
    df = df.dropna(how='all') # Remove empty cells
    df['Name'] = df['Name'].apply(period) # Set punctuations
    df['Name'] = df['Name'].apply(no_spaces) # Remove spaces
    df['Last'] = df['Name'].apply(last_names) # Get last names
    
    dataframes.append(df) # Add dataframe of page to list

df = pd.concat(dataframes) # Combine all dataframe
print(f'There are {len(df)} names in total, some of which are non-unique')

There are 3612 names in total, some of which are non-unique


**Remove duplicates and sort**

In [300]:
df = df.drop_duplicates() # Remove duplicate
df = df.sort_values(by=['Last','Name']) # Sort alphabetically
print(f'There are {len(df)} unique authors, meaning there were {3612-3516} non-unique names')

There are 3516 unique authors, meaning there were 96 non-unique names


**Finding middle-most name**

In [301]:
middle_name_1 = df.iloc[len(df) // 2]['Name'] # Find middle name
middle_name_2 = df.iloc[len(df) // 2 - 1]['Name'] # Find middle name

print(f'Given the even number of authors, the middle-most authors are {middle_name_1} and {middle_name_2}')

Given the even number of authors, the middle-most authors are Q.R.LIU. and J.LIU.
