# Exercise 4 (Extra) - Extracting Author Names

In [1]:
# Import general libraries
import numpy as np
import pandas as pd

# PDF and string stuff
import pdfplumber as pdfp
import re

In [2]:
# Path to the PDF holding the author list (relative to the working directory)
path_to_file = 'authors-acknowledgements-v5.pdf'

# Open the file with pdfplumber; later cells read pages through this `pdf` handle
# NOTE(review): the handle is never closed in the cells shown here
pdf = pdfp.open(path_to_file)

## Extract the text from the pdf

In [3]:
# First we define a function that given a bounding box and page number, returns a string with the text
def pdf_to_string(page_no, bound_box, document=None):

    """
    Extract the text inside a bounding box on one page of a PDF.

    Inputs:
    page_no = the page number you want to extract text from (0 indexed);
              it is converted with int(), so 3 and 3.0 select the same page
    bound_box = the bounding box to extract text from on the form (left, top, right, bottom)
    document = optional object exposing `.pages` (such as the value returned by
               pdfp.open); when None (the default) the notebook-level `pdf` is used

    Returns:
    a string with all characters within the bounding box; an empty string is
    returned when the extraction yields nothing (None or '')
    """

    # Fall back to the notebook-level PDF so existing two-argument calls keep working
    source = pdf if document is None else document

    # Extract the page
    page = source.pages[int(page_no)]

    # Crop page
    page_cropped = page.within_bbox(bound_box)

    # Get text; coerce a missing result (None) to '' so that callers can always
    # treat the return value as a string, e.g. when joining several pages
    return page_cropped.extract_text() or ''

In [4]:
# Page size; the first page's dimensions are used for every crop box below
w = pdf.pages[0].width
h = pdf.pages[0].height

# Page 1: skip the top sixth of the page. This starts the per-page text list.
all_text = [pdf_to_string(page_no=0, bound_box=(0, (1/6)*h, w, h))]

# Pages 2 to 10 (indices 1..9): skip only the top twelfth
all_text.extend(
    pdf_to_string(page_no=page_index, bound_box=(0, (1/12)*h, w, h))
    for page_index in range(1, 10)
)

# Page 11 (index 10): same top edge, but the box ends at 55% of the page height
all_text.append(pdf_to_string(page_no=10, bound_box=(0, (1/12)*h, w, 0.55*h)))

In [5]:
# Concatenate the per-page strings into one string. A comma is inserted at every
# page boundary, presumably so the last name on one page stays separated from
# the first name on the next.
main_string = ','.join(all_text)

## Clean the text from digits, spaces, extra commas, accents etc.

Quick regular-expression test: compare the input string with the output below to see the difference the pattern makes.

In [6]:
# Try the "AND"-removal pattern on a small example. The match includes the space
# or digit directly before "AND", so that character is removed as well.
test = 'anderson c.p.,620AND Brandt D.p.,30'
re.sub(r"[ \d](AND)", "", test) # other candidate pattern: r"(and)[ A-Z]"

'anderson c.p.,62 Brandt D.p.,30'

In [43]:
def clean_string(string):
    """
    Strip everything that is not part of an author name from the extracted text.

    Inputs:
    string = one string with all contents that needs to be cleaned

    Returns:
    a new string in which, in this order:
      - every "AND" directly preceded by a space or a digit is removed,
        together with that preceding character
      - all digits are removed (these are references in the pdf)
      - every parenthesised group without nested parentheses is replaced
        by a comma (these are scientist groups in the pdf)
      - new line characters are removed
      - leftover accent marks and symbols are removed and 'Ł' becomes 'L'
    """

    # NOTE(review): this pattern also matches a word that merely starts with
    # "AND" after a space (" ANDERSON" becomes "ERSON") - confirm that this
    # cannot happen in the input
    without_and = re.sub(r"[ \d](AND)", "", string)

    # Keep every character that is not a digit
    without_digits = ''.join([ch for ch in without_and if not ch.isdigit()])

    # A group in parentheses becomes a single comma
    text = re.sub(r"\([^()]*\)", ",", without_digits)

    # Characters to delete outright: the line break first, then the accent
    # remnants that show up as stand-alone symbols after the conversion to text
    unwanted = ('\n',
                '´', '`', '∗', '˜', '¨', '˘', '¸', 'ˇ', '’', '§', '‡', '†', '-', 'ˆ', '˙')
    for symbol in unwanted:
        text = text.replace(symbol, '')

    # Map the stroked L onto a plain L
    return text.replace('Ł', 'L')

In [44]:
# Run the cleaning step on the full text extracted from the PDF
clean_main_string = clean_string(main_string)

In [45]:
#clean_main_string #uncomment to display the cleaned string

The string is now clean, except for some extra commas and some names that begin with a space.

In [46]:
# Split the string into a list of strings, where each element is a candidate author
authors_list = clean_main_string.split(',')

# Keep only real names. Each fragment is stripped of surrounding whitespace, and
# fragments that are empty or whitespace-only (arising from doubled commas such
# as ",," or ", ,") are dropped. Checking only `!= ''` and removing a single
# leading space would let a fragment like ' ' through as an empty author and
# would keep trailing spaces, which makes identical names compare as different.
only_authors = [fragment.strip() for fragment in authors_list if fragment.strip()]

In [47]:
#only_authors #uncomment to see the list of authors

# Test for unique authors

In [48]:
# Number of distinct names (np.unique collapses repeated entries)
N_uniq = np.unique(only_authors).size

# Surplus entries: length of the list minus the number of distinct names
N_non_uniq = len(only_authors) - N_uniq

# Report the three counts, one per line
print(
    f'There are {len(only_authors)} authors in the list',
    f'There are {N_uniq} unique authors in the list.',
    f'This means there are {N_non_uniq} non unique authors',
    sep='\n',
)

There are 3612 authors in the list
There are 3512 unique authors in the list.
This means there are 100 non unique authors


# Alphabetizing

We sort by last name and then by initials. To do this, we find the last dot in each name (the end of the initials) and move the initials behind the last name. Afterwards Python's `sorted` can alphabetize the list.

In [49]:
# Rewrite every name as "lastname initials" so that a plain sort orders by last name
last_names_first = []

for author in only_authors:

    # Split at the final dot: everything up to and including that dot is the
    # initials, what follows is the last name. Without any dot, the whole
    # name counts as the last name and the initials are empty.
    head, dot, surname = author.rpartition('.')
    initials = head + dot

    # Last name first, then a space, then the initials
    reordered = surname + ' ' + initials

    # Drop one leading space, if present
    if reordered.startswith(' '):
        reordered = reordered[1:]

    last_names_first.append(reordered)

In [50]:
# Alphabetize a copy of the reordered names; the original list is left untouched
sorted_authors = list(last_names_first)
sorted_authors.sort()

In [55]:
#sorted_authors #uncomment to see the sorted list

In [16]:
# Find the author(s) in the middle of the alphabetized list.
# The parity is checked instead of assumed, an empty list is handled, and each
# message is a single literal so that no source indentation leaks into the
# printed text (a backslash continuation inside a string keeps the next line's
# leading spaces).
n_sorted = len(sorted_authors)
mid_sorted = n_sorted // 2

if n_sorted == 0:
    print('There are no sorted authors, so there is no middle author.')
elif n_sorted % 2 == 0:
    # Even count: positions n/2 - 1 and n/2 (zero indexed) share the middle
    print(f'There is an even number ({n_sorted}) of sorted authors, so two authors share the middle.')
    print('The middle author is', sorted_authors[mid_sorted])
    print('The other middle author is', sorted_authors[mid_sorted - 1])
else:
    # Odd count: position n // 2 (zero indexed) is the single middle entry
    print(f'There is an odd number ({n_sorted}) of sorted authors, so there is a single middle author.')
    print('The middle author is', sorted_authors[mid_sorted])

There is an even number (3612) of sorted authors.       Thus we will find one author in this position
The middle author is LI C.K.
The other middle author is LI B.


In [17]:
### SORT BY INITIAL FOR FUN ###

# Sort the names as they are ("initials lastname") once and reuse the result,
# instead of sorting the full list again for every lookup
authors_by_initial = sorted(only_authors)
n_by_initial = len(authors_by_initial)
mid_by_initial = n_by_initial // 2

# The parity is checked instead of assumed, an empty list is handled, and each
# message is a single literal so that no source indentation leaks into the output
if n_by_initial == 0:
    print('There are no authors, so there is no middle author.')
elif n_by_initial % 2 == 0:
    # Even count: positions n/2 - 1 and n/2 (zero indexed) share the middle
    print(f'There is an even number ({n_by_initial}) of sorted authors, so two authors share the middle.')
    print('The middle author is', authors_by_initial[mid_by_initial])
    print('The other middle author is', authors_by_initial[mid_by_initial - 1])
else:
    # Odd count: position n // 2 (zero indexed) is the single middle entry
    print(f'There is an odd number ({n_by_initial}) of sorted authors, so there is a single middle author.')
    print('The middle author is', authors_by_initial[mid_by_initial])

There is an even number (3612) of sorted authors.       Thus we will find one author in this position
The middle author is K.HIDAKA
The other middle author is K.HERNER
