# GLANSIS - PDF Keyword Extractor
Following script will pull keywords from PDFs. Keywords will be cleaned and scientific and comman names will be added before being added to Endnote Excel sheet for bulk upload.

### Upload Libraries

In [3]:
# DO NOT EDIT
import fitz                  # open and pdf manipulation - package for PyMuPDF
import os                    # change working directory
import pandas as pd          # df manipulation
from datetime import date    # extract date for csv title output
import unicodedata           # convert extracted keywords to unicode - removes issues with duplication


### Add in local information:
- pdfs_folder_path: directory of PDFs on local drive
- setwd: location where excel is kept and location for output excel 
- ref_sheet: name of reference sheet
- names: scientific and common names

In [1]:
# Set folder paths for PDFs
pdfs_folder_path = r'C:/Users/redinger/Documents/GLANSIS_folder/Endnote_species/Nitellopsis obtusa/pdfs'

# Set working directory
setwd = r'C:/Users/redinger/Documents/GLANSIS_folder/Endnote_species/Nitellopsis obtusa'

# Name of reference file
ref_sheet = 'C:/Users/redinger/Documents/GLANSIS_folder/Endnote_species/Nitellopsis obtusa/Nitellopsis_obtusa_refs_12-12-2023.xlsx'

# Enter scientific and common names (used for keywords)
names = 'Nitellopsis obtusa, starry stonewort' 


### Create function that finds and removes keywords from PDFs:
Following function opens first two pages of PDF using PyMuPDF package (fitz.open()). Then finds and prepares keywords to be added to Excel reference sheet.

In [2]:
# DO NOT EDIT
def keyword_find(filename):
    
    # Open file
    file = fitz.open(filename)
    
    # Read and block only the first two pages of PDF
    text = []
    for i, page in enumerate(file):
        if i > 1:
            break
        text += page.get_text("blocks")
    
    # Close file
    file.close()
    
    # Find block containing keywords - if no keywords in PDF, common & scientific names used
    for block in text:
        if block[4].lower().startswith('key-words:'):
            keywords = block[4][10:].strip()
            break
        elif block[4].lower().startswith('key words:'):
            keywords = block[4][10:].strip()
            break
        elif block[4].lower().startswith('keywords:'):
            keywords = block[4][9:].strip()
            break
        elif block[4].lower().startswith('key-words'):
            keywords = block[4][9:].strip()
            break
        elif block[4].lower().startswith('key words'):
            keywords = block[4][9:].strip()
            break
        elif block[4].lower().startswith('keywords'):
            keywords = block[4][8:].strip()
            break
        else:
            keywords = names
    
    # Clean keywords
    if not keywords == names:
        
        # Remove white space         
        keyword_strip = keywords.strip() 
        
        # Replace intermediate characters - this list is not exhaustive
        keywords_replace = keyword_strip.replace('\n', ', ').replace(' � ', ', ').replace(';', ',').replace('\xa0· , ', ', ').replace('\xa0· ', ', ').replace(' . ', ', ').replace(' / ', ', ').replace(' · ', ', ')
        #keywords_replace = re.sub(r'[\n;�\xa0· /.]', ', ', keyword_strip)
        
        # Combine keywords with scientific and common names
        clean_keywords = str(names) + ', ' + str(keywords_replace)
        
    else:
        clean_keywords = keywords
    
    return(clean_keywords)
        

### Removes duplicates: 
In case combining common and scientific names with keywords creates duplicates, this function will remove and duplicate words

In [4]:
#DO NOT EDIT

# Removes duplicates
def remove_duplicates_case_insensitive(input_string):
    
    # Create an empty set to keep track of lowercase elements encountered
    lowercased_set = set()
    
    # Create an empty list to store unique elements
    unique_elements = []
    
    # Split the input string
    elements = input_string.split(", ")
    
    # Iterate over each element
    for element in elements:
        
        # Normalize the element
        normalized_element = unicodedata.normalize("NFKD", element) 
        
        # Convert the normalized element to lowercase
        lowercased_element = normalized_element.lower()             
        
        # Check if the lowercase element is not in the set of encountered lowercase elements
        if lowercased_element not in lowercased_set:
            
            # Add the lowercase element to the set
            lowercased_set.add(lowercased_element)
            
            #  Append the original element to unique elements
            unique_elements.append(element)
            
    return ", ".join(unique_elements)


### Iterate through file containing PDFs:
Keywords will be removed from each PDF and added to empty dataframe.

In [5]:
# DO NOT EDIT

# Create empty dataframe
col_names = ["file.name", "pdf.keywords"]
keywords_df = pd.DataFrame(columns = col_names)

# Pull keywords from each PDF in folder file
for filename in os.listdir(pdfs_folder_path):
    if filename.endswith(".pdf"):
        
        # Create file path
        file_path = os.path.join(pdfs_folder_path, filename)
        
        # Extract keywords
        clean_keywords = keyword_find(file_path)
        
        # Remove duplicates
        unique_keywords = remove_duplicates_case_insensitive(clean_keywords)
        
        # Create a DataFrame row
        row = pd.DataFrame([[filename, unique_keywords]], columns = ["file.name", "pdf.keywords"])
        
        # Concatenate the row DataFrame
        keywords_df = pd.concat([keywords_df, row], ignore_index=True)


In [6]:
keywords_df

Unnamed: 0,file.name,pdf.keywords
0,8863-23139-1-PB.pdf,"Nitellopsis obtusa, starry stonewort, aquatic ..."
1,Boissezon-2018.pdf,"Nitellopsis obtusa, starry stonewort, , abunda..."
2,Brzozowski-2018.pdf,"Nitellopsis obtusa, starry stonewort, Bioindic..."
3,Brzozowski-2021.pdf,"Nitellopsis obtusa, starry stonewort, aquatic ..."
4,Bučas-2019.pdf,"Nitellopsis obtusa, starry stonewort, Macrophy..."
5,Bučas-2023.pdf,"Nitellopsis obtusa, starry stonewort, submerge..."
6,Cahill-2020.pdf,"Nitellopsis obtusa, starry stonewort"
7,Carver-2022.pdf,"Nitellopsis obtusa, starry stonewort"
8,cjfas-2019-0337.pdf,"Nitellopsis obtusa, starry stonewort"
9,Escobar-2018.pdf,"Nitellopsis obtusa, starry stonewort, Aquatic ..."


### Combine with Excel reference sheet:

In [7]:
# DO NOT EDIT

# Import reference sheet with blank Keywords column
refs = pd.read_excel(ref_sheet)

# Merge reference file with extract keywords based on name of PDF files
output = refs.merge(keywords_df, how = "left", left_on = "File Attachment", right_on = "file.name")

# Replace original Keywords column and drop new columns
output["Keywords"] = output["pdf.keywords"]
output.drop(["file.name", "pdf.keywords"], axis = 1, inplace = True)

# View output
output.head()


Unnamed: 0,Author,Year,Title,Journal,Volume,Issue,Pages,URL,DOI,Abstract,Keywords,File Attachment
0,"Boissezon, A., D. A. Joye, and T. Garcia",2018,Temporal and spatial changes in population str...,Botany Letters,165,1.0,103-114,,10.1080/23818107.2017.1356239,Nitellopsis obtusa usually inhabits deep lakes...,"Nitellopsis obtusa, starry stonewort, , abunda...",Boissezon-2018.pdf
1,"Carver, P., R. M. Wersal, and B. T. Sartain",2022,Small-plot evaluations of aquatic pesticides f...,Journal of Aquatic Plant Management,60,,79-84,,,Nitellopsis obtusa (Desv.) J. Groves (starry s...,"Nitellopsis obtusa, starry stonewort",Carver-2022.pdf
2,"Escobar, L. E., S. Mallez, M. McCartney, C. Le...",2018,Aquatic invasive species in the Great Lakes re...,Reviews in Fisheries Science & Aquaculture,26,1.0,121-138,,10.1080/23308249.2017.1363715,Aquatic invasive species (AIS) are of concern ...,"Nitellopsis obtusa, starry stonewort, Aquatic ...",Escobar-2018.pdf
3,"Escobar, L. E., D. Romero-Alvarez, D. J. Larki...",2019,Network analysis to inform invasive species sp...,Journal of Oceanology and Limnology,37,3.0,1037-1041,,10.1007/s00343-019-7208-z,"Often facilitated by human-mediated pathways, ...","Nitellopsis obtusa, starry stonewort",Escobar-2019.pdf
4,"Ginn, B. K., E. F. S. Dias, and T. Fleischaker",2021,Trends in submersed aquatic plant communities ...,Lake and Reservoir Management,37,2.0,199-213,https://www.tandfonline.com/doi/full/10.1080/1...,10.1080/10402381.2020.1859025,"Ginn BK, Dias EFS, Fleischaker T. 2021. Trends...","Nitellopsis obtusa, starry stonewort, Aquatic ...",Ginn-2021.pdf


### Export excel sheet:

In [8]:
# Export new file
output.to_excel('C:/Users/redinger/Documents/GLANSIS_folder/Endnote_species/Nitellopsis obtusa/Nitellopsis_obtusa_BulkUpload' + '_' + str(date.today()) + '.xlsx', index = False)


In [16]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


