### This notebook uses PyPDF2 to read the unique identifier (ME number) directly from each mortality data PDF file and uses the `os` module to rename each file in place with its corresponding identifier

In [None]:
# importing pandas (to create dataframes)
# importing os (enables modification of filenames)
# importing PdfReader from the PyPDF2 library (to read PDFs)
import pandas as pd
import os
from PyPDF2 import PdfReader


In [None]:
# Get the numeric user id of the account running this notebook - this links to the {uid}
# filepath below so that any user in the DOHMH server can run this code.
# os.geteuid() reports the same effective user id that the shell command `id -u` prints,
# but it is plain Python: no IPython "!" shell escape and no subprocess are needed.
uid = str(os.geteuid())  # kept as a string, which is what the shell command produced
uid

In [None]:
# Creating a list of the split mortality case files saved in the "abstractions_folder"
##  --> update link below with the folder filepath
abstractions_folder = f'/user/{uid}/file_path' #needs to be updated with the filepath

# Keep only PDF files: os.listdir() also returns sub-folders and hidden/system entries,
# which cannot be opened as PDFs further down.
# Sorting makes the processing order reproducible, because os.listdir() returns
# entries in arbitrary order.
pdf_file_names = sorted(
    entry
    for entry in os.listdir(abstractions_folder)
    if entry.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(abstractions_folder, entry))
)

# Building the dataframe from a dict guarantees that the column labelled 0 exists even
# when the folder holds no PDFs, so the later rename of column 0 to 'file_names' still works.
df_file_list = pd.DataFrame({0: pd.Series(pdf_file_names, dtype=object)})

In [None]:
# Giving the automatically numbered column (label 0) the descriptive name "file_names"
df_file_list = df_file_list.rename({0: 'file_names'}, axis='columns')

In [None]:
# Breaking every source filename into its individual components, using _ as the
# delimiter, and storing the resulting list of components in a column called "split"
df_file_list['split'] = df_file_list['file_names'].str.split(pat='_')

In [None]:
# Year of the mortality case: the third component (index 2) of the split filename
df_file_list['year'] = df_file_list['split'].str.get(2)

In [None]:
# Month and year in which cases are being processed (e.g., May2022):
# the second-to-last component (index -2) of the split filename
df_file_list['monyr_concat'] = df_file_list['split'].str.get(-2)

In [None]:
# Reading all PDF files in the abstraction folder and pulling the ME number out of each one.
# For every PDF file, the first 100 characters on the first page are split into
# whitespace-delimited tokens (stored in ME_list_start) and the first token containing
# 'ME#' supplies the identifier (stored in ME_list_end).
#
# Exactly one identifier is recorded per file, in the same order as df_file_list, so the
# identifiers can never shift onto the wrong file when a PDF has no match or several matches.

search = 'ME#'
ME_list_start = []     # per file: tokens from the first 100 characters of page 1
ME_list_end = []       # per file: the ME number (text following 'ME#')
files_without_ME = []  # files in which no ME number could be found

for item in df_file_list.file_names:
    reader = PdfReader(f'{abstractions_folder}/{item}')
    page = reader.pages[0]
    # split() with no argument splits on any whitespace, so a line break directly after
    # the ME number does not end up inside the identifier (and later in the filename)
    tokens = page.extract_text()[0:100].split()
    ME_list_start.append(tokens)

    # Looking for the ME number among this file's tokens
    ME_number = ''
    for token in tokens:
        # partition keeps only what follows the marker, wherever the marker sits in the token
        _, marker, after_marker = token.partition(search)
        if marker and after_marker:
            ME_number = after_marker
            break

    if ME_number:
        ME_list_end.append(ME_number)
    else:
        files_without_ME.append(item)

# Stop here, before any file is renamed, if an identifier is missing: continuing would
# either fail later with a length mismatch or attach identifiers to the wrong files.
if files_without_ME:
    raise ValueError(
        f"No '{search}' identifier found in the first 100 characters of: {files_without_ME}"
    )
    

In [None]:
# Adding the unique identifier numbers read from the PDFs as a new "ME_number" column
# (ME_list_end must hold one value per row of the dataframe)
df_file_list = df_file_list.assign(ME_number=ME_list_end)

In [None]:
# Building the target filename for every abstraction in a new "target_filename" column.
# The format is: ME#_YEAR_MONTHYEAR followed by the .pdf extension
me_part = df_file_list['ME_number']
year_part = df_file_list['year']
monyr_part = df_file_list['monyr_concat']
df_file_list['target_filename'] = me_part + '_' + year_part + '_' + monyr_part + '.pdf'

In [None]:
# Renaming the original abstraction files to the TARGET file names stored in the
# df_file_list dataframe. os.rename only changes the name; the contents of the files
# are not rewritten.
#
# Everything is validated before the first rename so that a problem never leaves the
# folder half-renamed, and so that no file is silently overwritten.

targets = df_file_list['target_filename']

# A missing filename component makes the whole target NaN; formatting that into a path
# would rename the file to the literal name "nan".
if targets.isna().any():
    missing = df_file_list.loc[targets.isna(), 'file_names'].tolist()
    raise ValueError(f'No target filename could be built for: {missing}')

# Two files sharing one target would overwrite each other (on Unix, os.rename replaces
# an existing destination file silently).
duplicate_rows = targets.duplicated(keep=False)
if duplicate_rows.any():
    clashes = df_file_list.loc[duplicate_rows, ['file_names', 'target_filename']]
    raise ValueError(f'Several files map to the same target filename:\n{clashes}')

# Pair source and target row by row (by position, so it does not depend on the
# dataframe index being 0..n-1) and make sure each path stays in the abstractions folder.
rename_pairs = [
    (f"{abstractions_folder}/{filename}", f"{abstractions_folder}/{target}")
    for filename, target in zip(df_file_list['file_names'], targets)
]

# Refuse to replace a file that already exists under the target name
for src, dst in rename_pairs:
    if src != dst and os.path.exists(dst):
        raise FileExistsError(f'Target file already exists, not renaming {src} to {dst}')

for src, dst in rename_pairs:
    if src != dst:  # a file that already carries its target name needs no change
        os.rename(src, dst)  # this finally does the replacement of the file names