This notebook is used to clean up the old seams pdf directory so that the pdfs have a consistent structure and naming, as well as match the metadata with minimal special characters etc that need to be dealt with when acessing the pdfs programatically. 

Everything works well except there seems to be one less pdf file in the output folder than the input folder. I cannot figure out what this pdf file is and it is difficult as the titles of the pdfs are changed. 

In [1]:

import pandas as pd

import shutil
import os

seams_pdf_folder = r'C:\Users\aspit\OneDrive\MHD NLP\Updated SEAMs'

seams_output_folder = r'C:\Users\aspit\Documents\test_seams'


In [2]:

df_meta = pd.read_csv(r'C:\Users\aspit\Git\MHDLab-Projects\NLP_MHD\data\SEAMS_metadata.csv', index_col=['ID'])

filepaths_original = df_meta.dropna(subset=['Filepath'])['Filepath']
filenames_original = [os.path.split(fp)[1].lower() for fp in filepaths_original]

print('scanning original directory')
original_files = []

for root, dirs, files in os.walk(seams_pdf_folder, topdown=False):
    for file in files:
        ext = os.path.splitext(file)[1]
        if ext == '.pdf' or ext=='.PDF':
            original_files.append(file.lower())
            if file.lower() not in filenames_original:
                print("File not in original filenames: " + file)
        else:
            print('Non pdf file found: ' + str(file))

print("pdfs in input dir: ", len(original_files))



scanning original directory
Non pdf file found: Bad scan info.docx
Non pdf file found: SEAMS Inventory Original.xlsx
pdfs in input dir:  1654


In [3]:

#Processing
df_meta['Title'] = df_meta['Title'].apply(str.strip).apply(str.title)
df_meta['Title'] = [s.replace('Mhd', 'MHD') for s in df_meta['Title']]
df_meta['Session Name'] = df_meta['Session Name'].apply(str.strip)
df_meta['SEAM'] = df_meta['SEAM'].apply(int)

#Downselect for debugging
# df_meta = df_meta[df_meta['SEAM'] == 21]
# df_meta = df_meta.where(df_meta['Session Name'] == 'Intro Materials').dropna(subset=['Session Name'])

df_meta


Unnamed: 0_level_0,SEAM,Session Name,Title,Author(s),Filepath,File Info
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,Diagnostics and Communications,Introduction,D. R. Whitehouse,3rd (Third) SEAM/1st (Session I) - System Desi...,OK
2,3,Diagnostics and Communications,Emission Of Microwaves From Plasma,George Bekefi,3rd (Third) SEAM/1st (Session I) - System Desi...,OK
3,3,Diagnostics and Communications,Nonthermal Radiation From Jupiter In The Decam...,"Alex G. Smith, T. D. Carr, and N. F. Six",3rd (Third) SEAM/1st (Session I) - System Desi...,OK
4,3,Diagnostics and Communications,The Measurement Of Shaped Probing Beams In Mic...,Paul H. Wolfert,3rd (Third) SEAM/1st (Session I) - System Desi...,OK
5,3,Diagnostics and Communications,A Versatile Microwave Diagnostic System For Pl...,F. J. F. Osborne,3rd (Third) SEAM/1st (Session I) - System Desi...,OK
...,...,...,...,...,...,...
1785,34,Diagnostics 'B',Optical Diagnostics Of Particles In High-Tempe...,"Olga S. Vaulina, Anatoli P. Nefedov, Oleg F. P...",34th (Thirty-Fourth) SEAM/8th (Session VIIIb) ...,OK
1786,34,Diagnostics 'B',Optical And Probe Diagnostics Of Charged Dust ...,"Olga S. Vaulina, Anatoli P. Nefedov, Oleg F. P...",34th (Thirty-Fourth) SEAM/8th (Session VIIIb) ...,OK
1787,34,Diagnostics 'B',Three-Dimensional Flow Visualization By Means ...,"S. V. Yakshin, I. V. Tikhonov, and V. M. Zubtsov",34th (Thirty-Fourth) SEAM/8th (Session VIIIb) ...,OK
1788,34,Addendum,Space Power-Propulsion Plant On MHD Generator ...,"V. S. Slavin, P. A. Zakharov, K. A. Finnikov, ...",34th (Thirty-Fourth) SEAM/9th (Session IX) - A...,OK


In [4]:

filename_stopwords = ["\'","\"", ".",",",":", "?" , "/"]

rel_filepaths_out = []

filepaths_new = pd.Series(index=df_meta.index, dtype=str)

for index, row in df_meta.dropna(subset=['Filepath']).iterrows():
    
    rel_fp_in = row['Filepath']

    fp_in = os.path.join(seams_pdf_folder, rel_fp_in)
    fp_in = "\\\\?\\" + fp_in.replace('/', '\\')

    rel_folder_out = 'SEAM ' + str(row['SEAM']) + '\\' + row['Session Name']
    
    rel_filepath_out = rel_folder_out + '\\' + row['Title']

    for stopword in filename_stopwords:
        rel_filepath_out = rel_filepath_out.replace(stopword, "")


    #avoid problems with long paths...could cause issues if filename was totally
    #removed but in that case an error would probably be good technically should
    #only need to remove 4 characters beyond root dir but remove more for good
    #measure
    max_rel_fp_length = 260 - len(seams_pdf_folder) - 10

    rel_filepath_out = rel_filepath_out[:max_rel_fp_length]

    rel_filepath_out = rel_filepath_out + '.pdf'

    filepaths_new.loc[index] = rel_filepath_out

    rel_filepaths_out.append(rel_filepath_out)

    fp_out = os.path.join(seams_output_folder, rel_filepath_out)
    
    
    fp_out = "\\\\?\\" + fp_out.replace('/', '\\')

    folder_out = os.path.split(fp_out)[0]
    if not os.path.exists(folder_out): os.makedirs(folder_out)

    shutil.copy(fp_in, fp_out)

    # break

df_meta['Filepath'] = filepaths_new


In [5]:



print('scanning new directory')
new_files = []

for root, dirs, files in os.walk(seams_output_folder, topdown=False):
    for file in files:
        ext = os.path.splitext(file)[1]
        if ext == '.pdf' or ext=='.PDF':
            new_files.append(file.lower())
        else:
            print('Non pdf file found: ' + str(file))
print("pdfs in output dir: ", len(new_files))


scanning new directory
pdfs in output dir:  1653


In [6]:

meta_filepath_out = os.path.join(seams_output_folder, 'SEAMS_metadata.csv')

df_meta.to_csv(meta_filepath_out)


In [7]:

#Metadata integrity check

print('Checking output metadata integrity')
df_metadata_new = pd.read_csv(meta_filepath_out)

missing_filepaths = []

for filepath in df_metadata_new.dropna(subset=['Filepath'])['Filepath']:
    filepath = os.path.join(seams_output_folder, filepath)

    if not os.path.exists(filepath):
        missing_filepaths.append(filepath)


print('Missing filepaths')
print(missing_filepaths)

Checking output metadata integrity
Missing filepaths
[]
