# PDF->CSV status
Copyright (C) 2021 ServiceNow, Inc.

This notebook allows you to investigate the status of the pdf->csv conversion for a given dataset.

You will need to change the paths to match those on your system.

In [19]:
import os
import re
import pathlib

Dataset A is split into two folders, so we have to specify a subfolder for this dataset. Datasets B and D ignore the subfolder setting.

In [None]:
# Dataset to inspect: one of "A", "B" or "D".
DATASET = "D"

# Subfolder choices for dataset A: "" (empty), "generic_pdfs_all",
# "has_pdf_dir_all", "low_text_pdfs" or "of_pdf_all".
DATASET_SUBFOLDER = ""

## Compare PDFs with the output CSVs

In [3]:
# Choose the directory that holds the raw PDFs of the selected dataset.
if DATASET == 'A':
    # Dataset A has two locations: an empty subfolder selects the first one,
    # any other value selects a subfolder of the second one.
    if DATASET_SUBFOLDER == "":
        PDF_DIR = '/nrcan_p2/data/01_raw/20201006/geoscan/raw/pdf'
    else:
        PDF_DIR = f"/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/{DATASET_SUBFOLDER}"

elif DATASET == 'B':
    PDF_DIR = "/nrcan_p2/data/01_raw/20210108"

elif DATASET == 'D':
    PDF_DIR = '/nrcan_p2/data/01_raw/20201221/doaj'

else:
    raise ValueError('Not handled')

# Recursively collect every PDF. The result is sorted because rglob makes no
# promise about the order in which it yields paths, and list positions are
# used as file indices further down; sorting keeps them stable between runs.
pdfs = sorted(pathlib.Path(PDF_DIR).rglob('*.pdf'))

In [4]:
# Number of PDF files found under PDF_DIR.
len(pdfs)

3998

In [6]:
# Resolve the folder of converted CSVs that corresponds to the chosen dataset.
if DATASET == 'A' and DATASET_SUBFOLDER == "":
    CSV_DIR = '/nrcan_p2/data/02_intermediate/20201006/geoscan/pdf/v1_all'
elif DATASET == 'A':
    CSV_DIR = f'/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/{DATASET_SUBFOLDER}'
elif DATASET == 'B':
    CSV_DIR = '/nrcan_p2/data/02_intermediate/20210108'
elif DATASET == 'D':
    CSV_DIR = '/nrcan_p2/data/02_intermediate/20201221/doaj'
else:
    raise ValueError('Not handled')

# Gather every CSV below that folder, at any depth.
csvs = list(pathlib.Path(CSV_DIR).rglob('*.csv'))

In [7]:
# Number of CSV files found under CSV_DIR.
len(csvs)

2794

In [8]:
# Express each PDF path relative to the dataset's PDF root, then peek at the
# first one.
pdfs_rel = []
for pdf_path in pdfs:
    pdfs_rel.append(pdf_path.relative_to(PDF_DIR))
pdfs_rel[0]

PosixPath('material/_2075-163X_1_1_1_pdf.pdf')

In [9]:
# Express each CSV path relative to the dataset's CSV root, then peek at the
# first one.
csvs_rel = []
for csv_path in csvs:
    csvs_rel.append(csv_path.relative_to(CSV_DIR))
csvs_rel[0]

PosixPath('material___2075-163X_1_1_1_pdf.pdfminer_split.txt.csv')

In [10]:
# Flatten each relative PDF path into a single name by turning every '/' into
# '__', then drop the final extension so it can be compared with the CSV names.
pdfs_base = []
for rel_path in pdfs_rel:
    flattened = str(rel_path).replace('/', '__')
    pdfs_base.append(pathlib.Path(flattened).stem)
pdfs_base[0]

'material___2075-163X_1_1_1_pdf'

In [11]:
# Reduce each CSV file name to its base by removing the last three extensions
# (for example '.pdfminer_split.txt.csv'), one extension per pass.
csvs_base = []
for csv_path in csvs_rel:
    base = csv_path.name
    for _ in range(3):
        base = pathlib.Path(base).stem
    csvs_base.append(base)
csvs_base[0]

'material___2075-163X_1_1_1_pdf'

## Overlap

In [12]:
# Collect (index, base name) for every PDF whose base name has no CSV.
# The CSV names go into a set first: testing membership against the list made
# each lookup a full scan, so the cell scaled with len(pdfs) * len(csvs).
csv_base_names = set(csvs_base)
no_overlap = [
    (i, pdf_base)
    for i, pdf_base in enumerate(pdfs_base)
    if pdf_base not in csv_base_names
]
len(no_overlap)

1204

### Write the outputs to a file

In [16]:
# Write the PDFs that still lack a CSV to a text file, one relative path per
# line.
fout = f'unfinished_files_{DATASET}.txt'
with open(fout, 'w') as f:
    for idx, _pdf_base in no_overlap:
        # Use the stored index to fetch the real relative path rather than
        # rebuilding it from the flattened base name: turning '__' back into
        # '/' yields a wrong path whenever a file or folder name itself
        # contains '__' or a folder name ends in '_'.
        f.write(pdfs_rel[idx].as_posix() + '\n')

### Group the outputs (only works for datasets with filenames starting with the file id)

In [232]:
# Count the unconverted PDFs per block of 1000 list positions
# (0-999 -> 0, 1000-1999 -> 1000, ...).
gs = {}
for idx, _pdf_base in no_overlap:
    bucket = (idx // 1000) * 1000
    gs[bucket] = gs.get(bucket, 0) + 1

gs

{0: 1,
 1000: 6,
 2000: 70,
 3000: 26,
 4000: 3,
 5000: 26,
 6000: 5,
 7000: 28,
 8000: 2,
 10000: 2,
 11000: 10,
 12000: 5,
 13000: 28}

### Find a particular file and its index

In [184]:
# Print the list position and name of every entry in pdfs_base that equals
# the base name being searched for.
wanted = "ontario__lists__OFR6088"
matches = [(pos, base) for pos, base in enumerate(pdfs_base) if base == wanted]
for pos, base in matches:
    print(pos, base)

13287 ontario__lists__OFR6088


In [35]:
# Display every (index, base name) pair collected for PDFs without a CSV.
no_overlap

[(194, 'bc__geofile__BCGS_GF2019-04'),
 (1209, 'bc__paper__BCGS_P1991-01'),
 (1325, 'bc__paper__BCGS_P1993-01'),
 (1411, 'bc__paper__BCGS_P1995-03'),
 (1584, 'bc__paper__BCGS_P2000-01-05_DixonWarren'),
 (1585, 'bc__paper__BCGS_P2000-01-06_Cook'),
 (1610, 'bc__paper__BCGS_P2000-01'),
 (1621, 'bc__paper__BCGS_P2001-01-10_Ray'),
 (1639, 'bc__paper__BCGS_P2001-01'),
 (1656, 'bc__paper__BCGS_P2002-01-16_Pinset'),
 (1661, 'bc__paper__BCGS_P2002-01-21_Levson'),
 (1673, 'bc__paper__BCGS_P2002-01'),
 (1800, 'bc__paper__BCGS_P2006-01-18_Simandl'),
 (1801, 'bc__paper__BCGS_P2006-01'),
 (2032, 'bc__paper__BCGS_P2019-01-01_Ferri'),
 (2041, 'bc__paper__BCGS_P2019-01'),
 (2057, 'bc__petrolium_geoscience_publications__EOF2004-02'),
 (2212, 'bc__petrolium_geoscience_publications__OGMP2003-01'),
 (2219, 'bc__petrolium_geoscience_publications__OGR1979-01'),
 (2221, 'bc__petrolium_geoscience_publications__OGR1987-01'),
 (2223, 'bc__petrolium_geoscience_publications__OGR1992-01'),
 (2224, 'bc__petrolium_ge