# Preprocessing Cleaning pipeline status
Copyright (C) 2021 ServiceNow, Inc.

This notebook outlines the status of the dataprocessing cleaning pipeline for a given dataset.

You will need to change the dataset folders to reflect your system.

In [3]:
import os
import re
import pathlib
import pandas as pd

Specify the dataset you wish to examine:

In [38]:
DATASET = "A_full"

You will need to alter these paths to reflect your system:

In [39]:
if DATASET == "A_full":
    PDF_DIRS=(
        '/nrcan_p2/data/01_raw/20201006/geoscan/raw/pdf',
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/generic_pdfs',
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/has_pdf_dir',
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/of_pdf',
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/low_text_pdfs',
    )
    TXT_DIRS=(
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/txt',

    )
    WPF_DIRS=(
        '/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/wp_rtf',    
    )    
    #geochemical_perspective_letters  geosciences  material	quaternary  solid_earth
elif DATASET == "B":
    PDF_DIRS = ()
    TXT_DIRS = ()
    WPF_DIRS = ()
    
elif DATASET == "D":
    PDF_DIRS = (
        '/nrcan_p2/data/01_raw/20201221/doaj/geochemical_perspective_letters',
        '/nrcan_p2/data/01_raw/20201221/doaj/geosciences',
        '/nrcan_p2/data/01_raw/20201221/doaj/material',
        '/nrcan_p2/data/01_raw/20201221/doaj/quaternary',
        '/nrcan_p2/data/01_raw/20201221/doaj/solid_earth'    
    )
    TXT_DIRS = ()
    WPF_DIRS = ()    


In [45]:
if DATASET == "A_full":
    INPUT_DIRS=(
        '/nrcan_p2/data/02_intermediate/20201006/geoscan/pdf/v1_all',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/generic_pdfs_all',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/has_pdf_dir_all',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/of_pdf_all',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/low_text_pdfs',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/txt',
        '/nrcan_p2/data/02_intermediate/20201117/geoscan/wp_rtf'    
    )
elif DATASET == "B":
    pass
elif DATASET == "D":
    INPUT_DIRS=(    
        '/nrcan_p2/data/02_intermediate/20201221/doaj',
    )

In [46]:
if DATASET == "A_full":
    OUTPUT_DIR = '/nrcan_p2/data/03_primary/v4/'
elif DATASET == "B":
    pass
elif DATASET == "D":
    OUTPUT_DIR = '/nrcan_p2/data/03_primary/v4_D/'

## Compile the raw pdfs...

In [47]:
raw_files = []
for input_dir in PDF_DIRS:
    nfiles = [x for x in pathlib.Path(input_dir).glob('*.pdf')]
    print(len(nfiles))
    raw_files.extend(nfiles)
    
print(len(raw_files))

7183
5424
411
1159
12626
26803


In [48]:
import re
raw_files = []
for input_dir in TXT_DIRS:
    print(input_dir)
    nfiles = [x.name for x in pathlib.Path(input_dir).glob('*.txt') if re.search('[0-9]', str(x.name)[0])]
    print(nfiles[0:10])
    print(len(nfiles))
    raw_files.extend(nfiles)
    nfiles = [x.name for x in pathlib.Path(input_dir).glob('*.TXT') if re.search('[0-9]', str(x.name)[0])]
    print(nfiles[0:10])
    print(len(nfiles))    
    
    raw_files.extend(nfiles)
    
print(len(raw_files))

/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/txt
['133235_OF_2479.txt', '133472_OF2506.txt', '209021_D3449.txt', '210082_OFTEXT.txt', '210204_Read Me First.txt', '210231_abstract.txt', '210231_Citation.txt', '210231_Doc.txt', '211478_Text D3594d.txt', '212287_Ctrlk.txt']
88
['130913_CHEM.TXT', '130913_FIELD.TXT', '130913_INTRO.TXT', '133235_YKPLUTON.TXT', '194084_92G1LEG.TXT', '194084_92G2LEG.TXT', '194084_92G3LEG.TXT', '194084_92G6LEG.TXT', '194084_92G7LEG.TXT', '194084_92H4LEG.TXT']
66
154


In [49]:
raw_files = []
for input_dir in WPF_DIRS:
    print(input_dir)
    files = [x for x in pathlib.Path(input_dir).glob('*.rtf')]
    #print(files)
    raw_files.extend(files)
    #print(pathlib.Path(input_dir)[0:10])
print(len(raw_files))

/nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/wp_rtf
61


## Compile the csvs...

In [50]:
csvs = []
for input_dir in INPUT_DIRS:
    print(input_dir)
    csvs.extend([x for x in pathlib.Path(input_dir).glob('*.csv')])
    
print(len(csvs))

/nrcan_p2/data/02_intermediate/20201006/geoscan/pdf/v1_all
/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/generic_pdfs_all
/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/has_pdf_dir_all
/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/of_pdf_all
/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/low_text_pdfs
/nrcan_p2/data/02_intermediate/20201117/geoscan/txt
/nrcan_p2/data/02_intermediate/20201117/geoscan/wp_rtf
21787


In [51]:
df = pd.DataFrame({'csv_file': csvs})

In [52]:
df['csv_filename'] = df.csv_file.apply(lambda x: x.name)
df

Unnamed: 0,csv_file,csv_filename
0,/nrcan_p2/data/02_intermediate/20201006/geosca...,100327.pdfminer_split.csv
1,/nrcan_p2/data/02_intermediate/20201006/geosca...,100328.pdfminer_split.csv
2,/nrcan_p2/data/02_intermediate/20201006/geosca...,100331.pdfminer_split.csv
3,/nrcan_p2/data/02_intermediate/20201006/geosca...,100335.pdfminer_split.csv
4,/nrcan_p2/data/02_intermediate/20201006/geosca...,100337.pdfminer_split.csv
...,...,...
21782,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_Plasticity.csv
21783,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_Pollen.csv
21784,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_RockEval.csv
21785,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_conductivity.csv


## Investigate a particular data pipeline

Specify the cleaning pipeline that you wish to examine by providing a config file for the partially completed process

In [55]:
#config_file = '/nrcan_p2/data/03_primary/v4/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dA_full_v1.config'
#all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dA_full_v1.config'

#config_file = #'/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1.config'
#config_file = '/nrcan_p2/data/03_primary/v4_D/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dD_v1.config'

config_file = '/nrcan_p2/data/03_primary/v4/all_text_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_dA_full_v1_partial.config'

In [56]:
import yaml
with open(config_file, 'r') as f:
    config = yaml.load(f)

config

  config = yaml.load(f)


{'input_dirs': ['/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/low_text_pdfs'],
 'n_files': -1,
 'output_dir': '/nrcan_p2/data/03_primary/v4',
 'output_file': '/nrcan_p2/data/03_primary/v4/all_text_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_dA_full_v1_partial.txt',
 'postprocessing_functions': ['rm_punct', 'lower', 'rm_newline'],
 'postprocessing_functions_mapped': [17, 18, 21],
 'postprocessing_pipeline': 'POSTPIPE_GLOVE',
 'preprocessing_functions': ['rm_dbl_space',
  'rm_cid',
  'convert_to_ascii',
  'rm_nonprintable',
  'filter_no_letter',
  'rm_newline_hyphenation',
  'rm_newline',
  'filter_no_real_words_g3letter',
  'filter_with_email',
  'rm_url',
  'rm_doi',
  'filter_with_phonenumber',
  'filter_non_english',
  'filter_l80_real_words',
  'tokenize_spacy_lg',
  'rm_stopwords_spacy'],
 'preprocessing_functions_mapped': [0,
  1,
  2,
  3,
  4,
  6,
  14,
  16,
  26,
  27,
  28,
  29,
  30,
  31,
  22,
  23],
 'preprocessing_pipeline': 'PIPELINE_GLOVE_80',
 'suffix': 'PIPELINE_GLOVE_

In [57]:
from collections import defaultdict

files = defaultdict(list)
for i in range(len(config['preprocessing_functions_mapped'])):
    preprocessing_substeps = '__'.join([str(x) for x in config['preprocessing_functions_mapped'][0:i+1]])
    p = pathlib.Path(config['output_dir']) / str(preprocessing_substeps)
    print(p)
    
    folder = p.name
    print(folder)
    
    for file in p.iterdir():
        #print(file.stem)
        #fname = file.parent / (file.stem + '__' + preprocessing_substeps + file.suffix)
        temp_name = file.stem
        base_name = pathlib.Path(temp_name).stem
        suffix = pathlib.Path(temp_name).suffix.split('__')
        #base_name = pathlib.Path(file.stem).stem + file.suffix
        orig_csv = base_name + suffix[0] + file.suffix
        #print(orig_csv)
        files[folder].append(orig_csv)
        
pre_folder = folder
for i in range(len(config['postprocessing_functions_mapped']) + 1):
    post_list = [''] + [str(x) for x in config['postprocessing_functions_mapped']]
    preprocessing_substeps = pre_folder + '__POST' + '__'.join(post_list[0:i+1])
    p = pathlib.Path(config['output_dir']) / str(preprocessing_substeps)
    print(p)
    
    folder = p.name
    print(folder)
    
    for file in p.iterdir():
        #print(file.stem)
        #fname = file.parent / (file.stem + '__' + preprocessing_substeps + file.suffix)
        temp_name = file.stem
        base_name = pathlib.Path(temp_name).stem
        suffix = pathlib.Path(temp_name).suffix.split('__')
        #base_name = pathlib.Path(file.stem).stem + file.suffix
        orig_csv = base_name + suffix[0] + file.suffix
        #print(orig_csv)
        files[folder].append(orig_csv)
        #print(orig_csv)
    

/nrcan_p2/data/03_primary/v4/0
0
/nrcan_p2/data/03_primary/v4/0__1
0__1
/nrcan_p2/data/03_primary/v4/0__1__2
0__1__2
/nrcan_p2/data/03_primary/v4/0__1__2__3
0__1__2__3
/nrcan_p2/data/03_primary/v4/0__1__2__3__4
0__1__2__3__4
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6
0__1__2__3__4__6
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14
0__1__2__3__4__6__14
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16
0__1__2__3__4__6__14__16
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16__26
0__1__2__3__4__6__14__16__26
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16__26__27
0__1__2__3__4__6__14__16__26__27
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16__26__27__28
0__1__2__3__4__6__14__16__26__27__28
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16__26__27__28__29
0__1__2__3__4__6__14__16__26__27__28__29
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__14__16__26__27__28__29__30
0__1__2__3__4__6__14__16__26__27__28__29__30
/nrcan_p2/data/03_primary/v4/0__1__2__3__4__6__

In [58]:
for elem in files:
    files[elem] = pd.DataFrame({elem: files[elem]})

In [20]:
files['0']

Unnamed: 0,0
0,material___2075-163X_6_2_56_pdf.pdfminer_split...
1,geosciences___2076-3263_8_4_131_pdf.pdfminer_s...
2,solid_earth___3_313_2012_se-3-313-2012.pdf.pdf...
3,geosciences___2076-3263_9_5_243_pdf.pdfminer_s...
4,solid_earth___3_111_2012_se-3-111-2012.pdf.pdf...
...,...
3987,material___2075-163X_9_4_245_pdf.pdfminer_spli...
3988,material___2075-163X_6_2_55_pdf.pdfminer_split...
3989,material___2075-163X_7_9_170_pdf.pdfminer_spli...
3990,material___2075-163X_9_4_246_pdf.pdfminer_spli...


In [59]:
m = df
for elem in files:
    m = pd.merge(m, files[elem], left_on='csv_filename', right_on=elem, how='left')

m

Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21
0,/nrcan_p2/data/02_intermediate/20201006/geosca...,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,...,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,100327.pdfminer_split.csv,,,,
1,/nrcan_p2/data/02_intermediate/20201006/geosca...,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,...,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,100328.pdfminer_split.csv,,,,
2,/nrcan_p2/data/02_intermediate/20201006/geosca...,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,...,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,100331.pdfminer_split.csv,,,,
3,/nrcan_p2/data/02_intermediate/20201006/geosca...,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,...,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,100335.pdfminer_split.csv,,,,
4,/nrcan_p2/data/02_intermediate/20201006/geosca...,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,...,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,100337.pdfminer_split.csv,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21782,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_Plasticity.csv,,,,,,,,,...,,,,,,,,,,
21783,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_Pollen.csv,,,,,,,,,...,,,,,,,,,,
21784,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_RockEval.csv,,,,,,,,,...,,,,,,,,,,
21785,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_conductivity.csv,,,,,,,,,...,,,,,,,,,,


In [60]:
m['csv_path'] = m.csv_file.apply(lambda x: x.parent)


## Explore the outputs... 

In [61]:
c = m.groupby('csv_path')

for ngroup, group in c:
    print(ngroup)
    display(group.describe())
#c[~c.isnull()]
c

/nrcan_p2/data/02_intermediate/20201006/geoscan/pdf/v1_all


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,7008,7008,5806,5806,5806,5806,5806,5594,5594,5594,...,5569,5569,5516,5501,5501,0.0,0.0,0.0,0.0,7008
unique,7008,7008,5806,5806,5806,5806,5806,5594,5594,5594,...,5569,5569,5516,5501,5501,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201006/geosca...,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,...,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,298069.pdfminer_split.csv,,,,,/nrcan_p2/data/02_intermediate/20201006/geosca...
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,,,,,7008


/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/generic_pdfs_all


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,5279,5279,4924,4922,4922,4922,4922,4900,4900,4900,...,4861,4861,4824,4792,4792,0.0,0.0,0.0,0.0,5279
unique,5279,5279,4924,4922,4922,4922,4922,4900,4900,4900,...,4861,4861,4824,4792,4792,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,...,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,101426_pa_68_44.pdfminer_split.csv,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,,,,,5279


/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/has_pdf_dir_all


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,411,411,404,404,404,404,404,404,404,404,...,404,404,404,404,404,0.0,0.0,0.0,0.0,411
unique,411,411,404,404,404,404,404,404,404,404,...,404,404,404,404,404,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,...,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,106037_eg_31_10c.pdfminer_split.csv,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,,,,,411


/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/low_text_pdfs


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,7886,7886,252,252,252,252,252,185,185,185,...,176,176,176,171,171,0.0,0.0,0.0,0.0,7886
unique,7886,7886,252,252,252,252,252,185,185,185,...,176,176,176,171,171,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,130662_of_1990_line1_FS_19MAY1987.pdfminer_spl...,130584_of_1959_maps.pdfminer_split.csv,130584_of_1959_maps.pdfminer_split.csv,130584_of_1959_maps.pdfminer_split.csv,130584_of_1959_maps.pdfminer_split.csv,130584_of_1959_maps.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,...,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,129146_of_0054_line_06.pdfminer_split.csv,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,,,,,7886


/nrcan_p2/data/02_intermediate/20201117/geoscan/pdf/of_pdf_all


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,1159,1159,788,788,788,788,788,788,788,788,...,785,785,785,785,785,0.0,0.0,0.0,0.0,1159
unique,1159,1159,788,788,788,788,788,788,788,788,...,785,785,785,785,785,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,...,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,129229_of_0425.pdfminer_split.csv,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,,,,,1159


/nrcan_p2/data/02_intermediate/20201117/geoscan/txt


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,35,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
unique,35,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,210114_RADIO.csv,,,,,,,,,...,,,,,,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,,,,,,,,,...,,,,,,,,,,35


/nrcan_p2/data/02_intermediate/20201117/geoscan/wp_rtf


Unnamed: 0,csv_file,csv_filename,0,0__1,0__1__2,0__1__2__3,0__1__2__3__4,0__1__2__3__4__6,0__1__2__3__4__6__14,0__1__2__3__4__6__14__16,...,0__1__2__3__4__6__14__16__26__27__28__29,0__1__2__3__4__6__14__16__26__27__28__29__30,0__1__2__3__4__6__14__16__26__27__28__29__30__31,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18,0__1__2__3__4__6__14__16__26__27__28__29__30__31__22__23__POST__17__18__21,csv_path
count,9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
unique,9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
top,/nrcan_p2/data/02_intermediate/20201117/geosca...,210944_tbl_Foraminifera.csv,,,,,,,,,...,,,,,,,,,,/nrcan_p2/data/02_intermediate/20201117/geosca...
freq,1,1,,,,,,,,,...,,,,,,,,,,9


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9e2d65f2b0>

In [64]:
m[m.csv_filename.str.contains('119985.pdfminer_split.csv')].transpose()

Unnamed: 0,1143
csv_file,/nrcan_p2/data/02_intermediate/20201006/geosca...
csv_filename,119985.pdfminer_split.csv
0,119985.pdfminer_split.csv
0__1,119985.pdfminer_split.csv
0__1__2,119985.pdfminer_split.csv
0__1__2__3,119985.pdfminer_split.csv
0__1__2__3__4,119985.pdfminer_split.csv
0__1__2__3__4__6,119985.pdfminer_split.csv
0__1__2__3__4__6__14,119985.pdfminer_split.csv
0__1__2__3__4__6__14__16,119985.pdfminer_split.csv


In [25]:
display(m.shape[0])
m.isna().sum()

3992

csv_file                                                                                 0
csv_filename                                                                             0
0                                                                                        0
0__1                                                                                     0
0__1__2                                                                                  0
0__1__2__3                                                                               0
0__1__2__3__4                                                                            0
0__1__2__3__4__5                                                                         0
0__1__2__3__4__5__6                                                                      0
0__1__2__3__4__5__6__7                                                                   0
0__1__2__3__4__5__6__7__8                                                                0

In [26]:
for file in pathlib.Path(OUTPUT_DIR).iterdir():
    print(file)

/nrcan_p2/data/03_primary/v4_D/PROCESSING_MAP.json.lock
/nrcan_p2/data/03_primary/v4_D/PROCESSING_MAP.json
/nrcan_p2/data/03_primary/v4_D/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dD_v1_partial.config.lock
/nrcan_p2/data/03_primary/v4_D/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dD_v1_partial.config
/nrcan_p2/data/03_primary/v4_D/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dD_v1_partial.log.lock
/nrcan_p2/data/03_primary/v4_D/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dD_v1_partial.log
/nrcan_p2/data/03_primary/v4_D/0
/nrcan_p2/data/03_primary/v4_D/0__1
/nrcan_p2/data/03_primary/v4_D/0__1__2
/nrcan_p2/data/03_primary/v4_D/0__1__2__3
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4__5
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4__5__6
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4__5__6__7
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4__5__6__7__8
/nrcan_p2/data/03_primary/v4_D/0__1__2__3__4__5__6__7__8__9
/nrcan_p2/data/03_primar

In [27]:
files['0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__22__23__POST']
#      0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__22__23__POST

[]

## Count the outputs from different stages of the pipeline...

In [30]:
total_counts = []

#pipestep = '0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__22__23__POST'
pipestep = '0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__20__21__POST'
pipestep = '0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__17__18__POST__19__20__21'
#/nrcan_p2/data/03_primary/v4_B
for file in (pathlib.Path('/nrcan_p2/data/03_primary/v4_D/') / pipestep).iterdir():
    with open(file, 'r') as f:
        text = f.read()
        text = text.split()
        total_counts.append(len(text))

In [31]:
orig_counts = []
pipestep = '0__1'
for file in (pathlib.Path('/nrcan_p2/data/03_primary/v4_D/') / '0__1').iterdir():
    df_csv = pd.read_csv(file)
    text = df_csv.processed_text.astype(str).str.split()
    text = text.str.len()
    text = text.sum()
    orig_counts.append(text)

In [32]:
len(total_counts)

3992

In [33]:
len(orig_counts)

3992

In [34]:
print(sum(total_counts))

23023310


In [35]:
print(sum(orig_counts))

34506472


In [37]:
23023310/34506472 <-- similar to ontario actually

0.6672171527706454