# Dataset Cleaning Pipeline Overview
Copyright (C) 2021 ServiceNow, Inc.

This notebook provides a quick overview of a given data file (output of the preprocessing pipeline), 
including the size of the output vocab, the number of steps in the pipeline etc.

Just change the paths in the [##Setup] section, and run the notebook. 

## Setup

In [80]:
PREPROCESSING_DIR="/nrcan_p2/data/03_primary/v4_B/"
PREPROCESSING_VERSION="all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1"

## Load data

In [81]:
import pathlib
import yaml
import re
import collections
import pandas as pd

In [82]:
PREPROCESSING_DIR = pathlib.Path(PREPROCESSING_DIR)

CONFIG_FILE = PREPROCESSING_DIR / (PREPROCESSING_VERSION + ".config")
LOG_FILE = PREPROCESSING_DIR / (PREPROCESSING_VERSION + ".log")
SOURCE_FILE = PREPROCESSING_DIR / (PREPROCESSING_VERSION + "_source.csv")
OUTPUT_FILE = PREPROCESSING_DIR / (PREPROCESSING_VERSION + ".txt")

print(CONFIG_FILE)
print(LOG_FILE)
print(SOURCE_FILE)
print(OUTPUT_FILE)

/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1.config
/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1.log
/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1_source.csv
/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1.txt


In [83]:
with open(CONFIG_FILE, 'r') as f:
    config = yaml.load(f)

  config = yaml.load(f)


In [84]:
config

{'input_dirs': ['/nrcan_p2/data/02_intermediate/20210108'],
 'n_files': -1,
 'output_dir': '/nrcan_p2/data/03_primary/v4_B',
 'output_file': '/nrcan_p2/data/03_primary/v4_B/all_text_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_dB_v1.txt',
 'postprocessing_functions': ['rm_punct', 'lower', 'rm_newline'],
 'postprocessing_functions_mapped': [22, 23, 17],
 'postprocessing_pipeline': 'POSTPIPE_GLOVE',
 'preprocessing_functions': ['rm_dbl_space',
  'rm_cid',
  'convert_to_ascii',
  'rm_nonprintable',
  'filter_no_letter',
  'rm_word_all_punct',
  'rm_newline_hyphenation',
  'rm_beg_end_word_punct',
  'rm_punct_mid_punct',
  'strip_space',
  'filter_l2_word',
  'filter_l4_letter',
  'rm_mid_word_punct',
  'rm_non_textual_punct',
  'rm_newline',
  'merge_words',
  'filter_no_real_words_g3letter',
  'tokenize_spacy_lg',
  'rm_stopwords_spacy'],
 'preprocessing_functions_mapped': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  20,
  21],
 'preprocessing_pi

In [85]:
PREPIPE = config['preprocessing_pipeline']
POSTPIPE = config['postprocessing_pipeline']

PREPIPE_FUNCTIONS = config['preprocessing_functions']
PREPIPE_FUNCTIONS_MAPPED = config['preprocessing_functions_mapped']

POSTPIPE_FUNCTIONS = config['postprocessing_functions']
POSTPIPE_FUNCTIONS_MAPPED = config['postprocessing_functions_mapped']

## Input Data

In [86]:
INPUT_DATA_FOLDERS = config['input_dirs']
INPUT_DATA_FOLDERS

['/nrcan_p2/data/02_intermediate/20210108']

In [87]:
csvs = []
for folder in INPUT_DATA_FOLDERS:
    csvs.extend([x for x in pathlib.Path(folder).glob('*.csv')])

In [88]:
len(csvs)

13686

## Partial outputs - preprocessing

In [89]:
preprocessed_files = {}
for i, (preprocessing_func, preprocessing_func_mapped) in enumerate(zip(PREPIPE_FUNCTIONS, PREPIPE_FUNCTIONS_MAPPED)):
    print(preprocessing_func, preprocessing_func_mapped)
    folder = pathlib.Path(config['output_dir']) / '__'.join([str(x) for x in PREPIPE_FUNCTIONS_MAPPED[0:i+1]])
    print(folder)
    
    files = [x for x in pathlib.Path(folder).glob('*.csv')]
    print(len(files))
    preprocessed_files[i] = files

rm_dbl_space 0
/nrcan_p2/data/03_primary/v4_B/0
11747
rm_cid 1
/nrcan_p2/data/03_primary/v4_B/0__1
11747
convert_to_ascii 2
/nrcan_p2/data/03_primary/v4_B/0__1__2
11747
rm_nonprintable 3
/nrcan_p2/data/03_primary/v4_B/0__1__2__3
11747
filter_no_letter 4
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4
11747
rm_word_all_punct 5
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5
11723
rm_newline_hyphenation 6
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6
11723
rm_beg_end_word_punct 7
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7
11723
rm_punct_mid_punct 8
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8
11723
strip_space 9
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9
11723
filter_l2_word 10
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10
11723
filter_l4_letter 11
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10__11
11720
rm_mid_word_punct 12
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10__11__12
11720
rm_non_tex

In [90]:
postprocessed_files = {}
for i, (postprocessing_func, postprocessing_func_mapped) in enumerate(zip(POSTPIPE_FUNCTIONS, POSTPIPE_FUNCTIONS_MAPPED)):
    print(postprocessing_func, postprocessing_func_mapped)
    
    prepipe_folder = '__'.join([str(x) for x in PREPIPE_FUNCTIONS_MAPPED]) + '__POST__'
    folder = pathlib.Path(config['output_dir']) / (prepipe_folder + '__'.join([str(x) for x in POSTPIPE_FUNCTIONS_MAPPED[0:i+1]]))
    print(folder)
    
    files = [x for x in pathlib.Path(folder).glob('*.txt')]
    print(len(files))
    postprocessed_files[i] = files

rm_punct 22
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__20__21__POST__22
11718
lower 23
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__20__21__POST__22__23
11718
rm_newline 17
/nrcan_p2/data/03_primary/v4_B/0__1__2__3__4__5__6__7__8__9__10__11__12__13__14__15__16__20__21__POST__22__23__17
11718


In [91]:
final_file = OUTPUT_FILE
with open(final_file, 'r') as f:
    final_text = f.readlines()
    
print(final_text[0:5])

['ew mineral deposit models cordillera eofile 1996 01 collection abstracts presentation 1996 cordilleran roundup short course section prepared different set authors table contents introduction making models matter mike etheridge etheridge henley williams b british columbia mineral deposit profiles david lefebure british columbia geological survey old skarns carlin type deposits characteristics gold skarns gerry ray british columbia geological survey d recent refractory gold discoveries carlin trend nevada david groves newmont exploration limited e carlin type gold deposits canadian potential howard poulsen geological survey canada sediment hosted mineralization sediment hosted stratiform copper rod kirkham geological survey canada g sedex pb zn deposits creating framework understanding hydrothermal alteration exploration guide bob turner geological survey canada h genesis carbonaceous shale hosted ni mo pge deposits wayne goodfellow geological survey canada helmut geldsetzer geological

In [92]:
print('Number of lines')
print(len(final_text))

Number of lines
11718


In [93]:
print('Number of \n\n (should be new documents)')
print(len([i for i in range(len(final_text) -1) if final_text[i] == "\n" and final_text[i+1] == '\n']))

Number of 

 (should be new documents)
0


## Vocab

In [94]:
final_text_full = ''.join(final_text)

final_text_df = pd.DataFrame({'words': final_text_full.split()})

display(final_text_df)

Unnamed: 0,words
0,ew
1,mineral
2,deposit
3,models
4,cordillera
...,...
76989508,states
76989509,information
76989510,reproduction
76989511,eip


In [95]:
vc = final_text_df.words.value_counts()

In [96]:
vc

1              663133
lake           518590
area           474317
2              407501
ontario        389163
                ...  
gemmizi             1
gatadmetall         1
meniku              1
bodies1             1
jar7cpir            1
Name: words, Length: 806886, dtype: int64

In [97]:
vc.describe(percentiles=[0,0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9,0.95,0.99,1.0])

count    806886.000000
mean         95.415601
std        2461.143994
min           1.000000
0%            1.000000
10%           1.000000
20%           1.000000
25%           1.000000
30%           1.000000
40%           1.000000
50%           1.000000
60%           2.000000
70%           2.000000
75%           3.000000
80%           4.000000
90%          12.000000
95%          45.000000
99%         899.150000
100%     663133.000000
max      663133.000000
Name: words, dtype: float64

In [98]:
display(vc[vc > 1].shape)
display(vc[vc > 2].shape)
display(vc[vc > 3].shape)
display(vc[vc > 4].shape)
display(vc[vc > 5].shape)

(363149,)

(215263,)

(175255,)

(146668,)

(130699,)