In [1]:
import pandas as pd
from os import listdir,environ,rename 
from sys import argv
from os.path import isfile,join,basename
from shutil import rmtree,move
import os
from zipfile import ZipFile
import glob

In [2]:
root_dir = "/nrcan_p2"
data_dir = join(root_dir,"data")
geoscan_files_dir=join(data_dir,"01_raw","20201006","geoscan")
zip_dir=join(geoscan_files_dir,"raw","zip")
extracted_dir = join(data_dir,"01_raw","20201117","geoscan","raw","extracted")

In [753]:
%%bash
## rm -rf /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/*
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/of_pdf
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/has_pdf_dir
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/generic_pdfs
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/txt
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/wp_rtf
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/low_text_pdfs
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/tmp

In [3]:
def get_all_zip_files():
    return [f for f in listdir(zip_dir) if isfile(join(zip_dir, f))]

def get_processed_zips():
    extracted_files_paths = glob.glob(extracted_dir+"/**/processed.txt")
    extracted_files = []
    for path in extracted_files_paths:
        data = ""
        with open(path,"r") as f:
            data = f.read()
            extracted_files += data.replace(zip_dir+"/","").split("\n")
            

    return list(filter(None,extracted_files))

def get_geoid(zip_filename):
    return zip_filename.split(".")[0]

def is_zip_dir(filename):
    if filename:
        return (filename[-1] == "/")
    return False

def get_unprocessed_files():
    all_zips = get_all_zip_files()
    processed_zips = get_processed_zips()
    return [x for x in all_zips if x not in processed_zips]

def to_gb(size_in_bytes):
    return size_in_bytes / 1073741824

def print_mb_as_gbs(size_in_mbytes):
    return f'{size_in_mbytes/1024:.0f} GB'

def get_extension(filename):
    if len(filename.split(".")) > 1:
        return filename.split(".")[-1]
    return None

def is_pdf(filename):
    return (get_extension(filename) == "pdf")

def is_txt(filename):
    return (get_extension(filename) == "txt")

def is_rtf(filename):
    return (get_extension(filename) == "rtf")

def is_wp(filename):
    return (get_extension(filename) == "wp")

def is_htm(filename):
    return ((get_extension(filename) == "htm") or (get_extension(filename) == "html"))

def does_has_of_pdf(filename):
    filename = filename.lower()
    if (is_pdf(filename)) and (filename.startswith("of_")):
        filename_no_ext = (filename.split(".")[0])
        if (len(filename_no_ext.split("_"))==2) and(filename_no_ext.split("_")[1].isnumeric()):
            # mean the format is of_<number>.pdf which would be the report
             return True
    return False

def get_stats(extracted_dir_name,msg="PDFs"):
    nb_zips_with_files_extracted = 0
    nb_empty_zips = 0
    with open(join(extracted_dir,extracted_dir_name,"extracted.txt"),"r+") as f:
        lines = f.readlines()
        for line in lines:
            file_and_path = line.strip().split(":")
            if line[1] == '':
                nb_empty_zips +=1
            else:
                nb_zips_with_files_extracted +=1
            
    print(f'Number of zips respecting selection criteria and having {msg}:{nb_zips_with_files_extracted}')
    print(f'Number of zips respecting selection criteria and not having {msg}:{nb_empty_zips}')

class ZipProcessor:
    PROCESSED_FILENAME = "processed.txt"
    IGNORED_FILENAME = "ignored.txt"
    EXTRACTED_FILENAME = "extracted.txt"
    
    def __init__(self,extraction_dir,zip_file_path):
        self.extraction_dir = extraction_dir
        self.zip_file_path = zip_file_path
        self.extracted_files = []
        self.ignored_files = []
        # We assume our files are in format <geoscanid>.zip
        self.geoscanid = basename(zip_file_path).split(".")[0]
    
    def extract_file(self,filename):
        with ZipFile(self.zip_file_path, 'r') as zip_file:
            zip_file.extract(filename,self.extraction_dir)
            if len(fileparts := filename.split("/")) > 1:
                move(join(self.extraction_dir,filename),join(self.extraction_dir,f'{self.geoscanid}_{fileparts[-1]}'))
                rmtree(join(self.extraction_dir,fileparts[0]))
            else:
                rename(join(self.extraction_dir,filename),join(self.extraction_dir,f'{self.geoscanid}_{filename}'))
            self.extracted_files.append(filename)
    
    def ignore_file(self,filename):
        self.ignored_files.append(filename)

    def finish_processing(self):
        with open(join(self.extraction_dir,self.PROCESSED_FILENAME),"a") as f:
            f.write(f'{self.zip_file_path}\n')

        with open(join(self.extraction_dir,self.EXTRACTED_FILENAME),"a") as f:
            f.write(f'{self.zip_file_path}:{",".join(self.extracted_files)}\n')

        with open(join(self.extraction_dir,self.IGNORED_FILENAME),"a") as f:
            f.write(f'{self.zip_file_path}:{",".join(self.ignored_files)}\n')
    
    def has_extracted(self):
        if self.extracted_files:
            return True
        return False

## Extraction rules
All extracted files will be prefixed geoid_in_zip_file_name. All extracted files could be found in **extracted.txt** file, all ignored files in **ignored.txt** and all processed zips in **processed.txt**

Here is the extraction chain. Every files is extracted only once for a given rule,starting with the first one.
French files are filtered by removing any files that contain one of the following strings: **"\_fr.","\_fr\_","-fr\_","\_fr-","-fr-","french"**

### 1. Extract **of_number.pdf** files from zips
Extract **of_number.pdf** files from zips (if they contain them). Ignore the other files. Put the files in of_pdf directory.

### 2. Files containing PDF directory
The files are put in has_pdf_dir directory.For files containing PDF directory, extract the pdfs in PDF directory and ignore the rest. The following pdfs are ignored given their content is badly adapted for text search (read tables) or doesnt contain useful text (read maps). If a file contains one of the following words, it will not be extracted. Excluded words: **"front","statistics","index","bibliography","table_of_contents","author","contents","cover","foreword","plate","preface","table","figure","fig","graph","map","line","appendix","links","locations","count","data","legend","homepage","_tab","_f0","_as_","location","articles","title"**

There is also custom files we decided to ignore for specific geoids (Theses files contain maps, but not text records). These files  geoid are:
**213391,224262,211485"**

The following file geoids had their main file extracted manually: **221215,223383,223386,224581,222149,222774,248223,247843,220364,134069,205729,215375,216684,215739,215877,216570,224263,224797,224807,226367,247526**

### 3. Other files containing at least one PDF
The files are put in generic_pdfs directory.Given the difference of structure of files, more complex rules were needed in order to extract useful files. If a file contains one of the following words, it will not be extracted: **"front","statistics","index","bibliography","table_of_contents","author","contents","cover","foreword","plate","preface","table","figure","fig","graph","map","line","appendix","links","legend","homepage","sheet","chart","_mn0","_mn1","plan","cross","data","overlay","_nt0","_xs0","_xs1","_xs2","_xs3","_fg0","_fg1","_fg2","_fg3","acrobat","readme","pictou","coulor","reader","licen","start","lisez","carte","disclaimer","_tab_","db_schema","_colour","acknowledgment","trademark","gscmcm_","_mcm"**. 

There is also custom files we decided to ignore for specific geoids (Theses files contain maps, but not text records). These files geoid are:
**"100377","109277","119940","123533","127153","127162","127165","127166","127167","127168", "127169","127170","127171","127172","127173","127174","127175","127176","127177","127178","127179","127180","127181","127182","127183","127184","127185","127186","127187","127188","127189","127190","127191","127192","127193","127194","127195","127196","127197","127198","127199","127200","127201","127202","127203","127204","127205","127206","127207","127208","129093","129094","129099","129142","129143","129148","129151","129175","129183","129210","129211","129212","129213","129232","129250","129273","129347","129447","129463","129471","129686","129733","130006","130030","130437","130570","130912","132668","183851","208238","208241","209370","210616","210617","210627","210637","211515","212607","214638","215455","222822","224834","247421","247421","247424","247425","247678","285489","285569","286078","286185","286262","287133","287847","290255","8315"**

The folowing files were extracted manually (since there were maps which were harder to remove with generic rules):
**"100357_of_129_AccountLakeSed.pdf","100358_of_0133_Tome1_texte.pdf","100379_of_0471_part_1.pdf","100379_of_0471_part_2.pdf","100500_m_26.pdf","100506_me_32.pdf","100518_me_296.pdf","100547_me_320.pdf","100795_m_385.pdf","100797_me_165.pdf","100794_me_163.pdf","100808_me_176.pdf","100849_me_147.pdf","101153_pa_38-21.pdf","101157_pa_38-16.pdf","101560_me_68.pdf","101569_m_77.pdf","101637_m_211.pdf","101683_m_121.pdf","101793_wp_322.pdf","102157_bu_306.pdf","102307_of_0040_Report.pdf","102433_pa_71-19.pdf","102592_Paper_75-27.pdf","102612_pa_75_41.pdf","102624_pa_76-17.pdf","102634_pa_76-29.pdf","103299_m_372.pdf","104332_bu_276.pdf","108390_pa_51_11.pdf","108440_pa_50_8.pdf","119605_of_1081_Volume1.pdf","119605_of_1081_Volume2.pdf","119605_of_1081_Volume3.pdf","119739_pa_83-31.pdf","119943_me_77_f.pdf","119943_me_77_f.pdf","120589_pa_84-11.pdf","120602_pa_85-16.pdf","123565_rop_1863_Atlas_eng.pdf","123575_rop_1866-69_french_mono.pdf","123889_rop_1866-69_french_part i.pdf","123890_rop_1866-69_french_part ii.pdf","128139_of_2026.pdf","129132_of_0094Report.pdf","129178_of_0116_Report.pdf","129287_of_0487_part_1.pdf","129287_of_0487_part_2.pdf","129333_of_0222_vol1.pdf","129333_of_0222_vol2.pdf","129333_of_0222_vol3.pdf","129408_of_0504_1975_subsea_cable_route_studies.pdf","129408_of_0504_report_on_a_brief_search_for_data.pdf","129470_OF0381BOOK.pdf","129471_of_0382_part2_magnetic_tape_users_manual.pdf","129477_of_0389-ps110.pdf","129477_of_0389-ps111.pdf","129477_of_0389-ps112.pdf","129477_of_0389-ps113.pdf","129477_of_0389-ps114.pdf","129477_of_0389-ps115.pdf","129477_of_0389-ps116.pdf","129477_of_0389-ps201.pdf","129477_of_0389-ps202.pdf","129477_of_0389-ps203.pdf","129477_of_0389-ps204.pdf","129477_of_0389-ps205.pdf","129477_of_0389-ps206.pdf","129477_of_0389-ps207.pdf","129506_of_0605_Part1.pdf","129506_of_0605_Part2.pdf","129511_of_0522a.pdf","129512_of_0522b.pdf","129859_of_0978_vol1.pdf","129859_of_0978_vol2.pdf","129897_of_1116_report.pdf","130263_of_1358.pdf","130281_of_1360.pdf","130282_of_1361.pdf","130283_of_1362.pdf","130284_of_1363.pdf","130285_of_1364.pdf","130761_of_1888_report.pdf","130761_of_1888_report_seismic.pdf","130761_of_1888_report_technical.pdf","130798_of_2110_v2.pdf","130451_of_1921_76E-11.pdf","130451_of_1921_76E-12.pdf","130451_of_1921_76E-13.pdf","130451_of_1921_76E-14.pdf","130483_Of_1638_(21G & 21H).pdf","130484_of_1641.pdf","130485_Of_1642_(74C & 74F).pdf","130592_OF1992BOOK.pdf","130773_OF 2125 vol 1.pdf","193364_OF2439BOOK.pdf","193494_OF2731BOOK.pdf","209907_PART1.PDF","209907_PART2.PDF","209907_PART3.PDF","209907_PART4.PDF","209907_PART5.PDF","209907_PART6.PDF","209907_PART7.PDF","209907_PART8.PDF","209907_PART9.PDF","209907_PART10.PDF","209974_PAPER.PDF","210074_bu_504.pdf","211376_Text.pdf","211434_bu_539.pdf","211641_bu_554.pdf","211793_Report.pdf","211804_of_3954-r.pdf","211874_bu_559.pdf","212098_OF3755_report.pdf","212711_Open File D2952.pdf","212842_OF4115.pdf","213037_saskatchewan.PDF","213996_Title_Page.pdf","210350_bu_498_gsc.pdf","214294_CPT_TEXT.PDF","214294_EM_TEXT.PDF","214294_GPR_TEXT.PDF","214294_INTRO_1.PDF","214294_INTRO_2.PDF","214294_INTRO_3.PDF","214294_PAPERS.PDF","214294_MODELS.PDF","214294_SEISMIC.PDF","214294_SFU_TEXT.PDF","214399_arcexplorer.pdf","214994_OF1670-s.pdf","215634_arcexplorer.pdf","215634_arcexplorer.pdf","221206_mr90_e.pdf","221526_INTRODUCTION.pdf","222773_of5327.pdf","222878_rop_1866-69_mono.pdf","224031_of5350.pdf","224968_Report 90-310.pdf","224968_of_2685.pdf","224968_of_2745.pdf","224968_of_2750.pdf","224968_of_2875.pdf","247630_ar_011_s.pdf","248232_SRreport.pdf","261330_OF5989_CCGS_Hudson_2008-029_cruise_report.pdf","263412_of5611.pdf","291751_Summary.pdf","291931_of_5487_15_App1.pdf","291931_of_5487_15_App2.pdf","291931_of_5487_15_App3.pdf","291931_of_5487_15_App4.pdf","292017_OF7106.pdf","292662_OpenFile7421.pdf","292800_OF7364.pdf","292870_of_7412_report.pdf","293760_Sum_Rep_1924_B.pdf","293877_bu_606_gsc.pdf","295079_bu_604_gsc.pdf","296502_bu_604_gsc.pdf","298718_PAPER.PDF","305397_of_0116Suppl_ReleaseNotice.pdf","307813_cmb_632.pdf","313749_of_0045_gc.pdf","8331_of_78_01.pdf","8747_do_gms_012_015.pdf","8748_do_gms_016_020.pdf"**

### 4. Zip dirs containing txt but no pdf files 
The files often reference CD media in another format which contain data and observations. We did not attempt to extract the data. 

### 5. WP and RTFs files
These files dont contain PDF or TXT files, but instead contain WP and RTFs files.

### 5. Unprocessed files (files which dont contain txt or pdf)
**'109528.zip','119487.zip','129399.zip','130457.zip','130911.zip','131318.zip','131699.zip','133405.zip','183968.zip','184082.zip','184150.zip','184214.zip','192437.zip','192442.zip','194063.zip','194079.zip','194080.zip','203270.zip','208314.zip','209916.zip','210093.zip','210113.zip','210377.zip','210902.zip','211290.zip','211693.zip','212642.zip','214521.zip','226533.zip','248120.zip','263390.zip','263391.zip','287420.zip','291819.zip','293108.zip','293154.zip','293658.zip','295695.zip','296405.zip','296406.zip','297628.zip','299666.zip','299667.zip','299668.zip','299728.zip','302765.zip','305337.zip','305363.zip','305827.zip','305828.zip'**

### ZIP dirs from which we failed to unzip PDFs:
**129399.zip,226672.zip 291819.zip **

In [6]:
len(get_all_zip_files())

4873

### 1. Extract **of_number.pdf**

In [724]:
zips_to_process = get_unprocessed_files()
for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"of_pdf"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            for filename in zip_file.namelist():
                if does_has_of_pdf(filename):
                    zip_processor.extract_file(filename)
                else:
                    zip_processor.ignore_file(filename)
        if zip_processor.has_extracted():
            zip_processor.finish_processing()
    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

Failed to extract file: of_7229.pdf for zip: 291819.zip


In [735]:
get_stats("of_pdf")

Number of zips respecting selection criteria and having PDFs:1159
Number of zips respecting selection criteria and not having PDFs:0


### 2. For files containing PDF directory

In [736]:
def custom_pdfdir_ignore(geoid,filename):
    """
    This function is used to ignore maps, tables and samples for specific GEO Ids
    """
    
    if geoid in ["213391","224262","211485"]:
        return True
    
    # samples
    if geoid == "221215" and (not "Intro4GSCShells.pdf" in filename):
        return True
    
    if geoid == "223383" and (not "ofcat.pdf" in filename):
        return True
    
    if geoid == "223386" and (not "of5442.pdf" in filename):
        return True
    
    if geoid == "224581" and (not "of5660.pdf" in filename):
        return True
    
    if geoid == "222149" and (not "of5088.pdf" in filename):
        return True

    if geoid == "222774" and (not "of5343.pdf" in filename):
        return True
    
    if geoid == "248223" and (not "of6274.pdf" in filename):
        return True

    if geoid == "247843" and (not "of5574.pdf" in filename):
        return True
    
    if geoid == "220364" and (not "of4887.pdf" in filename):
        return True
    
    # Extra 
    if geoid == "134069" and ("_errata" in filename):
        return True

    if geoid == "205729" and (not "bu_484.pdf" in filename):
        return True
    
    if (geoid in ["215375","216684"]) and (len(basename(filename)) > 8 ):
        return True
    
    if geoid == "215739" and (not "GSC_B575.pdf" in filename):
        return True
    
    if geoid == "215877" and (not "GSC_B577.pdf" in filename):
        return True
    
    if geoid == "216570" and (not "GSC_B582.pdf" in filename):
        return True
    
    if geoid == "224263" and (not "of5538.pdf" in filename):
        return True
    
    if geoid == "224797" and (not "B590.pdf" in filename):
        return True
    
    if geoid == "224807" and (not "b_533.pdf" in filename):
        return True
    
    if geoid == "226367" and (not "bu_595.pdf" in filename):
        return True
    
    if (geoid == "247526") and ((len(basename(filename)) > 10 ) and filename != "592_summary.pdf"):
        return True
    
    if geoid == "220345" and (not "GSC_B560.pdf" in filename):
        return True
    

In [737]:
filter_keywords = ["front","statistics","index","bibliography","table_of_contents","author","contents",
                   "cover","foreword","plate","preface","table","figure","fig","graph","map","line","appendix",
                   "links","locations","count","data","legend","homepage","_tab","_f0","as_","location",
                   "articles","title"]
french_keywords = ["_fr.","_fr_","-fr_","_fr-","-fr-","french"]
filter_keywords += french_keywords

zips_to_process = get_unprocessed_files()
for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"has_pdf_dir"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            # check first if it is a PDF DIR
            has_pdf_dir = False
            for filename in zip_file.namelist():
                if len(split_dir := filename.split("/")) > 1 and (split_dir[0].lower() == "pdf"):
                    # we have at lease one PDF dir so we can extract the data:
                    has_pdf_dir = True
                    break

            if has_pdf_dir:
                for filename in zip_file.namelist():
                    if is_zip_dir(filename) \
                        or (len(filename) < 4) \
                        or (not filename.lower().startswith("pdf")) \
                        or (not is_pdf(filename.lower())) \
                        or bool([ele for ele in filter_keywords if (ele in filename.lower())]) \
                        or custom_pdfdir_ignore(get_geoid(zip_filename),filename):
                        zip_processor.ignore_file(filename)
                    else:
                        zip_processor.extract_file(filename)
                zip_processor.finish_processing()

    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

In [738]:
get_stats("has_pdf_dir")

Number of zips respecting selection criteria and having PDFs:136
Number of zips respecting selection criteria and not having PDFs:0


### 3. Other files containing at least one PDF

In [739]:
filter_file_by_geoid = ["100357_of_129_AccountLakeSed.pdf",
                      "100358_of_0133_Tome1_texte.pdf",
                      "100379_of_0471_part_1.pdf",
                      "100379_of_0471_part_2.pdf",
                      "100500_m_26.pdf",
                      "100506_me_32.pdf",
                      "100518_me_296.pdf",
                      "100547_me_320.pdf",
                      "100795_m_385.pdf",
                        "100797_me_165.pdf",
                        "100794_me_163.pdf",
                        "100808_me_176.pdf",
                        "100849_me_147.pdf",
                        "101153_pa_38-21.pdf",
                        "101157_pa_38-16.pdf",
                        "101560_me_68.pdf",
                        "101569_m_77.pdf",
                        "101637_m_211.pdf",
                        "101683_m_121.pdf",
                        "101793_wp_322.pdf",
                        "102157_bu_306.pdf",
                        "102307_of_0040_Report.pdf",
                        "102433_pa_71-19.pdf",
                        "102592_Paper_75-27.pdf",
                        "102612_pa_75_41.pdf",
                        "102624_pa_76-17.pdf",
                        "102634_pa_76-29.pdf",
                        "103299_m_372.pdf",
                        "104332_bu_276.pdf",
                        "108390_pa_51_11.pdf",
                        "108440_pa_50_8.pdf",
                        "119605_of_1081_Volume1.pdf",
                        "119605_of_1081_Volume2.pdf",
                        "119605_of_1081_Volume3.pdf",
                        "119739_pa_83-31.pdf",
                        "119943_me_77_f.pdf",
                        "119943_me_77_f.pdf",
                        "120589_pa_84-11.pdf",
                        "120602_pa_85-16.pdf",
                        "123565_rop_1863_Atlas_eng.pdf",
                        "123575_rop_1866-69_french_mono.pdf",
                        "123889_rop_1866-69_french_part i.pdf",
                        "123890_rop_1866-69_french_part ii.pdf",
                        "128139_of_2026.pdf",
                        "129132_of_0094Report.pdf",
                        "129178_of_0116_Report.pdf",
                        "129287_of_0487_part_1.pdf",
                        "129287_of_0487_part_2.pdf",
                        "129333_of_0222_vol1.pdf",
                        "129333_of_0222_vol2.pdf",
                        "129333_of_0222_vol3.pdf",
                        "129408_of_0504_1975_subsea_cable_route_studies.pdf",
                        "129408_of_0504_report_on_a_brief_search_for_data.pdf",
                        "129470_OF0381BOOK.pdf",
                        "129471_of_0382_part2_magnetic_tape_users_manual.pdf",
                        "129477_of_0389-ps110.pdf",
                        "129477_of_0389-ps111.pdf",
                        "129477_of_0389-ps112.pdf",
                        "129477_of_0389-ps113.pdf",
                        "129477_of_0389-ps114.pdf",
                        "129477_of_0389-ps115.pdf",
                        "129477_of_0389-ps116.pdf",
                        "129477_of_0389-ps201.pdf",
                        "129477_of_0389-ps202.pdf",
                        "129477_of_0389-ps203.pdf",
                        "129477_of_0389-ps204.pdf",
                        "129477_of_0389-ps205.pdf",
                        "129477_of_0389-ps206.pdf",
                        "129477_of_0389-ps207.pdf",
                        "129506_of_0605_Part1.pdf",
                        "129506_of_0605_Part2.pdf",
                        "129511_of_0522a.pdf",
                        "129512_of_0522b.pdf",
                        "129859_of_0978_vol1.pdf",
                        "129859_of_0978_vol2.pdf",
                        "129897_of_1116_report.pdf",
                        "130263_of_1358.pdf",
                        "130281_of_1360.pdf",
                        "130282_of_1361.pdf",
                        "130283_of_1362.pdf",
                        "130284_of_1363.pdf",
                        "130285_of_1364.pdf",
                        "130761_of_1888_report.pdf",
                        "130761_of_1888_report_seismic.pdf",
                        "130761_of_1888_report_technical.pdf",
                        "130798_of_2110_v2.pdf",
                        "130451_of_1921_76E-11.pdf",
                        "130451_of_1921_76E-12.pdf",
                        "130451_of_1921_76E-13.pdf",
                        "130451_of_1921_76E-14.pdf",
                        "130483_Of_1638_(21G & 21H).pdf",
                        "130484_of_1641.pdf",
                        "130485_Of_1642_(74C & 74F).pdf",
                        "130592_OF1992BOOK.pdf",
                        "130773_OF 2125 vol 1.pdf",
                        "193364_OF2439BOOK.pdf",
                        "193494_OF2731BOOK.pdf",
                        "209907_PART1.PDF",
                        "209907_PART2.PDF",
                        "209907_PART3.PDF",
                        "209907_PART4.PDF",
                        "209907_PART5.PDF",
                        "209907_PART6.PDF",
                        "209907_PART7.PDF",
                        "209907_PART8.PDF",
                        "209907_PART9.PDF",
                        "209907_PART10.PDF",
                        "209974_PAPER.PDF",
                        "210074_bu_504.pdf",
                        "211376_Text.pdf",
                        "211434_bu_539.pdf",
                        "211641_bu_554.pdf",
                        "211793_Report.pdf",
                        "211804_of_3954-r.pdf",
                        "211874_bu_559.pdf",
                        "212098_OF3755_report.pdf",
                        "212711_Open File D2952.pdf",
                        "212842_OF4115.pdf",
                        "213037_saskatchewan.PDF",
                        "213996_Title_Page.pdf",
                        "210350_bu_498_gsc.pdf",
                        "214294_CPT_TEXT.PDF",
                        "214294_EM_TEXT.PDF",
                        "214294_GPR_TEXT.PDF",
                        "214294_INTRO_1.PDF",
                        "214294_INTRO_2.PDF",
                        "214294_INTRO_3.PDF",
                        "214294_PAPERS.PDF",
                        "214294_MODELS.PDF",
                        "214294_SEISMIC.PDF",
                        "214294_SFU_TEXT.PDF",
                        "214399_arcexplorer.pdf",
                        "214994_OF1670-s.pdf",
                        "215634_arcexplorer.pdf",
                        "215634_arcexplorer.pdf",
                        "221206_mr90_e.pdf",
                        "221526_INTRODUCTION.pdf",
                        "222773_of5327.pdf",
                        "222878_rop_1866-69_mono.pdf",
                        "224031_of5350.pdf",
                        "224968_Report 90-310.pdf",
                        "224968_of_2685.pdf",
                        "224968_of_2745.pdf",
                        "224968_of_2750.pdf",
                        "224968_of_2875.pdf",
                        "247630_ar_011_s.pdf",
                        "248232_SRreport.pdf",
                        "261330_OF5989_CCGS_Hudson_2008-029_cruise_report.pdf",
                        "263412_of5611.pdf",
                        "291751_Summary.pdf",
                        "291931_of_5487_15_App1.pdf",
                        "291931_of_5487_15_App2.pdf",
                        "291931_of_5487_15_App3.pdf",
                        "291931_of_5487_15_App4.pdf",
                        "292017_OF7106.pdf",
                        "292662_OpenFile7421.pdf",
                        "292800_OF7364.pdf",
                        "292870_of_7412_report.pdf",
                        "293760_Sum_Rep_1924_B.pdf",
                        "293877_bu_606_gsc.pdf",
                        "295079_bu_604_gsc.pdf",
                        "296502_bu_604_gsc.pdf",
                        "298718_PAPER.PDF",
                        "305397_of_0116Suppl_ReleaseNotice.pdf",
                        "307813_cmb_632.pdf",
                        "313749_of_0045_gc.pdf",
                        "8331_of_78_01.pdf",
                        "8747_do_gms_012_015.pdf",
                        "8748_do_gms_016_020.pdf",

                     ]

files_by_geoid = dict()
for file in filter_file_by_geoid:
    geoid = file.split("_")[0]
    base_file_name = file[len(geoid)+1:]
    if not geoid in files_by_geoid:
        files_by_geoid[geoid] = []
    files_by_geoid[geoid].append(base_file_name)

#224834 having french journal articles about earthquakes. Not sure we need that.

def custom_pdf_generic_ignore(geoid,filename):
    if geoid in ["100377","109277","119940","123533","127153","127162","127165","127166","127167","127168", 
                 "127169","127170","127171","127172","127173","127174","127175","127176","127177", 
                 "127178","127179","127180","127181","127182","127183","127184","127185","127186", 
                 "127187","127188","127189","127190","127191","127192","127193","127194","127195", 
                 "127196","127197","127198","127199","127200","127201","127202","127203","127204", 
                 "127205","127206","127207","127208","129093","129094","129099","129142","129143", 
                 "129148","129151","129175","129183","129210","129211","129212","129213","129232",
                 "129250","129273","129347","129447","129463","129471","129686","129733","130006",
                 "130030","130437","130570","130912","132668","183851","208238","208241","209370",
                 "210616","210617","210627","210637","211515","212607","214638","215455","222822",
                 "224834","247421","247421","247424","247425","247678","285489","285569","286078",
                 "286185","286262","287133","287847","290255","8315"
                ]:
        return True
    
    if geoid in files_by_geoid and not (basename(filename) in files_by_geoid[geoid]):
        return True
    
    return False

In [740]:
%%bash 
rm -rf /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/generic_pdfs/*

In [741]:
zips_to_process = get_unprocessed_files()
filter_keywords = ["front","statistics","index","bibliography","table_of_contents",
                   "author","contents","cover","foreword","plate","preface",
                   "table","figure","fig","graph","map","line","appendix",
                   "links","legend","homepage","sheet","chart","_mn0","_mn1","plan",
                   "cross","data","overlay","_nt0","_xs0","_xs1","_xs2","_xs3",
                   "_fg0","_fg1","_fg2","_fg3","acrobat","readme","pictou","coulor",
                   "reader","licen","start","lisez","carte","disclaimer","_tab_",
                   "db_schema","_colour","acknowledgment","trademark","gscmcm_","_mcm",]

french_keywords = ["_fr.","_fr_","-fr_","_fr-","-fr-","french"]
filter_keywords += french_keywords

for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"generic_pdfs"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            # check first if has a pdf
            has_pdf = False
            for filename in zip_file.namelist():
                if get_extension(filename.lower()) == "pdf":
                    has_pdf = True
                    break

            if has_pdf:
                for filename in zip_file.namelist():
                    if is_zip_dir(filename) \
                        or (not is_pdf(filename.lower())) \
                        or bool([ele for ele in filter_keywords if (ele in filename.lower())]) \
                        or custom_pdf_generic_ignore(get_geoid(zip_filename),filename):
                        zip_processor.ignore_file(filename)
                    else:
                        zip_processor.extract_file(filename)
                zip_processor.finish_processing()

    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

Failed to extract file: OF_283_Halifax_Harbour_Bottom_Survey_1974_Seismic_Reflection_Profiles.pdf for zip: 129399.zip
Failed to extract file: Earthquake-catalogue.pdf for zip: 226672.zip
Failed to extract file: of_7229.pdf for zip: 291819.zip


In [4]:
get_stats("generic_pdfs")

Number of zips respecting selection criteria and having PDFs:3439
Number of zips respecting selection criteria and not having PDFs:0


### 4. Zip dirs containing txt but no pdf files 

In [745]:
filter_file_by_geoid = ["205313_OF2867.TXT",
                        "205313_OF2868.TXT",
                        "205313_OF2974.TXT",
                        "208180_VOL-I.TXT",
                        "208180_VOL-II.TXT",
                        
                       ]

files_by_geoid = dict()
for file in filter_file_by_geoid:
    geoid = file.split("_")[0]
    base_file_name = file[len(geoid)+1:]
    if not geoid in files_by_geoid:
        files_by_geoid[geoid] = []
    files_by_geoid[geoid].append(base_file_name)
    

def custom_txt_generic_ignore(geoid,filename):
    if geoid in ["195142","203760","205765","208515","209895"]:
        return True
    
    if geoid in files_by_geoid and not (basename(filename) in files_by_geoid[geoid]):
        return True
    
    return False

In [746]:
%%bash 
rm -rf /nrcan_p2/data/01_raw/20201117/geoscan/raw/extracted/txt/*

In [747]:
zips_to_process = get_unprocessed_files()
filter_keywords = ["front","statistics","index","bibliography","table_of_contents",
                   "author","contents","cover","foreword","plate","preface",
                   "table","figure","fig","graph","map","line","appendix",
                   "links","legend","homepage","sheet","chart","_mn0","_mn1","plan",
                   "cross","data","overlay","_nt0","_xs0","_xs1","_xs2","_xs3",
                   "_fg0","_fg1","_fg2","_fg3","acrobat","readme","pictou","coulor",
                   "reader","licen","start","lisez","carte","disclaimer","_tab_",
                   "db_schema","_colour","acknowledgment","trademark","gscmcm_","_mcm",]

french_keywords = ["_fr.","_fr_","-fr_","_fr-","-fr-","french"]
filter_keywords += french_keywords
for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"txt"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            # check first if it has a txt
            has_txt = False
            for filename in zip_file.namelist():
                if get_extension(filename.lower()) == "txt":
                    has_txt = True
                    break

            if has_txt:
                for filename in zip_file.namelist():
                    if is_zip_dir(filename) \
                        or (not is_txt(filename.lower())) \
                        or bool([ele for ele in filter_keywords if (ele in filename.lower())]) \
                        or custom_txt_generic_ignore(get_geoid(zip_filename),filename):
                        zip_processor.ignore_file(filename)
                    else:
                        zip_processor.extract_file(filename)
                zip_processor.finish_processing()

    except Exception as err:
        print(f'Error: {err}. Failed to extract file: {filename} for zip: {zip_filename}')

Error: That compression method is not supported. Failed to extract file: ECW/list.txt for zip: 291819.zip


In [748]:
get_stats("txt","txt")

Number of zips respecting selection criteria and having txt:79
Number of zips respecting selection criteria and not having txt:0


### 5. WP and RTFs files  

In [749]:
zips_to_process = get_unprocessed_files()
for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"wp_rtf"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            for filename in zip_file.namelist():
                if is_zip_dir(filename) or (not (is_wp(filename.lower()) or is_rtf(filename.lower()))):
                    zip_processor.ignore_file(filename)
                else:
                    zip_processor.extract_file(filename)
            if zip_processor.has_extracted():
                zip_processor.finish_processing()
    except Exception as err:
        print(f'Error: {err}. Failed to extract file: {filename} for zip: {zip_filename}')

Error: That compression method is not supported. Failed to extract file: Readme.rtf for zip: 291819.zip


In [750]:
get_stats("wp_rtf","WP and RTF files")

Number of zips respecting selection criteria and having WP and RTF files:10
Number of zips respecting selection criteria and not having WP and RTF files:0


### 6. Unprocessed files (files which dont contain txt or pdf)

In [751]:
len(get_unprocessed_files())

50

In [752]:
get_unprocessed_files()

['109528.zip',
 '119487.zip',
 '129399.zip',
 '130457.zip',
 '130911.zip',
 '131318.zip',
 '131699.zip',
 '133405.zip',
 '183968.zip',
 '184082.zip',
 '184150.zip',
 '184214.zip',
 '192437.zip',
 '192442.zip',
 '194063.zip',
 '194079.zip',
 '194080.zip',
 '203270.zip',
 '208314.zip',
 '209916.zip',
 '210093.zip',
 '210113.zip',
 '210377.zip',
 '210902.zip',
 '211290.zip',
 '211693.zip',
 '212642.zip',
 '214521.zip',
 '226533.zip',
 '248120.zip',
 '263390.zip',
 '263391.zip',
 '287420.zip',
 '291819.zip',
 '293108.zip',
 '293154.zip',
 '293658.zip',
 '295695.zip',
 '296405.zip',
 '296406.zip',
 '297628.zip',
 '299666.zip',
 '299667.zip',
 '299668.zip',
 '299728.zip',
 '302765.zip',
 '305337.zip',
 '305363.zip',
 '305827.zip',
 '305828.zip']

### 7. Extracting maps, crossections and other combinations of maps and textual data

In [None]:
zips_to_process = get_all_zip_files()
filters_to_keep = ["front","plate","figure","fig","graph","map","line","appendix",
                   "legend","sheet","chart","_mn0","_mn1","plan",
                   "cross","overlay","_nt0","_xs0","_xs1","_xs2","_xs3",
                   "_fg0","_fg1","_fg2","_fg3","gscmcm_","_mcm",]

for zip_filename in zips_to_process:
    try:
        zip_processor = ZipProcessor(join(extracted_dir,"low_text_pdfs"),join(zip_dir,zip_filename))
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            for filename in zip_file.namelist():
                lfile_name= filename.lower()
                has_pdf = False
                for filename in zip_file.namelist():
                    if get_extension(lfile_name) == "pdf":
                        has_pdf = True
                        break
            
                if has_pdf:
                    for filename in zip_file.namelist():
                        if not is_pdf(filename) or \
                            not bool([ele for ele in filters_to_keep if (ele in filename.lower())]):
                            zip_processor.ignore_file(filename)
                        else:
                            zip_processor.extract_file(filename)
                    
                    if zip_processor.has_extracted():
                        zip_processor.finish_processing()
                        
    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

Failed to extract file: OF_283_Bedford_Basin_Lines_305_to_end_of_line.pdf for zip: 129399.zip


In [5]:
get_stats("low_text_pdfs","Extra pdfs with low text")

Number of zips respecting selection criteria and having Extra pdfs with low text:27321
Number of zips respecting selection criteria and not having Extra pdfs with low text:0


In [None]:
%%bash
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/tmp

### Detecting french files

In [698]:
%%bash
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/tmp
rm -rf /nrcan_p2/data/01_raw/20201117/geoscan/raw/tmp/*

In [699]:
tmp_raw="/nrcan_p2/data/01_raw/20201117/geoscan/raw/tmp"
zips_to_process = get_all_zip_files()
count = 0

for zip_filename in zips_to_process:
    try:
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            # check first if has a pdf
            has_pdf = False
            for filename in zip_file.namelist():
                if get_extension(filename.lower()) == "pdf":
                    has_pdf = True
                    break

            if has_pdf:
                
    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

### Analysing extra files

In [683]:
%%bash
mkdir -p /nrcan_p2/data/01_raw/20201117/geoscan/raw/extra_tests
rm -rf /nrcan_p2/data/01_raw/20201117/geoscan/raw/extra_tests/*

In [684]:
filter_keywords = ["front","statistics","index","bibliography","table_of_contents",
                   "author","contents","cover","foreword","plate","preface",
                   "table","figure","fig","graph","map","line","appendix",
                   "links","legend","homepage","sheet","chart","_mn0","_mn1","plan",
                   "cross","data","overlay","_nt0","_xs0","_xs1","_xs2","_xs3",
                   "_fg0","_fg1","_fg2","_fg3","acrobat","readme","pictou","coulor",
                   "reader","licen","start","lisez","carte","disclaimer","_tab_",
                   "db_schema","_colour","acknowledgment","trademark","gscmcm_","_mcm",]

In [685]:
count_by_keyword = dict()
for keyword in filter_keywords:
    count_by_keyword[keyword] =0
    MAX_KEYWORD_COUNT=5

In [686]:
extra_tests="/nrcan_p2/data/01_raw/20201117/geoscan/raw/extra_tests"

In [687]:
zips_to_process = get_all_zip_files()
for zip_filename in zips_to_process:
    try:
        with ZipFile(join(zip_dir,zip_filename), 'r') as zip_file:
            already_used = False
            for filename in zip_file.namelist():
                if already_used:
                    break
                lfile_name= filename.lower()
                if is_pdf(lfile_name):
                    for keyword in filter_keywords:
                        if (keyword in lfile_name) and (count_by_keyword[keyword] < MAX_KEYWORD_COUNT):
                            count_by_keyword[keyword] +=1
                            zip_file.extract(filename,extra_tests)
                            geoscan_id =zip_filename.split("_")[0]
                            if len(fileparts := filename.split("/")) > 1:
                                move(join(extra_tests,filename),join(extra_tests,f'{keyword}_{geoscan_id}_{fileparts[-1]}'))
                                rmtree(join(extra_tests,fileparts[0]))
                            else:
                                rename(join(extra_tests,filename),join(extra_tests,f'{keyword}_{geoscan_id}_{filename}'))
                            
                            already_used = True
                            break
                
    except Exception as err:
        print(f'Failed to extract file: {filename} for zip: {zip_filename}')

Failed to extract file: OF_283_Bedford_Basin_Lines_328_to_end_of_line_Time_Start_1848_.pdf for zip: 129399.zip
