In [3]:
import pandas as pd
from os import listdir,environ
from sys import argv
from os.path import isfile,join
import os
import zipfile

In [2]:
# Folder layout of the GEOSCAN dump: the files live under
# <root>/data/01_raw/20201006/geoscan, and the archives inspected by this
# notebook are read from its raw/ sub-folder.
root_dir = "/nrcan_p2"
data_dir = join(root_dir, "data")
geoscan_subfolders = ("01_raw", "20201006", "geoscan")
geoscan_files_dir = join(data_dir, *geoscan_subfolders)
raw_dir = join(geoscan_files_dir, "raw")

In [4]:
# Names (not full paths) of the regular files sitting directly in raw_dir;
# sub-directories are skipped. The listing order is kept as returned by listdir.
onlyfiles = list(filter(lambda name: isfile(join(raw_dir, name)), listdir(raw_dir)))

In [5]:
def to_gb(size_in_bytes):
    """Convert a byte count to gigabytes, using 1 GB = 1024**3 bytes."""
    bytes_per_gb = 1024 ** 3
    return size_in_bytes / bytes_per_gb
def print_mb_as_gbs(size_in_mbytes):
    """Format a size given in MB as a whole number of GB, e.g. 2048 -> '2 GB'.

    Despite the name, nothing is printed: the formatted string is returned.
    """
    size_in_gbytes = size_in_mbytes / 1024
    return "{:.0f} GB".format(size_in_gbytes)


In [120]:
# Inventory of every zip archive found in raw_dir: one row per archive with its
# size on disk and a summary of what its member list contains.

# Column order of the resulting DataFrame.
ZIP_SUMMARY_COLUMNS = [
    "filename",
    "type",
    "size_mb",
    "nb_files",
    "contained_extensions",
    "has_root_dir",
    "txt_count",
    "pdf_count",
    "wp_count",
    "has_pdf_dir",
]


def _summarize_zip_members(member_paths):
    """Derive the per-archive statistics from a list of zip member names.

    Args:
        member_paths: Member names as returned by ``ZipFile.namelist()``;
            path components are separated by "/".

    Returns:
        dict with the keys ``nb_files``, ``contained_extensions``,
        ``has_root_dir``, ``txt_count``, ``pdf_count``, ``wp_count`` and
        ``has_pdf_dir``.
    """
    ext_count = {}
    has_pdf_dir = False
    min_depth = None  # smallest number of path components seen so far
    for path in member_paths:
        split_path = path.split("/")
        depth = len(split_path)
        if min_depth is None or depth < min_depth:
            min_depth = depth
        # Flag archives whose first path component is "pdf" (case-insensitive).
        if split_path[0].lower() == "pdf":
            has_pdf_dir = True
        # Extension = text after the last "." of the final path component,
        # lower-cased; "none" when that component has no dot.
        # NOTE: directory entries (names ending in "/") have an empty final
        # component, so they are tallied as members with extension "none".
        base_name = split_path[-1]
        extension = base_name.split(".")[-1].lower() if "." in base_name else "none"
        ext_count[extension] = ext_count.get(extension, 0) + 1

    return {
        "nb_files": len(member_paths),
        "contained_extensions": list(ext_count.keys()),
        # True when no member sits directly at the archive root, i.e. every
        # name has at least one "/". An empty archive reports False.
        "has_root_dir": min_depth is not None and min_depth > 1,
        "txt_count": ext_count.get("txt", 0),
        "pdf_count": ext_count.get("pdf", 0),
        "wp_count": ext_count.get("wp", 0),
        "has_pdf_dir": has_pdf_dir,
    }


files_by_size = {column: [] for column in ZIP_SUMMARY_COLUMNS}

for item in onlyfiles:
    file_type = item.split(".")[-1]
    if file_type != "zip":
        continue
    zip_path = join(raw_dir, item)
    size_mb = os.path.getsize(zip_path) / 1024 / 1024
    # Only the member list is needed; the context manager closes the handle so
    # scanning thousands of archives does not leak open files.
    with zipfile.ZipFile(zip_path) as zip_file:
        member_paths = zip_file.namelist()

    record = {"filename": item, "type": file_type, "size_mb": size_mb}
    record.update(_summarize_zip_members(member_paths))
    for column in ZIP_SUMMARY_COLUMNS:
        files_by_size[column].append(record[column])

df = pd.DataFrame.from_dict(files_by_size)
zip_df = df[df["type"] == "zip"].copy()

### Stats
* 4873 zip files
* 802 of these contain text files
    * 724 contain both txt and pdf
        * 136 of the archives with text contain a top-level PDF folder grouping the pdfs (my understanding: one article), while 1 archive with a PDF folder did not contain text
    * 78 contain only text and no pdf
* 4013 contain pdfs only and no text
    * 3623 contain more than 1 pdf
    * 390 contain only 1 pdf
* 70 files contain more than 100 pdfs

In [94]:
# Total number of zip archives in the inventory.
len(zip_df)

4873

In [95]:
# Archives containing at least one .txt member.
len(zip_df.query("txt_count > 0"))

802

In [88]:
# Archives containing both .txt and .pdf members.
len(zip_df.query("txt_count > 0 and pdf_count > 0"))

724

In [134]:
# Archives with a top-level "pdf" folder that also contain .txt members.
len(zip_df.query("has_pdf_dir == True and txt_count > 0"))

136

In [135]:
# Archives with a top-level "pdf" folder and no .txt member at all.
len(zip_df.query("has_pdf_dir == True and txt_count == 0"))

1

In [89]:
# Archives containing .txt members but no .pdf.
len(zip_df.query("txt_count > 0 and pdf_count == 0"))

78

In [149]:
# Archives whose only document-like members are .wp files (no .pdf, no .txt).
len(zip_df.query("pdf_count == 0 and txt_count == 0 and wp_count > 0"))

0

In [99]:
# Archives containing .pdf members but no .txt.
len(zip_df.query("txt_count == 0 and pdf_count > 0"))

4013

In [100]:
# Archives without .txt that contain more than one .pdf.
len(zip_df.query("txt_count == 0 and pdf_count > 1"))

3623

In [101]:
# Archives without .txt that contain exactly one .pdf.
len(zip_df.query("txt_count == 0 and pdf_count == 1"))

390

In [114]:
# Archives holding more than 100 .pdf members, regardless of other content.
len(zip_df.query("pdf_count > 100"))

70

In [69]:
# Summary statistics (count, mean, std, quartiles, min/max) of the inventory columns.
df.describe()

Unnamed: 0,size_mb,nb_files,txt_count,pdf_count
count,4873.0,4873.0,4873.0,4873.0
mean,119.022828,108.7677,5.34291,11.654012
std,491.811752,953.723604,155.231344,115.349882
min,0.005001,1.0,0.0,0.0
25%,10.565957,2.0,0.0,2.0
50%,22.976173,3.0,0.0,2.0
75%,58.592632,13.0,0.0,5.0
max,9968.041075,24975.0,10106.0,6702.0


### Examples

#### Text and pdf: 724 contain txt and pdf

In [108]:
# First few archives that contain both .txt and .pdf members.
zip_df.query("txt_count > 0 and pdf_count > 0").head()

Unnamed: 0,filename,type,size_mb,nb_files,contained_extensions,has_root_dir,txt_count,pdf_count,wp_count
8,8338.zip,zip,8.110393,2,"[pdf, txt]",False,1,1,0
80,90183.zip,zip,3.946605,11,"[none, wp, pdf, xls, txt]",True,1,1,4
1311,102460.zip,zip,345.484759,38,"[txt, none, bin, exe, idx, log, pdx, pdf, bmp,...",False,8,14,0
1502,105577.zip,zip,168.612314,26,"[ini, inf, txt, dmg, exe, pdf, bmp, ico]",False,8,11,0
1509,106037.zip,zip,73.121432,75,"[txt, none, bin, exe, idx, pdx, pdf, bmp, ico,...",False,8,52,0


In [109]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '8338.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['epbof_78_06_fr.pdf', 'epbof_78_06_fr_readme.txt']

In [137]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '90183.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['OF 2899/BLACKM55',
 'OF 2899/BLACKM55.WP',
 'OF 2899/MALLAO18',
 'OF 2899/MALLAO18.WP',
 'OF 2899/OF 2899 text_graphs.pdf',
 'OF 2899/OFR',
 'OF 2899/OFR2899.xls',
 'OF 2899/PORCUG31',
 'OF 2899/PORCUG31.WP',
 'OF 2899/README.TXT',
 'OF 2899/README.WP']

In [115]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '106037.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['readme.txt',
 'APPS/MAC/',
 'APPS/MAC/AcroReader51_ENU.bin',
 'APPS/PC/',
 'APPS/PC/AcroReader51_ENU_full.exe',
 'APPS/',
 'PDF/eg_31/',
 'PDF/eg_31/index.idx',
 'PDF/eg_31/index1.idx',
 'PDF/eg_31.pdx',
 'PDF/eg_31_01.pdf',
 'PDF/eg_31_02.pdf',
 'PDF/eg_31_03.pdf',
 'PDF/eg_31_04.pdf',
 'PDF/eg_31_05.pdf',
 'PDF/eg_31_06.pdf',
 'PDF/eg_31_07.pdf',
 'PDF/eg_31_08.pdf',
 'PDF/eg_31_09.pdf',
 'PDF/eg_31_10a.pdf',
 'PDF/eg_31_10b.pdf',
 'PDF/eg_31_10c.pdf',
 'PDF/eg_31_11.pdf',
 'PDF/eg_31_12.pdf',
 'PDF/eg_31_13.pdf',
 'PDF/eg_31_14.pdf',
 'PDF/eg_31_15.pdf',
 'PDF/eg_31_16.pdf',
 'PDF/eg_31_17.pdf',
 'PDF/eg_31_18a.pdf',
 'PDF/eg_31_18b.pdf',
 'PDF/eg_31_19.pdf',
 'PDF/eg_31_20.pdf',
 'PDF/eg_31_21a.pdf',
 'PDF/eg_31_21b.pdf',
 'PDF/eg_31_22.pdf',
 'PDF/eg_31_23.pdf',
 'PDF/eg_31_24.pdf',
 'PDF/eg_31_25.pdf',
 'PDF/eg_31_26.pdf',
 'PDF/eg_31_27.pdf',
 'PDF/eg_31_28.pdf',
 'PDF/eg_31_29.pdf',
 'PDF/eg_31_30.pdf',
 'PDF/eg_31_31.pdf',
 'PDF/eg_31_32.pdf',
 'PDF/eg_31_33.pdf',
 'PDF/eg_3

#### Text only: 78 contain txt and no pdf

In [102]:
# First few archives that contain .txt members but no .pdf.
zip_df.query("txt_count > 0 and pdf_count == 0").head()

Unnamed: 0,filename,type,size_mb,nb_files,contained_extensions,has_root_dir,txt_count,pdf_count,wp_count
0,407.zip,zip,0.381922,18,"[none, wp, xls, txt]",True,2,0,3
2617,130913.zip,zip,0.248765,3,[txt],False,3,0,0
2674,133235.zip,zip,0.350342,5,"[txt, exe, rme, dat]",False,2,0,0
2692,133472.zip,zip,58.592632,88,"[sum, dat, dxf, none, cdr, shd, geo, pl, ntx, ...",False,2,0,0
2784,192430.zip,zip,0.272566,8,"[f, doc, ps, txt, mdl, none, cmn]",False,1,0,0


In [106]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '407.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['OF 2916/ELBOW7W5',
 'OF 2916/FGAP9W5',
 'OF 2916/HARMA3W5',
 'OF 2916/HUNTE9W5',
 'OF 2916/INDEX',
 'OF 2916/JUMPI5W5',
 'OF 2916/OF2916_TEXT.WP',
 'OF 2916/OFR2916.xls',
 'OF 2916/OFRSW.TXT',
 'OF 2916/PANT10W5',
 'OF 2916/partialtext',
 'OF 2916/README.TXT',
 'OF 2916/README.WP',
 'OF 2916/SHEEP3W5',
 'OF 2916/STIMS4W5',
 'OF 2916/SULLI5W5',
 'OF 2916/TITLE.WP',
 'OF 2916/TURNE2W5']

In [105]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '130913.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['CHEM.TXT', 'FIELD.TXT', 'INTRO.TXT']

In [107]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '133472.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['sum/bsba_ss.sum',
 'sum/bsbc_sw.sum',
 'sum/bsbt_sw.sum',
 'sum/bsom_ss.sum',
 'sum/bsom_sw.sum',
 'sum/bspe_ss.sum',
 'sum/bspe_sw.sum',
 'sum/bssc_ss.sum',
 'sum/bstj_ss.sum',
 'sum/bswy_ss.sum',
 'dat/bsba_ss.dat',
 'dat/bsbc_sw.dat',
 'dat/bsbt_sw.dat',
 'dat/bsom_ss.dat',
 'dat/bsom_sw.dat',
 'dat/bspe_ss.dat',
 'dat/bspe_sw.dat',
 'dat/bssc_ss.dat',
 'dat/bstj_ss.dat',
 'dat/bswy_ss.dat',
 'dxf/bsba_ss.dxf',
 'dxf/bsbc_sw.dxf',
 'dxf/bsbt_sw.dxf',
 'dxf/bsom_ss.dxf',
 'dxf/bsom_sw.dxf',
 'dxf/bspe_ss.dxf',
 'dxf/bspe_sw.dxf',
 'dxf/bssc_ss.dxf',
 'dxf/bstj_ss.dxf',
 'dxf/bswy_ss.dxf',
 'dxf/NOTE_DXF',
 'dxf_symbolized/bsba_ss-s.dxf',
 'dxf_symbolized/bsbc_sw-s.dxf',
 'dxf_symbolized/bsom_ss-s.dxf',
 'dxf_symbolized/bsom_sw-s.dxf',
 'dxf_symbolized/bspe_ss-s.cdr',
 'dxf_symbolized/bspe_ss-s.dxf',
 'dxf_symbolized/bspe_sw-s.cdr',
 'dxf_symbolized/bspe_sw-s.dxf',
 'dxf_symbolized/bssc_ss-s.dxf',
 'dxf_symbolized/bstj_ss-s.dxf',
 'dxf_symbolized/bswy_ss-s.dxf',
 'etc/hatch.shd',
 '

#### 3623 contain more than 1 pdf and no text

In [151]:
# First few archives without .txt that contain more than one .pdf.
zip_df.query("txt_count == 0 and pdf_count > 1").head()

Unnamed: 0,filename,type,size_mb,nb_files,contained_extensions,has_root_dir,txt_count,pdf_count,wp_count,has_pdf_dir
1,8288.zip,zip,18.371377,2,[pdf],False,0,2,0,False
2,8315.zip,zip,5.093999,5,[pdf],False,0,5,0,False
3,8320.zip,zip,16.99741,9,[pdf],False,0,9,0,False
4,8331.zip,zip,11.165783,12,[pdf],False,0,12,0,False
5,8332.zip,zip,57.183168,33,[pdf],False,0,33,0,False


In [152]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '8315.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['epb_gms_157.pdf',
 'epb_gms_158.pdf',
 'epb_gms_159.pdf',
 'epb_gms_160.pdf',
 'epb_gms_161.pdf']

In [153]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '8332.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['epbof_78-04_e_1978_fg01.pdf',
 'epbof_78-04_e_1978_fg02.pdf',
 'epbof_78-04_e_1978_fg03.pdf',
 'epbof_78-04_e_1978_fg04.pdf',
 'epbof_78-04_e_1978_fg04b.pdf',
 'epbof_78-04_e_1978_fg05.pdf',
 'epbof_78-04_e_1978_fg06.pdf',
 'epbof_78-04_e_1978_fg07.pdf',
 'epbof_78-04_e_1978_fg08.pdf',
 'epbof_78-04_e_1978_fg09.pdf',
 'epbof_78-04_e_1978_fg10.pdf',
 'epbof_78-04_e_1978_fg10b.pdf',
 'epbof_78-04_e_1978_fg11.pdf',
 'epbof_78-04_e_1978_fg12.pdf',
 'epbof_78-04_e_1978_fg13.pdf',
 'epbof_78-04_e_1978_fg14.pdf',
 'epbof_78-04_e_1978_fg15.pdf',
 'epbof_78-04_e_1978_fg16.pdf',
 'epbof_78-04_e_1978_fg17.pdf',
 'epbof_78-04_e_1978_fg18.pdf',
 'epbof_78-04_e_1978_fg19.pdf',
 'epbof_78-04_e_1978_fg20.pdf',
 'epbof_78-04_e_1978_fg21.pdf',
 'epbof_78-04_e_1978_fg22.pdf',
 'epbof_78-04_e_1978_fg23.pdf',
 'epbof_78-04_e_1978_fg24.pdf',
 'epbof_78-04_e_1978_fg25.pdf',
 'epbof_78-04_e_1978_fg26.pdf',
 'epbof_78-04_e_1978_fg27.pdf',
 'epbof_78-04_e_1978_fg28.pdf',
 'epbof_78-04_e_1978_fg29.pdf',
 'epbo

#### 390 contain exactly 1 pdf and no text

In [154]:
# First few archives without .txt that contain exactly one .pdf.
zip_df.query("txt_count == 0 and pdf_count == 1").head()

Unnamed: 0,filename,type,size_mb,nb_files,contained_extensions,has_root_dir,txt_count,pdf_count,wp_count,has_pdf_dir
1356,102564.zip,zip,15.179568,2,"[pdf, tif]",False,0,1,0,False
1821,120017.zip,zip,4769.487562,10,"[mp4, pdf, rtf]",False,0,1,0,False
2057,128140.zip,zip,10.991287,7,"[pdf, asc, hdr, ngr, dos, exe]",False,0,1,0,False
2060,128150.zip,zip,0.511469,11,"[c, d, pdf, exe, for, none]",False,0,1,0,False
2062,128154.zip,zip,3.373104,5,"[log, pdf]",False,0,1,0,False


In [155]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '102564.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['pa_74-61.pdf', 'pa_74-61_fig_2.tif']

In [156]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '120017.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['OF1079 T1-Introduction.mp4',
 'OF1079 T2-Ellesmere I.-King Edward Pt to South Cape area.mp4',
 'OF1079 T3-Ellesmere I.-South Cape to Cape Donninghausen.mp4',
 'OF1079 T4-N. Kent I-Falk Pt to Cape Burgoyne, Devon I-Cape Derby to Cape Arundell.mp4',
 'OF1079 T5-Devon I.-Cape Arundell to Head of Thomas Lee Inlet.mp4',
 'OF1079 T6-Devon I.-Head of Thomas Lee Inlet to Sverdrup Inlet area.mp4',
 'OF1079 T7-Devon I.-Brae Bay area to Cape Fitzroy.mp4',
 'OF1079 T8-Coburg I..mp4',
 'of_1079.pdf',
 'readme_video_OF1079.rtf']

In [162]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '128154.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['ICE90C3.LOG', 'ICE90D3.LOG', 'of_2273.pdf', 'ICE90A3.LOG', 'ICE90B3.LOG']

#### 70 files contain more than 100 pdfs

In [159]:
# First few archives holding more than 100 .pdf members.
zip_df.query("pdf_count > 100").head()

Unnamed: 0,filename,type,size_mb,nb_files,contained_extensions,has_root_dir,txt_count,pdf_count,wp_count,has_pdf_dir
2182,129399.zip,zip,1944.482334,128,[pdf],False,0,128,0,False
3019,209907.zip,zip,287.280622,1470,"[none, pdf, wld, abt, cat, ddd, did, pdd, plc,...",False,15,654,0,False
3020,209908.zip,zip,416.651575,145,"[pdf, db]",False,0,144,0,False
3025,209922.zip,zip,1016.084459,133,"[pdf, db]",False,0,132,0,False
3152,211803.zip,zip,30.311357,152,"[xls, pdf, txt]",False,1,130,0,False


In [161]:
# Read the member list, then let the context manager close the archive handle.
with zipfile.ZipFile(join(raw_dir, '129399.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['OF_283_Bedford_Basin_Lines_304_to_end_of_line_Time_1536_to_1555.pdf',
 'OF_283_Bedford_Basin_Lines_305_to_end_of_line.pdf',
 'OF_283_Bedford_Basin_Lines_306_to_end_of_line_Time_1459.pdf',
 'OF_283_Bedford_Basin_Lines_307_to_end_of_line_Time_1038_to_1047.pdf',
 'OF_283_Bedford_Basin_Lines_308_to_end_of_line_Time_end_1058.pdf',
 'OF_283_Bedford_Basin_Lines_309_to_end_of_line_Time_end_1110.pdf',
 'OF_283_Bedford_Basin_Lines_310_to_end_of_line_Time_end_1120.pdf',
 'OF_283_Bedford_Basin_Lines_311_to_end_of_line_Time_end_1131.pdf',
 'OF_283_Bedford_Basin_Lines_312_to_end_of_line.pdf',
 'OF_283_Bedford_Basin_Lines_313_to_end_of_line_Time_end_1453.pdf',
 'OF_283_Bedford_Basin_Lines_314_to_end_of_line_Time_end_1440.pdf',
 'OF_283_Bedford_Basin_Lines_315_to_end_of_line_Time_1414_to_1424.pdf',
 'OF_283_Bedford_Basin_Lines_316_to_end_of_line_Time_end_1412.pdf',
 'OF_283_Bedford_Basin_Lines_317_to_end_of_line_Time_1540_to_1527.pdf',
 'OF_283_Bedford_Basin_Lines_318_to_end_of_line_Time_end_1539.pd

In [160]:
# Read the member list, then let the context manager close the archive handle.
# NOTE(review): zip_file is closed after this cell; confirm no later cell reads
# from it without reopening the archive.
with zipfile.ZipFile(join(raw_dir, '211803.zip')) as zip_file:
    member_names = zip_file.namelist()
member_names

['Appendix 7/Appendix 7 profiles.xls',
 'Appendix 7/Appendix 7 regional.xls',
 'Appendix 7/Appendix 7 title.xls',
 'Appendix 10/Appendix 10 tables.pdf',
 'Appendix 10/Appendix 10 title.PDF',
 'Appendix 11/Al2 map.pdf',
 'Appendix 11/Al2 stats.pdf',
 'Appendix 11/Al63 map.pdf',
 'Appendix 11/Al63 stats.pdf',
 'Appendix 11/Appendix 11 Title.PDF',
 'Appendix 11/As2 map.pdf',
 'Appendix 11/As2 stats.pdf',
 'Appendix 11/As63 map.pdf',
 'Appendix 11/As63 stats.pdf',
 'Appendix 11/Au63 map.pdf',
 'Appendix 11/Au63 stats.pdf',
 'Appendix 11/Ba2 map.pdf',
 'Appendix 11/Ba2 stats.pdf',
 'Appendix 11/Ba63 map.pdf',
 'Appendix 11/Ba63 stats.pdf',
 'Appendix 11/Ca2 map.pdf',
 'Appendix 11/Ca2 stats.pdf',
 'Appendix 11/Ca63 map.pdf',
 'Appendix 11/Ca63 stats.pdf',
 'Appendix 11/Co2 map.pdf',
 'Appendix 11/Co2 stats.pdf',
 'Appendix 11/Co63 map.pdf',
 'Appendix 11/Co63 stats.pdf',
 'Appendix 11/Cr2 map.pdf',
 'Appendix 11/Cr2 stats.pdf',
 'Appendix 11/Cr63 map.pdf',
 'Appendix 11/Cr63 stats.pdf',
 'A

In [164]:
%%bash
# Show each mounted filesystem with its type (-T) and human-readable sizes (-h).
df -Th

Filesystem                                                                                                                                               Type      Size  Used Avail Use% Mounted on
overlay                                                                                                                                                  overlay   1.8T 1004G  667G  61% /
tmpfs                                                                                                                                                    tmpfs      64M     0   64M   0% /dev
tmpfs                                                                                                                                                    tmpfs     252G     0  252G   0% /sys/fs/cgroup
/dev/mapper/vg00-root                                                                                                                                    ext4      439G   12G  405G   3% /opt/borgy
/dev/sdc2                      