In [3]:
import pathlib
import tarfile
import pandas as pd
import os

from tqdm.notebook import tqdm
from time import time
from shutil import rmtree
from queue import Queue
from zipfile import ZipFile
from pyunpack import Archive
from collections import defaultdict

paths = {
    'c04': pathlib.Path('/datalake/ufmg/c04'),
    'tmp-c04': pathlib.Path('/datalake/ufmg/c04/tmp'),
}

In [2]:
# List of files from file: 
# https://docs.google.com/spreadsheets/d/1-YzTYRMJwRM9V6pTPB44CYHkxxsmxyVmnqQPvSmFC6A/edit?usp=sharing
compressed_files = [
    '35-consulta-a-pregoes.zip',
    '48-licitacoes-bh.zip',
    '51-licitacoes-uberlandia.zip',
    '69-licitacoes-uberaba-2012-2020.zip',
    '72-licitacoes-governador_valadares.zip',
    '75-licitacoes-ipatinga.zip',
    '77-diario-oficial-possos-de-caldas.zip',
    '81-licitacoes-varginha.zip',
    '83-licitacoes-monte_alegre.zip',
    '87-licitacoes-itabirito.zip',
    '288_licitacoes-pirapetinga.tar.xz',
    '289-licitacoes-coqueiral.zip',
    '290-licitacoes-cristais.zip',
    '302-licitacoes-olaria.7z',
    '304-licitacoes-passa-vinte.tar.gz',
    '306-licitacoes-arantina.7z',
    '351-licitacoes-itamarati.zip',
    '353-licitacoes-ijaci.7z',
    '381-licitacoes-sao-bento-abade.zip'
]

# List of components to extract during the analysis
files_info = [
    'Tamanho comprimido',
    'Tamanho descomprimido',
    'Subpastas',
    'Arquivos',
    'Tamanho médio arquivos',
    'pdf',
    'doc',
    'xls',
    'csv',
    'txt',
    'Outros'
]

In [3]:
def decompress_file(tmp_path, full_filepath, filename, extension):
    '''
    Decompress a file in a specified directory.
    '''
    
    if extension == '.zip':
        file_obj = ZipFile(full_filepath, 'r')
    elif tarfile.is_tarfile(full_filepath):
        file_obj = tarfile.open(full_filepath, 'r')
    elif extension == '.7z':
        file_obj = Archive(full_filepath)
    elif not extension:
        print('Nothing to do, file uncompressed')
        return None
    
    try:
        file_obj.extractall(tmp_path)
    except(FileNotFoundError):
        pass

In [4]:
def get_subdirs_info(path):
    '''
    Walk through every subdirectory of the path provided
    in BFS fashion, retrieving all useful information and
    storing it in dictionaries.
    
    Returns:
        subdirs_info (dict): count of subdirs, files and average file size
        file_types_count (dict): count of unique file extensions
    '''
    
    subdirs_info = defaultdict(int)
    
    # Defining suffixes of interest
    suffixes = ['.pdf', '.doc', '.xls', '.csv', '.txt']
    file_types_count = defaultdict(int)
    
    q = Queue()
    q.put(path)
    
    while not q.empty():
        curpath = q.get()
        
        for p in os.scandir(curpath):
            if os.path.isdir(p):
                subdirs_info['subdirs_count'] += 1
                q.put(p)
            else:
                subdirs_info['file_count'] += 1
                subdirs_info['average_file_size'] += os.path.getsize(p)
                
                # Adding file suffix to known extensions
                suffix = os.path.splitext(p)[1]
                
                # Treating the redundant 'pdf' extensions
                if suffix.startswith('.pdf'):
                    file_types_count['.pdf'] += 1
                else:
                    file_types_count[suffix] += 1
    
    # Getting the average file size
    subdirs_info['average_file_size'] /= subdirs_info['file_count']
    
    return subdirs_info, file_types_count

In [5]:
def get_dir_size(start_path = '.'):
    """
    Walks through the specified directory and
    calculates the total size.
    
    Returns:
        total_size (int): size of the directory in bytes
    """
    
    total_size = 0
    
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # Skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

In [7]:
df_info = {}
df_files_count = {}

for file in tqdm(compressed_files):
    file_info = {}
    
    # Defining a path to the compressed file
    path = os.path.join(paths['c04'], file)
    # Defining a temporary path to extract the file
    tmp_path = os.path.join(paths['c04'], 'tmp'+str(time())[:8])
    os.mkdir(tmp_path)
    
    # Decompressing
    extension = pathlib.Path(path).suffix
    decompress_file(tmp_path, path, file, extension)
    
    # Saving all the info
    file_info['size'] = os.path.getsize(path)/1000
    file_info['decompressed_size'] = get_dir_size(tmp_path)/1000
    
    subdirs_info, file_types_count = get_subdirs_info(tmp_path)
    # Adding new found extensions
    df_files_count.update(file_types_count)
    
    file_info['subdirs_count'] = subdirs_info['subdirs_count']
    file_info['file_count'] = subdirs_info['file_count']
    file_info['average_file_size'] = subdirs_info['average_file_size']/1000
    
    file_info.update(file_types_count)
    df_info[file] = file_info
    
    # Deleting the uncompressed file after we're done
    rmtree(tmp_path)

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))




In [15]:
# Naming unkown extensions
df_files_count['others'] = df_files_count.pop('')

df_ext = pd.DataFrame.from_dict(df_files_count, columns=['count'], orient='index')
df_ext

Unnamed: 0,count
.pdf,232
.html,1
.zip,23
.txt,4032
.docx,1
.xls,10
.xlsx,2
.doc,1
.PDF,10
.info,657


In [20]:
df_files = pd.DataFrame.from_dict(df_info, orient='index')

df_files.fillna(0, inplace=True)
df_files['others'] = df_files.pop('')

# Changing size units to megabytes
df_files['size'] /= 1000
df_files['decompressed_size'] /= 1000
df_files['average_file_size'] /= 1000

df_files

Unnamed: 0,size,decompressed_size,subdirs_count,file_count,average_file_size,.pdf,.html,.zip,.txt,.docx,...,.XLSX,.rar,.rtf,.jpg,.jpeg,.out,.err,.jsonl,.csv,others
35-consulta-a-pregoes.zip,10812.060354,22470.01364,48419,208108,0.107973,111474.0,96634.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48-licitacoes-bh.zip,36107.350035,37743.23567,4511,19216,1.964157,9312.0,4360.0,1016.0,4032.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51-licitacoes-uberlandia.zip,3352.805911,3942.61262,1317,4024,0.979775,2340.0,0.0,102.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,655.0
69-licitacoes-uberaba-2012-2020.zip,16629.356453,18804.223878,11,17916,1.049577,16558.0,547.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72-licitacoes-governador_valadares.zip,16795.913684,17519.536848,2085,6162,2.843158,38.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1057.0
75-licitacoes-ipatinga.zip,8085.320633,8917.738552,5283,11070,0.805577,6.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2643.0
77-diario-oficial-possos-de-caldas.zip,28.729664,145.291082,3883,16062,0.009046,0.0,16059.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
81-licitacoes-varginha.zip,1260.077081,12.4026,23,40,0.310065,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
83-licitacoes-monte_alegre.zip,784.601131,871.237828,1317,3503,0.248712,2185.0,0.0,0.0,0.0,1.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,657.0
87-licitacoes-itabirito.zip,1312.3616,1441.131087,578,2169,0.664422,1601.0,0.0,1.0,0.0,10.0,...,0.0,1.0,59.0,2.0,1.0,0.0,0.0,0.0,0.0,56.0


In [21]:
df_ext.to_excel('files_ext_count.xlsx')
df_files.to_excel('files_info.xlsx')