In [1]:
import gc
import os
import pathlib
import tarfile
import tempfile
import time

from collections import defaultdict
from queue import Queue
from shutil import rmtree
#from time import time
from zipfile import ZipFile

import pandas as pd
from pyunpack import Archive
from tqdm.notebook import tqdm

# Directories used by this notebook, keyed by a short label. Both labels
# resolve to the same characterization data directory.
paths = dict.fromkeys(
    ('c04', 'tmp-c04'),
    pathlib.Path('../data/characterization/'),
)

In [2]:
# Archives to characterize. The names come from this spreadsheet:
# https://docs.google.com/spreadsheets/d/1-YzTYRMJwRM9V6pTPB44CYHkxxsmxyVmnqQPvSmFC6A/edit?usp=sharing
compressed_files = [
    "48-licitacoes-bh.zip",
    "51-licitacoes-uberlandia.zip",
    "69-licitacoes-uberaba-2012-2020.zip",
    "72-licitacoes-governador_valadares.zip",
    "75-licitacoes-ipatinga.zip",
    "77-diario-oficial-possos-de-caldas.zip",
    "81-licitacoes-varginha.zip",
    "83-licitacoes-monte_alegre.zip",
    "87-licitacoes-itabirito.zip",
    "288_licitacoes-pirapetinga.tar.xz",
    "289-licitacoes-coqueiral.zip",
    "290-licitacoes-cristais.zip",
    "302-licitacoes-olaria.7z",
    "304-licitacoes-passa-vinte.tar.gz",
    "306-licitacoes-arantina.7z",
    "351-licitacoes-itamarati.zip",
    "353-licitacoes-ijaci.7z",
    "381-licitacoes-sao-bento-abade.zip",
]

# Labels (in Portuguese) of the components extracted during the analysis:
# compressed size, decompressed size, subfolders, files, average file size,
# one entry per extension of interest, and "Outros" (others).
files_info = [
    "Tamanho comprimido",
    "Tamanho descomprimido",
    "Subpastas",
    "Arquivos",
    "Tamanho médio arquivos",
    "pdf",
    "doc",
    "xls",
    "csv",
    "txt",
    "Outros",
]

In [3]:
def decompress_file(tmp_path, full_filepath, filename, extension):
    '''
    Decompress an archive into a specified directory.

    The format is chosen in this order: zip (by extension), anything the
    ``tarfile`` module recognises (by inspecting the file itself, so
    names such as ``x.tar.gz`` or ``x.tar.xz`` work even though their last
    suffix is not ``.tar``), then 7z (by extension, through pyunpack).

    Args:
        tmp_path: directory the archive contents are extracted into.
        full_filepath: path of the archive on disk.
        filename: archive name, used only in the messages printed here.
        extension: last suffix of the archive name (e.g. '.zip'); compared
            case-insensitively. An empty value means "not compressed".

    Returns:
        None.

    Raises:
        ValueError: if the file is none of the supported archive formats.
    '''

    def _extract_into_tmp(archive):
        # Extraction stays best-effort: a FileNotFoundError raised while
        # writing members does not abort the run, but it is reported
        # instead of being dropped silently.
        try:
            archive.extractall(tmp_path)
        except FileNotFoundError as err:
            print('Incomplete extraction of {}: {}'.format(filename, err))

    suffix = (extension or '').lower()

    if suffix == '.zip':
        # Context managers make sure the archive handle is closed.
        with ZipFile(full_filepath, 'r') as archive:
            _extract_into_tmp(archive)
    elif tarfile.is_tarfile(full_filepath):
        with tarfile.open(full_filepath, 'r') as archive:
            _extract_into_tmp(archive)
    elif suffix == '.7z':
        _extract_into_tmp(Archive(full_filepath))
    elif not suffix:
        print('Nothing to do, file uncompressed')
    else:
        # Previously this case failed later with an UnboundLocalError.
        raise ValueError(
            'Unsupported archive format {!r} for file {}'.format(extension, filename)
        )

    return None

In [4]:
def get_subdirs_info(path):
    '''
    Walk through every subdirectory of the path provided, counting
    directories and files, averaging the file sizes and counting how
    many files carry each extension.

    Extensions are lower-cased. Any extension that starts with '.pdf'
    is counted as '.pdf', and files without an extension are counted
    under the empty string ''.

    Returns:
        subdirs_info (dict): 'subdirs_count', 'file_count' and
            'average_file_size' (in bytes; 0.0 when there are no files)
        file_types_count (dict): number of files per extension
    '''

    subdirs_info = defaultdict(int)
    file_types_count = defaultdict(int)
    total_size = 0

    # `dirpath` is kept distinct from the `path` argument so the argument
    # is not overwritten while walking.
    for dirpath, subdirs, files in os.walk(path):
        subdirs_info['subdirs_count'] += len(subdirs)
        for name in files:
            subdirs_info['file_count'] += 1
            total_size += os.path.getsize(os.path.join(dirpath, name))

            suffix = os.path.splitext(name)[-1].lower()
            # Treating the redundant 'pdf' extensions
            if suffix.startswith('.pdf'):
                suffix = '.pdf'
            file_types_count[suffix] += 1

    # Guard against a tree without files, which used to raise
    # ZeroDivisionError.
    file_count = subdirs_info['file_count']
    subdirs_info['average_file_size'] = total_size / file_count if file_count else 0.0

    return subdirs_info, file_types_count

In [5]:
def get_dir_size(start_path = '.'):
    """
    Add up the sizes of all files found below a directory.

    Symbolic links are left out of the total.

    Returns:
        total_size (int): size of the directory in bytes
    """

    def _non_link_files():
        # Yield the full path of every file that is not a symbolic link.
        for root, _subdirs, names in os.walk(start_path):
            for name in names:
                candidate = os.path.join(root, name)
                if os.path.islink(candidate):
                    continue
                yield candidate

    return sum(os.path.getsize(candidate) for candidate in _non_link_files())

In [6]:
# Accumulators filled by the processing cell: one for per-archive
# statistics, one for file counts per extension.
df_info, df_files_count = {}, {}

In [7]:
# Start from empty accumulators so that re-running this cell does not add
# counts on top of a previous run.
df_info.clear()
df_files_count.clear()

for file in tqdm(compressed_files):
    file_info = {}

    # Defining a path to the compressed file
    path = os.path.join(paths['c04'], file)

    # Extract into a uniquely named temporary directory that is removed when
    # the block exits, even if an error occurs. The previous name was built
    # from a truncated timestamp, which repeats for about 100 seconds and
    # collides with leftovers of an interrupted run.
    with tempfile.TemporaryDirectory(prefix='tmp', dir=paths['c04']) as tmp_path:
        # Decompressing
        extension = pathlib.Path(path).suffix
        decompress_file(tmp_path, path, file, extension)
        # NOTE(review): fixed 5 s pause kept from the original; its purpose
        # is not evident from this notebook - confirm whether it is needed.
        time.sleep(5)

        # Saving all the info (sizes in kilobytes at this point)
        file_info['size'] = os.path.getsize(path)/1000
        file_info['decompressed_size'] = get_dir_size(tmp_path)/1000

        subdirs_info, file_types_count = get_subdirs_info(tmp_path)

    # Summing the per-extension counts over all archives. A plain
    # dict.update() would keep only the count of the last archive that
    # contained each extension.
    for ext, count in file_types_count.items():
        df_files_count[ext] = df_files_count.get(ext, 0) + count

    file_info['subdirs_count'] = subdirs_info['subdirs_count']
    file_info['file_count'] = subdirs_info['file_count']
    file_info['average_file_size'] = subdirs_info['average_file_size']/1000

    file_info.update(file_types_count)
    df_info[file] = file_info

    gc.collect()

# Naming unknown extensions (files without any extension are keyed by '')
if '' in df_files_count:
    df_files_count['others'] = df_files_count.pop('')

df_ext = pd.DataFrame.from_dict(df_files_count, columns=['count'], orient='index')

# Extensions absent from an archive show up as NaN; they mean "zero files".
df_files = pd.DataFrame.from_dict(df_info, orient='index').fillna(0)

# Move the extension-less column to the end under a readable name; create
# it when no archive had such files so the table layout stays stable.
if '' in df_files.columns:
    df_files['others'] = df_files.pop('')
else:
    df_files['others'] = 0.0

# Changing size units from kilobytes to megabytes
df_files['size'] /= 1000
df_files['decompressed_size'] /= 1000
df_files['average_file_size'] /= 1000

df_files

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




Unnamed: 0,size,decompressed_size,subdirs_count,file_count,average_file_size,.pdf,.html,.txt,.doc,.xlsx,...,.º 002 - 2018 - muro de arrimo - cemei,.º 002-2018 - fechamento no cemei professora adria,. - gás liquefeito de petróleo (diversas unidades),.q,.jpeg,.jsonl,.csv,.out,.err,others
48-licitacoes-bh.zip,36107.350035,37744.169178,4511,19217,1.964103,9363.0,4360.0,4032.0,84.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51-licitacoes-uberlandia.zip,3352.805911,3942.61262,1317,4024,0.979775,2340.0,0.0,0.0,257.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,655.0
69-licitacoes-uberaba-2012-2020.zip,16629.356453,18804.223878,11,17916,1.049577,16592.0,547.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72-licitacoes-governador_valadares.zip,16795.913684,17557.095331,2085,6171,2.845097,3823.0,0.0,0.0,39.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1057.0
75-licitacoes-ipatinga.zip,8085.320633,8917.834666,5283,11074,0.805295,5392.0,0.0,0.0,90.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2643.0
77-diario-oficial-possos-de-caldas.zip,28.729664,145.291082,3883,16062,0.009046,0.0,16059.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
81-licitacoes-varginha.zip,1260.077081,1487.17389,1253,3009,0.494242,1431.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,785.0
83-licitacoes-monte_alegre.zip,784.601131,871.237828,1317,3503,0.248712,2185.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,657.0
87-licitacoes-itabirito.zip,1312.3616,1443.54302,578,2170,0.665227,1611.0,0.0,0.0,119.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,56.0
288_licitacoes-pirapetinga.tar.xz,437.031884,632.098335,8,3413,0.185203,0.0,282.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2016.0,1.0,1.0,1108.0


In [8]:
# Write both summary tables to CSV files (paths are relative to the
# current working directory).
for frame, target in (
    (df_ext, 'files_ext_count_v2.csv'),
    (df_files, 'files_info_v2.csv'),
):
    frame.to_csv(target)