In [37]:
import glob
from os.path import join, exists, getsize
from isatools import isatab
from sys import getsizeof
import logging
from collections import deque
import pandas as pd
import numpy as np

def total_size(o, handlers={}, verbose=False):
    """ Returns the approximate memory footprint an object and all of its contents.

    Automatically finds the contents of the following builtin containers and
    their subclasses:  tuple, list, deque, dict, set and frozenset.
    To search other containers, add handlers to iterate over their contents:

        handlers = {SomeContainerClass: iter,
                    OtherContainerClass: OtherContainerClass.get_elements}

    """
    logger = logging.getLogger(__name__)
    all_handlers = {
        tuple: iter,
        list: iter,
        deque: iter,
        dict: lambda d: chain.from_iterable(d.items()),
        set: iter,
        frozenset: iter,
    }
    all_handlers.update(handlers)
    seen = set()
    default_size = getsizeof(0)

    def sizeof(o):
        if id(o) in seen:       # do not double count the same object
            return 0
        seen.add(id(o))
        s = getsizeof(o, default_size)

        if verbose:
            logger.info(s, type(o), repr(o), file=stderr)

        for typ, handler in all_handlers.items():
            if isinstance(o, typ):
                s += sum(map(sizeof, handler(o)))
                break
        return s

    return sizeof(o)

In [29]:
isa_synthetic_dir = "/Users/davidjohnson/Development/isa_memory_study3/data/synthetic/"
isa_synthetic_list = [isatab.load(i) for i in glob.iglob(isa_synthetic_dir + "*")]

In [32]:
print("fname,disk_size,df_size,isa_size,study_id")
for isa_synthetic in isa_synthetic_list:
    study_id = isa_synthetic.identifier
    for study in isa_synthetic.studies:
        fname = study.filename
        study_file_path = join(isa_synthetic_dir, isa_synthetic.identifier, study.filename)
        disk_size = getsize(study_file_path)
        isa_size = total_size(study.process_sequence)
        df_size = total_size(pd.read_csv(study_file_path))
        print("{},{},{},{},{}".format(fname, disk_size, df_size, isa_size, study_id))
        for assay in study.assays:
            fname = assay.filename
            assay_file_path = join(isa_synthetic_dir, isa_synthetic.identifier, assay.filename)
            disk_size = getsize(assay_file_path)
            isa_size = total_size(assay.process_sequence)
            df_size = total_size(pd.read_csv(assay_file_path))
            print("{},{},{},{},{}".format(fname, disk_size, df_size, isa_size, study_id))

fname,disk_size,df_size,isa_size,study_id
study.txt,8976,11075,2712,synthetic-isa-1sample-6assays
a_blood_ms_FIA_positive_assay.txt,14646,16704,5312,synthetic-isa-1sample-6assays
a_blood_ms_FIA_negative_assay.txt,14486,16544,5312,synthetic-isa-1sample-6assays
a_blood_ms_LC_positive_assay.txt,18561,21974,5312,synthetic-isa-1sample-6assays
a_blood_ms_LC_negative_assay.txt,18561,21974,5312,synthetic-isa-1sample-6assays
a_blood_ms_GC_positive_assay.txt,18561,21974,5312,synthetic-isa-1sample-6assays
a_blood_ms_GC_negative_assay.txt,18561,21974,5312,synthetic-isa-1sample-6assays
study.txt,26597,33176,7912,synthetic-isa-3samples-1assay
a_sweat_ms_GC_negative_assay.txt,18841,22254,5312,synthetic-isa-3samples-1assay
a_blood_ms_GC_negative_assay.txt,18801,22214,5312,synthetic-isa-3samples-1assay
a_tears_ms_GC_negative_assay.txt,18841,22254,5312,synthetic-isa-3samples-1assay
study.txt,11974,13675,2712,synthetic-isa-1sample-1assay
a_blood_ms_GC_negative_assay.txt,21770,24814,5312,synthetic-isa-1sa

In [49]:
df = pd.read_csv("synthetic-memory_footprint.csv")
# put the records by category to help with downstream processing
disk_sizes = []
obj_sizes = []
categories = []
study_ids = []

for _, row in df.iterrows():
    disk_sizes.append(row.disk_size)
    obj_sizes.append(row.df_size)
    study_ids.append(row.study_id)
    categories.append('DataFrame')
    disk_sizes.append(row.disk_size)
    obj_sizes.append(row.isa_size)
    study_ids.append(row.study_id)
    categories.append('ISA')
# What is this?? Messes up loop
#    if '_s' in row.study_id:
#        categories.append('CisaFrame')
#    elif '_a' in row.study_id:
#        categories.append('CisaFrame') 
     
df_by_cat = pd.DataFrame({
    'disk_size': disk_sizes,
    'size': obj_sizes,
    'log_size': np.log(obj_sizes),
    'category': categories,
    'study_id' : study_ids
})

df_sorted = df_by_cat.sort_values(by='category')
df_sorted.to_csv("synthetic-memory_by_category.csv", index=None)
df_sorted

Unnamed: 0,disk_size,size,log_size,category,study_id
0,8976,11075,9.312446,DataFrame,synthetic-isa-1sample-6assays
34,18561,21974,9.997615,DataFrame,synthetic-isa-3samples-6assays
36,18561,21974,9.997615,DataFrame,synthetic-isa-3samples-6assays
38,18561,21974,9.997615,DataFrame,synthetic-isa-3samples-6assays
40,14886,16944,9.737669,DataFrame,synthetic-isa-3samples-6assays
84,18841,22254,10.010277,DataFrame,synthetic-isa-2samples-1assay
44,14726,16784,9.728181,DataFrame,synthetic-isa-3samples-6assays
46,14566,16624,9.718603,DataFrame,synthetic-isa-3samples-6assays
48,14726,16784,9.728181,DataFrame,synthetic-isa-3samples-6assays
50,14726,16784,9.728181,DataFrame,synthetic-isa-3samples-6assays
