In [1]:
from tables import *
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import time
from memory_profiler import memory_usage

Use case: sum over rows

Pandas, no modification

In [8]:
def pandas_pure():
    """Baseline benchmark: read the whole TSV into one DataFrame and sum each row.

    Reads ``testdata/table.tsv`` (tab-separated, first column as index) with
    default ``read_csv`` settings, computes the per-row sum, and prints the
    wall-clock seconds spent on reading plus summing. Returns None.
    """
    t0 = time.time()
    frame = pd.read_csv("testdata/table.tsv", sep="\t", index_col=0)
    totals = frame.sum(axis=1)
    elapsed = time.time() - t0
    print("time: " + str(elapsed) + "s")
    # Release every local explicitly before returning to the memory profiler.
    del frame, totals, t0, elapsed
    
# Run the benchmark under memory_profiler and print how far the largest
# reading rose above the first reading.
mem = memory_usage(pandas_pure)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

  mask |= (ar1 == a)


time: 3.394268035888672s
Allo. RAM Peak: 1218.25390625 Mb


Pandas iterator

In [3]:
def pandas_iterate():
    """Chunked benchmark: read the TSV 1000 rows at a time and sum each row.

    ``read_csv`` is called with ``chunksize=1000``, so it yields DataFrame
    chunks instead of one large frame. Each chunk's row sums are collected and
    then flattened into a single plain list. Prints the wall-clock seconds
    spent on reading, summing and flattening. Returns None.
    """
    t0 = time.time()
    reader = pd.read_csv("testdata/table.tsv", sep="\t", index_col=0, chunksize=1000)
    totals = [chunk.sum(axis=1) for chunk in reader]
    # Rebind the same name so the per-chunk Series are dropped once flattened.
    totals = [value for series in totals for value in series]
    elapsed = time.time() - t0
    print("time: " + str(elapsed) + "s")
    # Release every local explicitly before returning to the memory profiler.
    del reader, totals, t0, elapsed
    
# Run the chunked benchmark under memory_profiler and print how far the
# largest reading rose above the first reading.
mem = memory_usage(pandas_iterate)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

time: 5.7114105224609375s
Allo. RAM Peak: 84.2578125 Mb


Pandas low memory mode

In [6]:
def pandas_lowmem():
    """Benchmark ``read_csv`` with ``low_memory=True``, then sum each row.

    Reads ``testdata/table.tsv`` (tab-separated, first column as index) into a
    single DataFrame, computes the per-row sum, and prints the wall-clock
    seconds spent on reading plus summing. Returns None.

    NOTE(review): ``low_memory=True`` appears to be ``read_csv``'s default, in
    which case this run is equivalent to ``pandas_pure`` -- confirm against the
    pandas documentation.
    """
    t0 = time.time()
    frame = pd.read_csv("testdata/table.tsv", sep="\t", index_col=0, low_memory=True)
    totals = frame.sum(axis=1)
    elapsed = time.time() - t0
    print("time: " + str(elapsed) + "s")
    # Release every local explicitly before returning to the memory profiler.
    del frame, totals, t0, elapsed
    
# Run the low_memory benchmark under memory_profiler and print how far the
# largest reading rose above the first reading.
mem = memory_usage(pandas_lowmem)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

  mask |= (ar1 == a)


time: 3.4681665897369385s
Allo. RAM Peak: 1217.84765625 Mb


Pandas memory map

In [7]:
def pandas_memmap():
    """Benchmark ``read_csv`` with ``memory_map=True``, then sum each row.

    Reads ``testdata/table.tsv`` (tab-separated, first column as index) into a
    single DataFrame with memory-mapped file access requested, computes the
    per-row sum, and prints the wall-clock seconds spent on reading plus
    summing. Returns None.
    """
    t0 = time.time()
    frame = pd.read_csv("testdata/table.tsv", sep="\t", index_col=0, memory_map=True)
    totals = frame.sum(axis=1)
    elapsed = time.time() - t0
    print("time: " + str(elapsed) + "s")
    # Release every local explicitly before returning to the memory profiler.
    del frame, totals, t0, elapsed
    
# Run the memory_map benchmark under memory_profiler and print how far the
# largest reading rose above the first reading.
mem = memory_usage(pandas_memmap)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

  mask |= (ar1 == a)


time: 3.2516770362854004s
Allo. RAM Peak: 1218.00390625 Mb


Parquet

In [4]:
def parquet_test():
    """Parquet benchmark: load the table with pyarrow and sum each row in pandas.

    Reads ``testdata/table.parquet`` via ``pq.read_table``, converts the result
    to a pandas object with ``to_pandas()`` and computes the per-row sum.
    Prints the wall-clock seconds spent on reading, converting and summing.
    Returns None.
    """
    t0 = time.time()
    table = pq.read_table("testdata/table.parquet")
    # Keep the conversion chained so the intermediate pandas object stays a
    # temporary and is not held by a local name.
    totals = table.to_pandas().sum(axis=1)
    elapsed = time.time() - t0
    print("time: " + str(elapsed) + "s")
    # Release every local explicitly before returning to the memory profiler.
    del table, totals, t0, elapsed
    
# Run the parquet benchmark under memory_profiler and print how far the
# largest reading rose above the first reading.
mem = memory_usage(parquet_test)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

time: 0.4632129669189453s
Allo. RAM Peak: 1310.91796875 Mb


pytables

In [5]:
def pytables_test():
    """PyTables benchmark: stream rows from the HDF5 file and sum each one.

    Opens ``testdata/table.hdf5`` read-only, iterates over the node at
    ``/testing/table`` row by row, and applies the builtin ``sum`` to the
    value at position 1 of every row (presumably an array-valued column --
    confirm against the file's schema). Prints the wall-clock seconds spent
    opening the file and iterating. Returns None.
    """
    start = time.time()
    # The context manager closes the HDF5 handle even if iteration raises.
    # Previously the file was never closed: it was only flushed (pointless for
    # a read-only handle) and its name deleted.
    with open_file("testdata/table.hdf5", mode="r") as data:
        row_sum = [sum(x[1]) for x in data.root.testing.table]
        # Stop the clock before the file is closed, as the original stopped it
        # before its post-processing call.
        end = time.time()
    print("time: "+str(end-start)+"s")
    del row_sum
    del data
    del start
    del end
    
# Run the PyTables benchmark under memory_profiler and print how far the
# largest reading rose above the first reading.
mem = memory_usage(pytables_test)
print("Allo. RAM Peak: ", max(mem) - mem[0], " Mb", sep="")

time: 8.021773099899292s
Allo. RAM Peak: 45.21484375 Mb
