In [7]:
"""
Modern applications usually involve large chunks of data, which is especially true for machine learning models.

https://github.com/datablist/sample-csv-files

"""
import bz2
import _pickle as cPickle
import pickle
import pandas

def plain_load(file):
    """Read one pickled object from ``file`` and return it.

    Args:
        file: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object produced by unpickling the file's contents.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    with open(file, mode="rb") as pickle_stream:
        payload = cPickle.load(pickle_stream)
    return payload

def pandas_load(file):
    """Parse the CSV at ``file`` with pandas' default settings.

    Args:
        file: Path (or anything ``pandas.read_csv`` accepts) of the CSV to read.

    Returns:
        Whatever ``pandas.read_csv(file)`` returns (a DataFrame).
    """
    return pandas.read_csv(file)

def plain_dump(file, data):
    """Pickle ``data`` into ``file``, overwriting any existing content.

    Args:
        file: Destination path; it is opened in binary write mode.
        data: The object to serialize.

    Returns:
        The return value of ``cPickle.dump``, i.e. ``None``.
    """
    with open(file, mode="wb") as sink:
        outcome = cPickle.dump(data, sink)
    return outcome

def compress_pickle_b2z(file, data):
    """Pickle ``data`` into a bz2-compressed file at ``file``.

    Args:
        file: Destination path of the compressed pickle.
        data: The object to serialize.

    Returns:
        None.
    """
    with bz2.BZ2File(file, mode='w') as compressed_sink:
        # The pickle stream is written through the BZ2File, which compresses it.
        cPickle.dump(data, compressed_sink)

def decompress_pickle_b2z(file):
    """Load one pickled object from the bz2-compressed file at ``file``.

    Args:
        file: Path of a bz2-compressed pickle, such as one written by
            ``compress_pickle_b2z``.

    Returns:
        The object produced by unpickling the decompressed stream.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    # Use a context manager so the compressed file handle is closed as soon as
    # the object has been read, instead of being left open until garbage collection.
    with bz2.BZ2File(file, 'rb') as compressed_source:
        return cPickle.load(compressed_source)

In [2]:
"""
But here we'll be picking a relatively small file of about 16MB

https://media.githubusercontent.com/media/datablist/sample-csv-files/main/files/customers/customers-100000.csv

The content doesn't really matter for the purpose of this tutorial
"""

import requests

url = 'https://media.githubusercontent.com/media/datablist/sample-csv-files/main/files/customers/customers-100000.csv'

# A timeout keeps the cell from hanging indefinitely if the server stops responding.
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error status instead of saving an error page as the CSV.
response.raise_for_status()

# Write the raw response bytes to the path the later cells read from.
with open('../data/customers-100000.csv', 'wb') as file:
    file.write(response.content)





In [6]:
"""
Using regular pickle by itself already gives a somewhat smaller file, even though no compression is applied

In the case of this particular CSV file, the pickle file was ~8.5% smaller than the original

Loading times also improved, from ~589ms (CSV) to ~157ms (pickle)
"""

# Baseline: time parsing the raw CSV with pandas.
%timeit pandas_load('../data/customers-100000.csv')

# Parse the CSV once more and pickle the resulting object to disk.
plain_dump('../data/customers-100000.pickle', pandas_load('../data/customers-100000.csv'))

# NOTE: the later bz2 cell also uses `os`, so it depends on this cell having run first.
import os

# On-disk sizes in bytes.
original_size = os.path.getsize("../data/customers-100000.csv")

pickle_size = os.path.getsize("../data/customers-100000.pickle")

# Fraction of the CSV size saved by the pickle file (positive means the pickle is smaller).
compression_ratio = 1 - pickle_size/original_size

print(f'original_size = {original_size} - pickle_size = {pickle_size} - compression_ratio = { compression_ratio:.2%}')

# Time loading the pickle, for comparison with the CSV timing above.
%timeit plain_load('../data/customers-100000.pickle')




589 ms ± 28.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
original_size = 17317579 - pickle_size = 15833716 - compression_ratio = 8.57%
157 ms ± 2.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
"""
The dataframe can be loaded from the pickle, and it behaves just the same.

In fact, that's the whole point of serialization. :)

"""

# Read back the pickle file written by the benchmark cell (that cell must have run first).
df_from_pickle = plain_load('../data/customers-100000.pickle')
# Last expression of the cell: shows the first rows of the reloaded dataframe.
df_from_pickle.head()

'\nThe dataframe can be loaded from the pickle, and behave just the same.\n\n\n'

In [9]:
"""
We can get even better compression, at the expense of loading time, which went from ~589ms for the raw CSV (~157ms for the plain pickle) to ~1.1s (almost double the CSV load time)

In this example, bz2 compression made the file ~73% smaller than the original (compared to ~8.5% with pickle only, without any compression)

There is a tradeoff between file size and load times, and in the next notebook, we'll see that bz2 is not our only option
"""

# Start from the uncompressed pickle written by the earlier benchmark cell.
df = plain_load('../data/customers-100000.pickle')

# Write the same object again, this time as a bz2-compressed pickle.
compress_pickle_b2z("../data/customers-100000.pbz2", df)

# Time decompressing and unpickling the compressed file.
%timeit decompress_pickle_b2z("../data/customers-100000.pbz2")

# On-disk sizes in bytes; `os` is imported in the earlier benchmark cell, not in this one.
original_size = os.path.getsize("../data/customers-100000.csv")

pickle_size = os.path.getsize("../data/customers-100000.pickle")

compressed_b2z_pickle_size = os.path.getsize("../data/customers-100000.pbz2")

# Each ratio is the fraction of the CSV size saved by that file.
pickle_compression_ratio = 1 - pickle_size/original_size

compressed_pickle_compression_ratio = 1 - compressed_b2z_pickle_size/original_size

print(f'original_size = {original_size} - pickle_size = {pickle_size} - pickle_compression_ratio = { pickle_compression_ratio:.2%}')

print(f'original_size = {original_size} - compressed_b2z_pickle_size = {compressed_b2z_pickle_size} - compressed_pickle_compression_ratio = { compressed_pickle_compression_ratio:.2%}')


1.1 s ± 86.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
original_size = 17317579 - pickle_size = 15833716 - pickle_compression_ratio = 8.57%
original_size = 17317579 - compressed_b2z_pickle_size = 4672996 - compressed_pickle_compression_ratio = 73.02%
