In [1]:
"""
Let's use three popular compression algorithms:

- [bzip2](https://docs.python.org/3/library/bz2.html)
- [gzip](https://docs.python.org/3/library/gzip.html)
- [blosc2](https://github.com/Blosc/python-blosc2)

Blosc is a high performance compressor optimized for binary data (i.e. floating point numbers, integers and booleans, although it can handle string data too).

"""
import bz2, gzip, blosc2
import _pickle as cPickle
import pickle
import pandas

def plain_load(file):
    """Unpickle and return the object stored at ``file``.

    The path is opened in binary mode and passed to ``cPickle.load``.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file, "rb") as source:
        restored = cPickle.load(source)
    return restored

def pandas_load(file):
    """Parse the CSV at ``file`` with ``pandas.read_csv`` defaults and return the frame."""
    return pandas.read_csv(file)

def plain_dump(file, data):
    """Pickle ``data`` into ``file`` (opened in ``"wb"`` mode, so it is overwritten).

    Returns whatever ``cPickle.dump`` returns.
    """
    with open(file, "wb") as sink:
        outcome = cPickle.dump(data, sink)
    return outcome


def compress_pickle_b2z(file, data):
    """Pickle ``data`` and write it to ``file`` as a bzip2-compressed stream."""
    with bz2.open(file, "wb") as archive:
        cPickle.dump(data, archive)

def decompress_pickle_b2z(file):
    """Load a pickled object from the bzip2-compressed ``file``.

    Counterpart of ``compress_pickle_b2z``. Unpickling can execute arbitrary
    code, so only use this on trusted files.

    Returns:
        The object reconstructed by ``cPickle.load``.
    """
    # The context manager closes the file handle deterministically, including
    # when unpickling raises, instead of leaving it to garbage collection.
    with bz2.BZ2File(file, 'rb') as f:
        return cPickle.load(f)


def decompress_pickle_gzip(file):
    """Read ``file`` fully, gunzip the bytes in memory and unpickle the result.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file, 'rb') as source:
        packed = source.read()
    pickled = gzip.decompress(packed)
    return cPickle.loads(pickled)

def compress_pickle_gzip(file, data):
    """Pickle ``data`` in memory, gzip the bytes and write them to ``file``."""
    with open(file, 'wb') as sink:
        pickled = cPickle.dumps(data)
        packed = gzip.compress(pickled)
        sink.write(packed)


def decompress_pickle_blosc2(file):
    """Read ``file`` fully, decompress it with ``blosc2.decompress`` and unpickle the result.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file, 'rb') as source:
        packed = source.read()
    pickled = blosc2.decompress(packed)
    return cPickle.loads(pickled)

def compress_pickle_blosc2(file, data):
    """Pickle ``data`` in memory, compress it with ``blosc2.compress`` and write it to ``file``.

    As a side effect, the module attribute ``blosc2.MAX_BUFFERSIZE`` is set to 8 GiB.
    """
    # NOTE(review): this notebook's later run on a ~3GB pickle still fails with
    # "src cannot be larger than 2147483615 bytes", so this override does not
    # appear to lift the per-call size limit -- confirm against the blosc2 docs.
    blosc2.MAX_BUFFERSIZE = 8 * 1024 ** 3
    with open(file, 'wb') as sink:
        pickled = cPickle.dumps(data)
        packed = blosc2.compress(pickled)
        sink.write(packed)



In [2]:
"""
Let's compress using our three formats

"""

# Load the reference pickle once, then write the same object back out in each
# of the three compressed formats so they can be compared on load time and size.
df = plain_load('../data/customers-100000.pickle')

compress_pickle_b2z("../data/customers-100000.pbz2", df)
compress_pickle_gzip("../data/customers-100000.gzip", df)
compress_pickle_blosc2("../data/customers-100000.blosc2", df)




In [4]:
# Time a full load (read + decompress + unpickle) of the same data from each format.
%timeit decompress_pickle_gzip("../data/customers-100000.gzip")
%timeit decompress_pickle_b2z("../data/customers-100000.pbz2")
%timeit decompress_pickle_blosc2("../data/customers-100000.blosc2")


307 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.27 s ± 30.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
174 ms ± 8.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
"""
Any of the three files can be used to restore the pandas DataFrame.

"""
# Rebind `df` to the object restored from the gzip copy and preview it.
df = decompress_pickle_gzip("../data/customers-100000.gzip")
df.head()

In [None]:
"""
Let's compare file sizes and compression ratios

"""

import os

class PerformanceStats:
    """Size comparison between a pickle file and one compressed copy of it.

    Both files share a base path: ``<filename>.pickle`` is the uncompressed
    original and ``<filename>.<method>`` is the compressed copy.
    """

    def __init__(self, filename, method):
        # Extension of the compressed file, e.g. "gzip", "pbz2" or "blosc2".
        self.method = method
        # Base path without extension.
        self.filename = filename
        # Sizes in bytes and their quotient; placeholders until get_ratio() runs.
        self.original = 0.0
        self.compressed = 0.0
        self.ratio = 1

    def get_ratio(self):
        """Return the compressed size divided by the original pickle size.

        Smaller values mean stronger compression. The measured sizes and the
        ratio are also stored on ``self.compressed``, ``self.original`` and
        ``self.ratio``.

        Raises:
            OSError: if either file is missing or inaccessible.
            ZeroDivisionError: if the pickle file is empty; the stored
                attributes are left untouched in that case.
        """
        pickle_path = f"{self.filename}.pickle"
        file_path = f"{self.filename}.{self.method}"

        compressed_bytes = os.path.getsize(file_path)
        original_bytes = os.path.getsize(pickle_path)
        ratio = compressed_bytes / original_bytes

        # Only record the measurements once all of them were obtained.
        self.compressed = float(compressed_bytes)
        self.original = float(original_bytes)
        self.ratio = ratio
        return ratio
    

import matplotlib.pyplot as plt

def plot_performance_stats(df):
    """Plot mean compression ratio (bars) and mean execution time (line) per algorithm.

    ``df`` must provide the columns ``algorithm_name``, ``compression_ratio``
    and ``execution_time``. Rows are averaged per algorithm, then drawn on a
    single figure with two y-axes sharing the algorithm x-axis.
    """
    per_algorithm = df.pivot_table(
        index='algorithm_name',
        values=['compression_ratio', 'execution_time'],
        aggfunc='mean',
    )

    fig, ratio_axis = plt.subplots(figsize=(10, 5))

    # Left y-axis: compression ratio as blue bars.
    per_algorithm['compression_ratio'].plot.bar(ax=ratio_axis, color='b')
    ratio_axis.set_ylabel('Compression Ratio', color='b')
    ratio_axis.tick_params(axis='y', colors='b')

    # Right y-axis: execution time as a red line with point markers.
    time_axis = ratio_axis.twinx()
    per_algorithm['execution_time'].plot.line(ax=time_axis, color='r', marker='o')
    time_axis.set_ylabel('Execution Time (seconds)', color='r')
    time_axis.tick_params(axis='y', colors='r')

    ratio_axis.set_xlabel('Algorithm Name')
    ratio_axis.set_title('Compression Ratio and Execution Time by Algorithm')

    # Render the finished figure.
    plt.show()


# Load timings come from the %timeit results recorded earlier in this notebook
# (307 ms, 1.27 s, 174 ms). They are expressed in seconds here because
# plot_performance_stats labels that axis "Execution Time (seconds)".
data = {'compression_ratio': [
            PerformanceStats('../data/customers-100000', 'gzip').get_ratio(),
            PerformanceStats('../data/customers-100000', 'pbz2').get_ratio(),
            PerformanceStats('../data/customers-100000', 'blosc2').get_ratio()],
        'execution_time': [0.307, 1.27, 0.174],
        'algorithm_name': ['gzip', 'pbz2', 'blosc2']}

plot_performance_stats(pandas.DataFrame(data))




That was an interesting result. 

blosc2 had the best loading time, but a worse compression ratio.

pbz2 was twice as efficient for compression, but roughly 7x slower to load than blosc2 (1.27 s vs 174 ms).


Let's try with a much larger pickle ~3GB
https://www.kaggle.com/code/columbia2131/speed-up-reading-csv-to-pickle/input



In [None]:

# Load the larger training pickle and write bzip2 and gzip copies of it.
train = pandas.read_pickle('../data/ump-train-picklefile/train.pkl')

compress_pickle_b2z("../data/ump-train-picklefile/train.pbz2", train)
compress_pickle_gzip("../data/ump-train-picklefile/train.gzip", train)




In [2]:
# Reload the training frame; this repeats the read done in the earlier cell.
train = pandas.read_pickle('../data/ump-train-picklefile/train.pkl')


In [3]:

# NOTE(review): the recorded output of this cell is
# "ValueError: src cannot be larger than 2147483615 bytes" -- the pickled array
# is too large for a single blosc2.compress call, so no .blosc2 file is produced.
compress_pickle_blosc2("../data/ump-train-picklefile/train.blosc2", train.to_numpy())

ValueError: src cannot be larger than 2147483615 bytes