In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (here the local `store_data` package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
from store_data import Base
from store_data import conf
from store_data import Util

import shutil

import polars as pl

# Scratch directory for the benchmark files; it is wiped on every run so both
# stores start from an empty state.
folder = conf.f_test_data

# `shutil.rmtree` raises FileNotFoundError when the directory is missing (e.g.
# on the very first run), so only remove it when it is actually there.
if folder.exists():
    shutil.rmtree(folder)

# After the cleanup the directory never exists, so create it unconditionally.
# `parents=True` also covers a missing parent directory.
folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Column layout of the table as seen by callers: (name, polars dtype) pairs.
schema = [("eliotCode", pl.Utf8), ("date", pl.Datetime), ("volume", pl.Float64)]
# Columns handed to `Base` as the ones to encode, with the dtype requested for each.
schema_encode = [("eliotCode", pl.UInt32), ("date", pl.UInt16)]


class DataStockHistoEncoded(Base):
    """Parquet-backed store that asks `Base` to encode `eliotCode` and `date`.

    Encoding, decoding and path handling (`encode`, `decode_filter`,
    `fpath_data`, `schema`) are inherited from `Base`; this class only wires
    them to a single parquet file.
    """

    # Snapshots taken when the class is defined. `schema` and `schema_encode`
    # are rebound by a later cell, so reading the globals inside `__init__`
    # would give a differently configured store depending on cell order.
    _TABLE_SCHEMA = list(schema)
    _ENCODED_SCHEMA = list(schema_encode)

    def __init__(self):
        """Bind the store to the `encoded` entry under the test-data folder."""
        super().__init__(folder / "encoded", self._TABLE_SCHEMA, self._ENCODED_SCHEMA)

    def read(self, eliot_codes=None, dates=None):
        """Load the stored rows, optionally restricted by code and/or date.

        Args:
            eliot_codes: values to keep in the `eliotCode` column, or None.
            dates: values to keep in the `date` column, or None.
                Both are passed unchanged to `Base.decode_filter`; None
                presumably means "no restriction" -- confirm in `Base`.

        Returns:
            The frame returned by `decode_filter`, or an empty frame built
            from `self.schema` when nothing has been saved yet.
        """
        fpath = self.fpath_data

        # Nothing saved yet: hand back an empty frame instead of failing.
        if not fpath.is_file():
            return Util.empty_df(self.schema)

        df = pl.read_parquet(fpath)

        # Decoding and filtering are delegated to `Base` in a single call.
        dic_col_values = {"eliotCode": eliot_codes, "date": dates}
        return self.decode_filter(df, dic_col_values)

    def save(self, df):
        """Encode `df` with `Base.encode` and write it to the parquet file."""
        df_to_save = self.encode(df)
        df_to_save.write_parquet(self.fpath_data)


d_encoded = DataStockHistoEncoded()

In [None]:
# Column layout of the table as seen by callers: (name, polars dtype) pairs.
schema = [("eliotCode", pl.Utf8), ("date", pl.Datetime), ("volume", pl.Float64)]
# Empty list: this store asks `Base` to encode no column at all.
schema_encode = []


class DataStockHisto(Base):
    """Parquet-backed store that keeps every column un-encoded (baseline).

    Path handling and the `encode`/`decode` hooks are inherited from `Base`;
    filtering is done here with plain polars expressions.
    """

    # Snapshots taken when the class is defined. `schema` and `schema_encode`
    # are module-level names shared with another cell, so reading the globals
    # inside `__init__` would make the configuration depend on cell order.
    _TABLE_SCHEMA = list(schema)
    _ENCODED_SCHEMA = list(schema_encode)

    def __init__(self):
        """Bind the store to the `not-encoded` entry under the test-data folder."""
        super().__init__(folder / "not-encoded", self._TABLE_SCHEMA, self._ENCODED_SCHEMA)

    def read(self, eliot_codes=None, dates=None):
        """Load the stored rows, optionally restricted by code and/or date.

        Args:
            eliot_codes: values to keep in the `eliotCode` column; None keeps all.
            dates: values to keep in the `date` column; None keeps all.

        Returns:
            The decoded, filtered frame, or an empty frame built from
            `self.schema` when nothing has been saved yet.
        """
        fpath = self.fpath_data

        # Nothing saved yet: hand back an empty frame instead of failing.
        if not fpath.is_file():
            return Util.empty_df(self.schema)

        df = self.decode(pl.read_parquet(fpath))

        # Each filter is applied only when the caller supplied values for it.
        if eliot_codes is not None:
            df = df.filter(pl.col("eliotCode").is_in(eliot_codes))
        if dates is not None:
            df = df.filter(pl.col("date").is_in(dates))

        return df

    def save(self, df):
        """Pass `df` through `Base.encode` and write it to the parquet file."""
        df_to_save = self.encode(df)
        df_to_save.write_parquet(self.fpath_data)


d = DataStockHisto()

In [None]:
import datetime as dt
import numpy as np

# Seeded generator so that every run benchmarks exactly the same data.
SEED = 0
rng = np.random.default_rng(SEED)

# Eliot Codes
n_codes = 50000
# Drawing 50k values out of ~1e9 yields at least one repeat more often than
# not; deduplicate so each (eliotCode, date) pair appears once in the table.
# The result may therefore hold slightly fewer than `n_codes` values.
eliot_codes = np.unique(rng.integers(1000, 1000000000, n_codes))
df_codes = pl.DataFrame({"eliotCode": eliot_codes})

# Dates
n_dates = 1000
start_date = dt.datetime(2000, 1, 1)
end_date = dt.datetime(2023, 1, 1)
# NOTE(review): this relies on `pl.date_range` returning a sliceable Series;
# some polars releases return an expression unless `eager=True` is passed --
# confirm against the installed version.
dates = pl.date_range(start_date, end_date, "1d")
dates = dates[:n_dates]

# Data: attach the same list of dates to every code, then explode it so the
# frame ends up with one row per (code, date) combination.
data = {"eliotCode": eliot_codes, "date": [dates] * len(eliot_codes)}
df = pl.DataFrame(data).explode("date")

# Values: one random volume in [0, 1) per row.
n = len(df)
volumes = rng.random(n)
df = pl.concat((df, pl.DataFrame({"volume": volumes})), how="horizontal")
df = Util.apply_schema(df, schema)

In [None]:
# Write the generated frame through the store that has no encoded columns.
d.save(df)

In [None]:
# Write the same frame through the store configured to encode `eliotCode` and `date`.
d_encoded.save(df)

In [None]:
import gc
# Drop the in-memory frame and force a collection before the read benchmarks.
# NOTE: re-running this cell on its own raises NameError, since `df` is gone.
del df
gc.collect()

In [None]:
# Time a read from the non-encoded store, restricted to the first 100 codes
# (converted to strings before being passed as the filter values).
%time df1 = d.read(eliot_codes=list(map(str, eliot_codes[:100])))
print(df1)

In [None]:
# Time the same read (first 100 codes, as strings) from the encoded store.
%time df2 = d_encoded.read(eliot_codes=list(map(str, eliot_codes[:100])))
print(df2)

In [None]:
import tracemalloc

# Measure the Python-level allocations of one read from the non-encoded store.
# NOTE(review): tracemalloc only sees memory obtained through Python's
# allocators; buffers allocated natively by polars are presumably not
# counted -- confirm before drawing conclusions from these numbers.
tracemalloc.start()
try:
    df1 = d.read(eliot_codes=list(map(str, eliot_codes[:100])))
    del df1
    gc.collect()
    # (current, peak) traced bytes; `current` is taken after the frame was freed.
    print(tracemalloc.get_traced_memory())
finally:
    # Always switch tracing off again, even if the read fails, so that the
    # tracing overhead does not leak into the following cells.
    tracemalloc.stop()

In [None]:
import tracemalloc

# Measure the Python-level allocations of one read from the encoded store.
# NOTE(review): tracemalloc only sees memory obtained through Python's
# allocators; buffers allocated natively by polars are presumably not
# counted -- confirm before drawing conclusions from these numbers.
tracemalloc.start()
try:
    df1 = d_encoded.read(eliot_codes=list(map(str, eliot_codes[:100])))
    del df1
    gc.collect()
    # (current, peak) traced bytes; `current` is taken after the frame was freed.
    print(tracemalloc.get_traced_memory())
finally:
    # Always switch tracing off again, even if the read fails, so that the
    # tracing overhead does not leak into the following cells.
    tracemalloc.stop()

In [None]:
# importing libraries
import os
import psutil
 
# psutil-based probe used by the profiling decorator
def process_memory():
    """Return the RSS value psutil reports for the current process."""
    current_pid = os.getpid()
    return psutil.Process(current_pid).memory_info().rss
 
# decorator function
def profile(func):
    """Decorate `func` so every call prints the process-memory change around it.

    The memory reading comes from `process_memory()` taken immediately before
    and after the call; the difference is printed with thousands separators.

    Args:
        func: the callable to wrap.

    Returns:
        A wrapper that forwards all arguments to `func`, prints
        "<name>:consumed memory: <delta>" and returns `func`'s result.
    """
    import functools  # local import keeps this cell self-contained

    # Without `wraps` every decorated function would be named "wrapper" and
    # lose its docstring.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        mem_before = process_memory()
        result = func(*args, **kwargs)
        mem_after = process_memory()
        print("{}:consumed memory: {:,}".format(
            func.__name__, mem_after - mem_before))

        return result

    return wrapper


@profile
def test_1():
    """Read the first 100 codes from the encoded store, pause 5 s, return the frame."""
    from time import sleep

    wanted_codes = [str(code) for code in eliot_codes[:100]]
    frame = d_encoded.read(eliot_codes=wanted_codes)
    sleep(5)
    return frame

@profile
def test_2():
    """Read the first 100 codes from the non-encoded store, pause 5 s, return the frame."""
    from time import sleep

    wanted_codes = [str(code) for code in eliot_codes[:100]]
    frame = d.read(eliot_codes=wanted_codes)
    sleep(5)
    return frame

# Run both profiled reads (encoded store first); each call prints its memory
# delta through the `profile` decorator and returns the frame it read.
df1 = test_1()
df2 = test_2()

In [None]:
from collections import Counter
import linecache
import os
import tracemalloc

def display_top(snapshot, key_type='lineno', limit=3):
    """Print the `limit` largest allocation sites recorded in a tracemalloc snapshot.

    Traces coming from "<frozen importlib._bootstrap>" and "<unknown>" are
    excluded first. For each reported site the last two path components, the
    line number, the size in KiB and (when available) the source line are
    printed, followed by a summary of the remaining sites and the overall total.

    Args:
        snapshot: a `tracemalloc.Snapshot`.
        key_type: grouping key forwarded to `Snapshot.statistics`.
        limit: number of entries listed individually.
    """
    ignored = (
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    )
    stats = snapshot.filter_traces(ignored).statistics(key_type)
    head, tail = stats[:limit], stats[limit:]

    print("Top %s lines" % limit)
    rank = 0
    for entry in head:
        rank += 1
        origin = entry.traceback[0]
        # keep only "module/file.py" out of "/path/to/module/file.py"
        short_name = os.sep.join(origin.filename.split(os.sep)[-2:])
        print("#%s: %s:%s: %.1f KiB"
              % (rank, short_name, origin.lineno, entry.size / 1024))
        source_line = linecache.getline(origin.filename, origin.lineno).strip()
        if source_line:
            print('    %s' % source_line)

    if tail:
        tail_bytes = sum(entry.size for entry in tail)
        print("%s other: %.1f KiB" % (len(tail), tail_bytes / 1024))

    grand_total = sum(entry.size for entry in stats)
    print("Total allocated size: %.1f KiB" % (grand_total / 1024))


# Trace one read from the encoded store and list the biggest allocation sites.
tracemalloc.start()
try:
    df1 = d_encoded.read(eliot_codes=list(map(str, eliot_codes[:100])))
    # The snapshot must be taken while tracing is still active.
    snapshot = tracemalloc.take_snapshot()
finally:
    # Tracing was previously left on for the rest of the kernel session;
    # stop it so later cells run without the tracing overhead.
    tracemalloc.stop()
display_top(snapshot)