In [None]:
# Summarise the DataFrame (columns, dtypes, non-null counts) along with its memory footprint.
# memory_usage='deep' asks pandas to introspect object-dtype columns for their real size
# instead of reporting a shallow estimate.
df.info(memory_usage='deep')

In [None]:
# Set IPython's output cache size to 0. Per the IPython docs this disables output caching,
# so cell results are no longer kept alive through Out[] / _ references.
%config InteractiveShell.cache_size = 0

In [None]:
import sys

def get_var_size():
    """Print every global variable with its size in bytes, largest first.

    Sizes are obtained with ``sys.getsizeof``. For built-in containers such as
    lists and dicts this is a shallow size: the objects they reference are not
    included.

    Returns:
        None. The table is written to standard output.
    """
    var_sizes = [(var_name, sys.getsizeof(var_val)) for var_name, var_val in globals().items()]
    sorted_vars = sorted(var_sizes, key=lambda x: x[1], reverse=True)

    print("{:<30} {:<15}".format("Variable Name", "Size (bytes)"))
    print("=" * 45)
    for var_name, size in sorted_vars:
        # One row per variable, aligned with the header columns above.
        print("{:<30} {:<15}".format(var_name, size))

## Table and parquet format

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

###### Write a DataFrame to a Parquet file via a pyarrow Table, one row group at a time

In [None]:
# Convert the DataFrame to a pyarrow Table, then write it in fixed-size chunks so that
# each write_table() call receives at most `row_group_size` rows.
table = pa.Table.from_pandas(df)
row_group_size = 2048

with pq.ParquetWriter('file_path.parquet', table.schema) as writer:
    # Iterate over the Table's own row count: it is the object being sliced.
    for start_idx in range(0, table.num_rows, row_group_size):
        # slice(offset, length) is clipped at the end of the table, so the
        # final chunk may be shorter than row_group_size.
        writer.write_table(table.slice(start_idx, row_group_size))

###### Parquet file and table attributes

In [None]:
parquet_file = pq.ParquetFile('file_path.parquet')   # This is a ParquetFile
parquet_file.schema
parquet_file.metadata.num_rows
parquet_file.metadata.num_columns
parquet_file.metadata.num_row_groups

# Read the first row group once and reuse it, rather than re-reading the file
# for every attribute shown below.
first_row_group = parquet_file.read_row_group(0)     # This is a pyarrow Table
first_row_group.schema                               # With dtypes
first_row_group.schema.names                         # Column names
first_row_group.shape
first_row_group.num_rows
first_row_group.num_columns
first_row_group.slice(offset=10, length=20)          # Equivalent of DataFrame.iloc[10:30]
first_row_group.to_pandas()

In [None]:
# Target schema: (column name, Arrow type) pairs turned into pyarrow fields.
column_specs = [
    ('id_min', pa.int64()),
    ('id_max', pa.int64()),
    ('sequence', pa.string()),
]
my_schema = pa.schema([pa.field(col_name, col_type) for col_name, col_type in column_specs])

# Cast the first row group to the target schema; the resulting Table is the cell output.
row_group_0 = parquet_file.read_row_group(0)
row_group_0.cast(target_schema=my_schema)

###### Read a Parquet file one row group at a time

In [None]:
# Open the file, then load its row groups one by one.
parquet_file = pq.ParquetFile('file_path.parquet')
n_row_groups = parquet_file.metadata.num_row_groups
for i in range(n_row_groups):
    group_table = parquet_file.read_row_group(i)
    # NOTE: `df` is rebound on every iteration, so only the last row group is
    # still referenced once the loop ends; per-group processing belongs here.
    df = group_table.to_pandas()

## Time management

In [None]:
from datetime import datetime
import pytz

# Current wall-clock time in the Europe/Paris timezone, shown as hours:minutes.
paris_tz = pytz.timezone('Europe/Paris')
now_in_paris = datetime.now(paris_tz)
print(now_in_paris.strftime('%H:%M'))

In [None]:
import time

# Reference timestamp, taken immediately before the code being timed.
start_time = time.perf_counter()
# Seconds elapsed since the reference timestamp, printed with one decimal.
elapsed = time.perf_counter() - start_time
print(f"Duration: {elapsed:0.1f} sec")

In [None]:
# To be used as a decorator. Tracks the duration and peak traced memory of each call.
import time
import tracemalloc
from functools import wraps

def simple_time_and_memory_tracker(method):
    """Decorate ``method`` so each call prints its duration and peak traced memory.

    Args:
        method: The callable to wrap.

    Returns:
        A wrapper with the same call signature that forwards all arguments to
        ``method`` and returns its result unchanged. The wrapper keeps the
        wrapped callable's ``__name__``, ``__qualname__`` and ``__doc__``.

    Notes:
        Exceptions raised by ``method`` propagate unchanged and nothing is
        printed for that call. If tracemalloc is already tracing when the
        wrapper is entered (for example inside another tracked call), tracing
        is left running and the reported peak is measured since that earlier
        start.
    """
    # ### Log Level
    # 0: Nothing
    # 1: Print Time and Memory usage of functions
    LOG_LEVEL = 1

    @wraps(method)
    def method_with_trackers(*args, **kw):
        # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments.
        ts = time.perf_counter()
        # Only start/stop tracing if nobody else did; stopping here would
        # otherwise wipe the statistics of an enclosing tracked call.
        started_tracing = not tracemalloc.is_tracing()
        if started_tracing:
            tracemalloc.start()
        try:
            result = method(*args, **kw)
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always release the tracer we started, even when `method` raises.
            if started_tracing:
                tracemalloc.stop()
        duration = time.perf_counter() - ts
        if LOG_LEVEL > 0:
            print("{:<20} {:<15} {:<15}".format(f"\033[1m{method.__qualname__}", f"\033[0m{round(duration, 2)} sec", f"{round(peak / 1024**2,2)} RAM MB"))
        return result
    return method_with_trackers

## Print with color

In [None]:
# Blue text: \033[94m switches the colour on, \033[0m resets it.
blue_template = "\033[94m{} {}\033[0m"
print(blue_template.format("word 1", "word 2"))

# Bold text: only the first placeholder sits between \033[1m and the reset code.
bold_template = "\033[1m{}\033[0m {}"
print(bold_template.format("word 1", "word 2"))