In [1]:
import polars as pl
 
import os


# Working directory of the kernel; echoed so the run records where it executed.
path = os.getcwd()
print("Current Directory", path)

# The datasets are read from the "data" folder that sits next to the
# notebook directory, i.e. one level above the current working directory.
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
input_dir = parent_dir + "/data"



Current Directory /Users/NachoCorcuera/PycharmProjects/Pandas-Polars-PySpark-BenchMark/notebooks


# Polars Lazy

In [2]:

def join_df():
    """Build the region lookup table used as the right side of the join.

    Returns
    -------
    pl.LazyFrame
        Two columns, ``"Region"`` (full region name) and ``"Alias"``
        (its short code), one row per region in the order listed below.
    """
    # Insertion order of the dict fixes the row order of the frame.
    alias_by_region = {
        "Europe": "EU",
        "North America": "NA",
        "Asia": "AS",
        "Sub-Saharan Africa": "SSA",
        "Central America and the Caribbean": "CA",
        "Middle East and North Africa": "MENA",
        "Australia and Oceania": "AUS",
    }

    lookup = pl.DataFrame({
        "Region": list(alias_by_region.keys()),
        "Alias": list(alias_by_region.values()),
    })

    # Hand back a lazy frame so it can be joined against other lazy frames.
    return lookup.lazy()

In [3]:

import time

def processing_df(file):
    """Time each stage of a lazy Polars pipeline over one CSV dataset.

    Parameters
    ----------
    file : str
        Dataset name without extension; the data is scanned from
        ``{input_dir}/{file}.csv``.

    Returns
    -------
    dict
        Elapsed seconds (float) keyed by ``"read_csv"``, ``"filter"``,
        ``"aggregation"``, ``"join"`` and ``"write"``.

    Notes
    -----
    Side effect: writes ``testing_write_2.csv`` to the current working
    directory, overwriting any previous run.

    NOTE(review): ``collect`` is only called in the write stage, and only on
    the joined frame. The scan/filter/aggregation/join stages operate on
    LazyFrames without collecting, so their timings presumably reflect query
    plan construction rather than execution, and the filter and aggregation
    results never reach the written file -- confirm this is the intended
    benchmark.
    """
    times = {}

    ## Reading time
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    df = pl.scan_csv(f"{input_dir}/{file}.csv")
    times["read_csv"] = time.perf_counter() - start_time

    ## Filtering time
    start_time = time.perf_counter()
    # Result intentionally discarded: only the call itself is timed.
    df.filter(pl.col('Total Profit') > 2000)
    times["filter"] = time.perf_counter() - start_time

    ## Aggregation
    start_time = time.perf_counter()
    # Result intentionally discarded: only the call itself is timed.
    df.group_by('Region').agg(
        pl.col('Total Profit').sum().alias('sales'),
        pl.col('Total Profit').mean().alias('sales_mean'),
        pl.col('Total Profit').max().alias('sales_max'),
        pl.col('Total Profit').min().alias('sales_min'),
        pl.col('Total Profit').median().alias('sales_median'),
    )
    times["aggregation"] = time.perf_counter() - start_time

    ## Joining time
    start_time = time.perf_counter()
    df_regions = join_df()
    df_joined = df.join(df_regions, on="Region", how="left")
    times["join"] = time.perf_counter() - start_time

    ## Writing time
    # The lookup table is not rebuilt here: it was unused in this stage and
    # its construction cost was being charged to the write measurement.
    start_time = time.perf_counter()
    df_joined.collect(streaming=True).write_csv("testing_write_2.csv")
    times["write"] = time.perf_counter() - start_time

    return times

In [4]:
# CSV datasets to benchmark; the numeric suffix is presumably the row count.
file_list = [
    f"sales_{size}"
    for size in (50000, 250000, 1000000, 5000000, 25000000)
]

# Per-dataset stage timings: {dataset name -> dict returned by processing_df}.
times = dict((file, processing_df(file)) for file in file_list)

In [12]:

import time
import psutil
def processing_df_parquet(file):
    """Time each stage of a lazy Polars pipeline over one Parquet dataset.

    Parameters
    ----------
    file : str
        Directory path relative to ``input_dir``; every file matching
        ``{input_dir}/{file}/*.parquet`` is scanned.

    Returns
    -------
    dict
        Elapsed seconds (float) keyed by ``"read_csv"``, ``"filter"``,
        ``"aggregation"``, ``"join"`` and ``"write"``. The ``"read_csv"`` key
        holds the Parquet scan time; the name is kept so the keys match those
        of ``processing_df``.

    Notes
    -----
    Side effects: writes a Parquet file named ``testing_write_2`` to the
    current working directory, and prints the system-wide CPU percentage,
    the virtual-memory snapshot and the memory percentage in use.

    NOTE(review): ``collect`` is only called in the write stage, and only on
    the joined frame. The scan/filter/aggregation/join stages operate on
    LazyFrames without collecting, so their timings presumably reflect query
    plan construction rather than execution -- confirm this is the intended
    benchmark.
    """
    times = {}

    ## Reading time
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    df = pl.scan_parquet(f"{input_dir}/{file}/*.parquet")
    times["read_csv"] = time.perf_counter() - start_time

    ## Filtering time
    start_time = time.perf_counter()
    # Result intentionally discarded: only the call itself is timed.
    df.filter(pl.col('Total Profit') > 2000)
    times["filter"] = time.perf_counter() - start_time

    ## Aggregation
    start_time = time.perf_counter()
    # Result intentionally discarded: only the call itself is timed.
    df.group_by('Region').agg(
        pl.col('Total Profit').sum().alias('sales'),
        pl.col('Total Profit').mean().alias('sales_mean'),
        pl.col('Total Profit').max().alias('sales_max'),
        pl.col('Total Profit').min().alias('sales_min'),
        pl.col('Total Profit').median().alias('sales_median'),
    )
    times["aggregation"] = time.perf_counter() - start_time

    ## Joining time
    start_time = time.perf_counter()
    df_regions = join_df()
    df_joined = df.join(df_regions, on="Region", how="left")
    times["join"] = time.perf_counter() - start_time

    ## Writing time
    # The lookup table is not rebuilt here: it was unused in this stage and
    # its construction cost was being charged to the write measurement.
    start_time = time.perf_counter()
    df_joined.collect(streaming=True).write_parquet("testing_write_2")
    times["write"] = time.perf_counter() - start_time

    # Resource report. The per-process memory/CPU baselines that used to be
    # sampled at the top of the function were never read, so they are gone.
    print(psutil.cpu_percent())
    # Sample memory once so both printed lines describe the same snapshot.
    memory = psutil.virtual_memory()
    print(memory)  # physical memory usage
    print('memory % used:', memory.percent)
    return times

In [13]:
# Run the Parquet benchmark on the 25,000,000 dataset only.
# `x` receives the per-stage timing dict returned by the benchmark.
file = "parquet/sales_25000000"
x = processing_df_parquet(file=file)

9.2
svmem(total=34359738368, available=13390675968, percent=61.0, used=5322358784, free=6151897088, active=2801582080, inactive=6037389312, wired=2520776704)
memory % used: 61.0
