In [3]:
import polars as pl
 
import os


# Locate the working directory the notebook is running from and echo it
path = os.getcwd()
print("Current Directory", path)

# Input data is read from the "data" folder one level above the working directory
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
input_dir = parent_dir + "/data"



Current Directory /Users/NachoCorcuera/PycharmProjects/Pandas-Polars-PySpark-BenchMark/notebooks


# Polars Eager

In [4]:

def join_df():
    """Build the small region lookup table used as the right side of the join.

    Returns:
        A Polars DataFrame with two string columns, ``"Region"`` (full region
        name) and ``"Alias"`` (its short code), one row per region.
    """
    # (full region name, short alias) pairs, kept together so they cannot drift apart
    region_aliases = [
        ("Europe", "EU"),
        ("North America", "NA"),
        ("Asia", "AS"),
        ("Sub-Saharan Africa", "SSA"),
        ("Central America and the Caribbean", "CA"),
        ("Middle East and North Africa", "MENA"),
        ("Australia and Oceania", "AUS"),
    ]

    region_names = [name for name, _ in region_aliases]
    short_codes = [code for _, code in region_aliases]

    return pl.DataFrame({"Region": region_names, "Alias": short_codes})

In [1]:

import time

def processing_df(file):
    """Benchmark eager Polars on one CSV dataset and return per-stage timings.

    Runs five stages in order -- CSV read, row filter, group-by aggregation,
    left join against the region lookup from ``join_df()``, and CSV write --
    and times each stage separately.

    Args:
        file: Dataset name without extension; the file read is
            ``{input_dir}/{file}.csv``.

    Returns:
        dict mapping stage name (``"read_csv"``, ``"filter"``,
        ``"aggregation"``, ``"join"``, ``"write"``) to elapsed seconds.

    Side effects:
        Writes ``testing_write_2.csv`` relative to the current working
        directory.
    """
    times = {}

    ## Reading time
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock and may be adjusted.
    start_time = time.perf_counter()
    df = pl.read_csv(f"{input_dir}/{file}.csv")
    times["read_csv"] = time.perf_counter() - start_time

    ## Filtering time
    start_time = time.perf_counter()
    # The filtered frame is not used afterwards; only the elapsed time matters.
    filtered_polars = df.filter(pl.col('Total Profit') > 2000)
    times["filter"] = time.perf_counter() - start_time

    ## Aggregation
    start_time = time.perf_counter()
    # Result intentionally discarded; eager execution still does the full work.
    df.group_by('Region').agg(pl.col('Total Profit').sum().alias('sales'),
                            pl.col('Total Profit').mean().alias('sales_mean'),
                            pl.col('Total Profit').max().alias('sales_max'),
                            pl.col('Total Profit').min().alias('sales_min'),
                            pl.col('Total Profit').median().alias('sales_median'))
    times["aggregation"] = time.perf_counter() - start_time

    ## Joining time
    # The lookup table is built inside the timed region, so its (small)
    # construction cost is part of the reported join time.
    start_time = time.perf_counter()
    df_regions = join_df()
    df_joined = df.join(df_regions, on="Region", how="left")
    times["join"] = time.perf_counter() - start_time

    ## Writing time
    # Only the write itself is timed; no unrelated work belongs in this region.
    start_time = time.perf_counter()
    df_joined.write_csv("testing_write_2.csv")
    times["write"] = time.perf_counter() - start_time

    return times

In [5]:
# Names of the CSV datasets to benchmark (resolved under input_dir by processing_df)
file_list = [
    "sales_50000",
    "sales_250000",
    "sales_1000000",
    "sales_5000000",
    "sales_25000000",
]

# Run the benchmark once per dataset, in list order, keyed by dataset name
times = dict((name, processing_df(name)) for name in file_list)

In [6]:
# Display the per-dataset, per-stage timings (seconds) returned by processing_df
times

{'sales_50000': {'read_csv': 0.040792226791381836,
  'filter': 0.0022542476654052734,
  'aggregation': 0.0019421577453613281,
  'join': 0.0011718273162841797,
  'write': 0.08605599403381348},
 'sales_250000': {'read_csv': 0.02921295166015625,
  'filter': 0.0054209232330322266,
  'aggregation': 0.006804943084716797,
  'join': 0.0028297901153564453,
  'write': 0.051733970642089844},
 'sales_1000000': {'read_csv': 0.0826117992401123,
  'filter': 0.019051790237426758,
  'aggregation': 0.03006911277770996,
  'join': 0.011890888214111328,
  'write': 0.1584300994873047},
 'sales_5000000': {'read_csv': 0.43853116035461426,
  'filter': 0.08569121360778809,
  'aggregation': 0.15650582313537598,
  'join': 0.056740760803222656,
  'write': 0.8676810264587402},
 'sales_25000000': {'read_csv': 2.2471492290496826,
  'filter': 3.014470100402832,
  'aggregation': 2.2853622436523438,
  'join': 0.3383932113647461,
  'write': 5.165363073348999}}

In [11]:

import time

def processing_df_parquet(file):
    """Benchmark eager Polars on one Parquet dataset and return per-stage timings.

    Runs five stages in order -- Parquet read, row filter, group-by
    aggregation, left join against the region lookup from ``join_df()``, and
    Parquet write -- and times each stage separately.

    Args:
        file: Dataset directory relative to ``input_dir``; every file matching
            ``{input_dir}/{file}/*.parquet`` is read.

    Returns:
        dict mapping stage name (``"read_csv"``, ``"filter"``,
        ``"aggregation"``, ``"join"``, ``"write"``) to elapsed seconds.
        Note that ``"read_csv"`` holds the Parquet read time here; the key
        name is kept unchanged so existing consumers of the dict keep working.

    Side effects:
        Writes a Parquet file named ``testing_write_2`` (no extension)
        relative to the current working directory.
    """
    times = {}

    ## Reading time
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock and may be adjusted.
    start_time = time.perf_counter()
    df = pl.read_parquet(f"{input_dir}/{file}/*.parquet")
    times["read_csv"] = time.perf_counter() - start_time

    ## Filtering time
    start_time = time.perf_counter()
    # The filtered frame is not used afterwards; only the elapsed time matters.
    filtered_polars = df.filter(pl.col('Total Profit') > 2000)
    times["filter"] = time.perf_counter() - start_time

    ## Aggregation
    start_time = time.perf_counter()
    # Result intentionally discarded; eager execution still does the full work.
    df.group_by('Region').agg(pl.col('Total Profit').sum().alias('sales'),
                            pl.col('Total Profit').mean().alias('sales_mean'),
                            pl.col('Total Profit').max().alias('sales_max'),
                            pl.col('Total Profit').min().alias('sales_min'),
                            pl.col('Total Profit').median().alias('sales_median'))
    times["aggregation"] = time.perf_counter() - start_time

    ## Joining time
    # The lookup table is built inside the timed region, so its (small)
    # construction cost is part of the reported join time.
    start_time = time.perf_counter()
    df_regions = join_df()
    df_joined = df.join(df_regions, on="Region", how="left")
    times["join"] = time.perf_counter() - start_time

    ## Writing time
    # Only the write itself is timed; no unrelated work belongs in this region.
    start_time = time.perf_counter()
    df_joined.write_parquet("testing_write_2")
    times["write"] = time.perf_counter() - start_time

    return times

In [12]:
# Run the Parquet benchmark on a single dataset; the name is resolved
# relative to input_dir inside processing_df_parquet, and the returned
# timing dict is shown as the cell output.
file = "parquet/sales_25000000"
processing_df_parquet(file)

{'read_csv': 3.8856701850891113,
 'filter': 1.0167460441589355,
 'aggregation': 0.48287105560302734,
 'join': 0.4307827949523926,
 'write': 21.223906755447388}