In [1]:
import pandas as pd
import polars as pl
import timeit
import plotly.express as px

In [2]:
def measure_performance(func, *args, iterations=10, **kwargs):
    # Using timeit to run the function multiple times and return the average time
    timer = timeit.Timer(lambda: func(*args, **kwargs))
    time_taken = timer.timeit(number=iterations) / iterations  # Average time per run
    return time_taken, func(*args, **kwargs)

results = pd.DataFrame(columns=['Library', 'Operation', 'Time'])

def add_results(library, operation, time):
    global results
    new_row = pd.DataFrame({
        'Library': [library],
        'Operation': [operation],
        'Time': [time],
    })
    results = pd.concat([results, new_row], ignore_index=True)

In [3]:

#Loading the data from the file
def load_data_pandas(file_path):
    return pd.read_csv(file_path)

def load_data_polars(file_path):
    return pl.read_csv(file_path)

# Filtering the data based on a column value
def filter_data_pandas(df, column, value):
    return df[df[column] == value]

def filter_data_polars(df, column, value):
    return df.filter(pl.col(column) == value)

# Grouping the data based on a column and aggregating another column
def group_aggregate_pandas(df, group_column, agg_column):
    return df.groupby(group_column)[agg_column].agg('sum')

def group_aggregate_polars(df, group_column, agg_column):
    return df.lazy().group_by(group_column).agg(pl.col(agg_column).sum()).collect()

# Sorting the data based on a column
def sort_data_pandas(df, column):
    return df.sort_values(by=column)

def sort_data_polars(df, column):
    return df.sort(column)

### Data Source
CSV file used for testing can be downloaded from https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents?resource=download

CSV file is **3GB** in size.

In [4]:
time_pandas, data_pandas = measure_performance(load_data_pandas, 'US_Accidents_March23.csv', iterations=1)
add_results('Pandas', 'Load Data', time_pandas)

time_polars, data_polars = measure_performance(load_data_polars, 'US_Accidents_March23.csv', iterations=1)
add_results('Polars', 'Load Data', time_polars)

print("Time taken to load data using Pandas: ", time_pandas)
print("Time taken to load data using Polars: ", time_polars)

  results = pd.concat([results, new_row], ignore_index=True)


Time taken to load data using Pandas:  90.51498837501276
Time taken to load data using Polars:  13.99401445798867


In [5]:
time_pandas, _ = measure_performance(filter_data_pandas, data_pandas, 'Station', True, iterations=10)
add_results('Pandas', 'Filter Data', time_pandas)

time_polars, _ = measure_performance(filter_data_polars, data_polars, 'Station', True, iterations=10)
add_results('Polars', 'Filter Data', time_polars)

print("Time taken to load data using Pandas: ", time_pandas)
print("Time taken to load data using Polars: ", time_polars)

Time taken to load data using Pandas:  4.208029666700168
Time taken to load data using Polars:  1.3851179042001605


In [6]:
# Grouping and Aggregating Data
time_pandas, _ = measure_performance(group_aggregate_pandas, data_pandas, 'State', 'Distance(mi)', iterations=100)
add_results('Pandas', 'Group and Aggregate', time_pandas)   

time_polars, _ = measure_performance(group_aggregate_polars, data_polars, 'State', 'Distance(mi)', iterations=100)
add_results('Polars', 'Group and Aggregate', time_polars)

print("Time taken to load data using Pandas: ", time_pandas)
print("Time taken to load data using Polars: ", time_polars)

Time taken to load data using Pandas:  0.20177918291999958
Time taken to load data using Polars:  0.10071729082992534


In [7]:

# Sorting Data
time_pandas, _ = measure_performance(sort_data_pandas, data_pandas, 'Distance(mi)', iterations=5)
add_results('Pandas', 'Sort Data', time_pandas)

time_polars, _ = measure_performance(sort_data_polars, data_polars, 'Distance(mi)', iterations=5)
add_results('Polars', 'Sort Data', time_polars)

print("Time taken to load data using Pandas: ", time_pandas)
print("Time taken to load data using Polars: ", time_polars)

Time taken to load data using Pandas:  10.223441408399959
Time taken to load data using Polars:  5.8738187250011835


In [8]:


# Plotting the results using Plotly
fig = px.bar(results, x='Operation', y='Time', color='Library', barmode='group',
             title="Time Comparison Between Pandas and Polars", text='Time')
fig.update_layout(yaxis_type='log')
fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')

fig.show()
