# Comparison

In this notebook we will compare the following methods:
- Pandas
- PyArrow with Pandas
- Polars (CPU-based)
- Polars with CUDA acceleration

#### Dataset information
![dataset_information](./public/dataset_information.png)

### Time to filter and sort (data loaded from CSV)

In [1]:
import time
import pandas as pd
import polars as pl

import pyarrow.compute as pc
import pyarrow as pa


def time_dataframe_operations(df_pandas, df_polars, column=" Destination Port", threshold=4000):
    """
    Time a filter-then-sort workload with three different engines.

    Each engine keeps the rows where ``column > threshold`` and then sorts
    the remaining rows by ``column`` in ascending order:
    - Pandas
    - PyArrow (table built from the Pandas frame; the conversion is timed too)
    - Polars (CPU)

    No grouping step and no GPU run are performed here.

    Parameters
    ----------
    df_pandas : pandas.DataFrame
        Input for the Pandas and PyArrow measurements.
    df_polars : polars.DataFrame
        Input for the Polars measurement.
    column : str, optional
        Column used for both the filter and the sort. The default keeps the
        leading space that the original hard-coded name had.
    threshold : int or float, optional
        Rows are kept when ``column`` is strictly greater than this value.

    Returns
    -------
    dict
        Maps "Pandas", "PyArrow with Pandas" and "Polars (CPU)" to the
        elapsed time in seconds (float) for that engine.
    """
    results = {}

    # time.perf_counter() is monotonic and high resolution, so the measured
    # intervals cannot be distorted by system clock adjustments the way
    # time.time() differences can.

    # Pandas
    start = time.perf_counter()
    df_pandas_filtered = df_pandas[df_pandas[column] > threshold]
    df_pandas_filtered.sort_values(by=column)
    results["Pandas"] = time.perf_counter() - start

    # PyArrow: the Pandas -> Arrow conversion is deliberately inside the timed
    # region, so this figure is "start from a Pandas frame, compute in Arrow".
    start = time.perf_counter()
    df_arrow = pa.Table.from_pandas(df_pandas)
    df_arrow_filtered = df_arrow.filter(pc.greater(df_arrow[column], threshold))
    sort_order = pc.sort_indices(df_arrow_filtered, sort_keys=[(column, "ascending")])
    df_arrow_filtered.take(sort_order)
    results["PyArrow with Pandas"] = time.perf_counter() - start

    # Polars (CPU)
    start = time.perf_counter()
    df_polars_filtered = df_polars.filter(pl.col(column) > threshold)
    df_polars_filtered.sort(column)
    results["Polars (CPU)"] = time.perf_counter() - start

    return results


In [2]:
file_path = "./data/concat.csv"

# Read the same CSV once per library so each engine gets its own native frame.
df_pandas = pd.read_csv(file_path)
df_polars = pl.read_csv(file_path)

timings = time_dataframe_operations(df_pandas, df_polars)

# Print one line per engine, in the order the timings were recorded.
for method in timings:
    time_taken = timings[method]
    print(f"{method}: {time_taken:.4f} seconds")

Pandas: 0.5041 seconds
PyArrow with Pandas: 0.6617 seconds
Polars (CPU): 0.2960 seconds
