In [1]:
import polars as pl
import pandas as pd
import pyspark as ps

In [2]:
CSV_40M_PATH = 'data/Parking_Violations_40M.csv' # 9.65 GB
CSV_20M_PATH = 'data/Parking_Violations_20M.csv' # 4.89 GB
CSV_9M_PATH = 'data/Parking_Violations_9M.csv' # 1.95 GB
CSV_4M_PATH = 'data/Parking_Violations_4M.csv' # 855 MB
CSV_2M_PATH = 'data/Parking_Violations_2M.csv' # 428 MB
CSV_1M_PATH = 'data/Parking_Violations_1M.csv' # 214 MB
CSV_500K_PATH = 'data/Parking_Violations_500K.csv' # 107 MB

In [3]:
CSV_PATH = CSV_500K_PATH
OUT_JSON_NAME = 'CSV_500K_PATH'

In [4]:
comparison_results_dict = {
    'read_csv': {},
    'filter_one_condition': {},
    'filter_multiple_conditions': {},
    'create_new_column': {},
    'group_by': {}
}

# Let's compare

## 1. Read CSV

### 1.1. Polars

In [5]:
%%timeit -o
polars_df = pl.scan_csv(CSV_PATH, ignore_errors=True)
polars_df.collect()

113 ms ± 959 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 113 ms ± 959 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [6]:
comparison_results_dict['read_csv']['polars'] = _.best

### 1.2. Pandas

In [7]:
%%timeit -o
pandas_df = pd.read_csv(CSV_PATH, on_bad_lines='skip')

764 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 764 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [8]:
comparison_results_dict['read_csv']['pandas'] = _.best

In [9]:
polars_df = pl.scan_csv(CSV_PATH, ignore_errors=True)

In [10]:
pandas_df = pd.read_csv(CSV_PATH, on_bad_lines='skip')

## 2. Filter rows based on one condition

### 2.1. Polars

In [11]:
%%timeit -o
result_polars_df = polars_df.filter(pl.col(['Registration State']) == "NY")
result_polars_df.collect()

126 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 126 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [12]:
comparison_results_dict['filter_one_condition']['polars'] = _.best

### 2.2. Pandas

In [13]:
%%timeit -o
results_pandas_df = pandas_df.loc[pandas_df['Registration State'] == 'NY']

56.7 ms ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 56.7 ms ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [14]:
comparison_results_dict['filter_one_condition']['pandas'] = _.best

## 3. Filter rows based on multiple conditions

### 3.1. Polars

In [15]:
%%timeit -o
result_polars_df = polars_df.filter(
    (pl.col('Registration State') == 'NY') & \
    (pl.col('Summons Number') <= 1335096139) & \
    (pl.col('Summons Number') > 1335089433)
)
result_polars_df.collect()

119 ms ± 5.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 119 ms ± 5.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [16]:
comparison_results_dict['filter_multiple_conditions']['polars'] = _.best

### 3.2. Pandas

In [17]:
%%timeit -o
results_pandas_df = pandas_df.loc[
    (pandas_df['Registration State'] == 'NY') & \
    (pandas_df['Summons Number'] <= 1335096139) & \
    (pandas_df['Summons Number'] > 1335089433)
]

19.3 ms ± 940 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 19.3 ms ± 940 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [18]:
comparison_results_dict['filter_multiple_conditions']['pandas'] = _.best

## 4. Create new column based on another

### 4.1. Polars

In [19]:
%%timeit -o
result_polars_df = polars_df.with_columns([
    (pl.col('Violation Code') + 10_000).alias('Clean Violation Code')
])
result_polars_df.collect()

117 ms ± 3.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 117 ms ± 3.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [20]:
comparison_results_dict['create_new_column']['polars'] = _.best

### 4.2. Pandas

In [21]:
%%timeit -o
pandas_df['Clean Violation Code'] = pandas_df['Violation Code'].apply(lambda x: x + 10_000)

74 ms ± 560 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 74 ms ± 560 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [22]:
comparison_results_dict['create_new_column']['pandas'] = _.best

## 5. GroupBy

### 5.1. Polars

In [23]:
%%timeit -o
result_polars_df = polars_df.group_by(by='Registration State').count()
result_polars_df.collect()



27.9 ms ± 917 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 27.9 ms ± 917 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [24]:
comparison_results_dict['group_by']['polars'] = _.best

### 5.2. Pandas

In [25]:
%%timeit -o
grouped_pandas_df = pandas_df.groupby(by='Registration State').count()

227 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 227 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [26]:
comparison_results_dict['group_by']['pandas'] = _.best

## 6. RESULTS

In [27]:
import json
print(json.dumps(comparison_results_dict, indent=4))

{
    "read_csv": {
        "polars": 0.11123097079980653,
        "pandas": 0.7458537500060629
    },
    "filter_one_condition": {
        "polars": 0.11893107080104529,
        "pandas": 0.05369687500060536
    },
    "filter_multiple_conditions": {
        "polars": 0.11459862080082531,
        "pandas": 0.01839585541994893
    },
    "create_new_column": {
        "polars": 0.11311812499479856,
        "pandas": 0.07323276669922052
    },
    "group_by": {
        "polars": 0.02663726669998141,
        "pandas": 0.21668441699875984
    }
}


In [28]:
with open(f'{OUT_JSON_NAME}_without_read.json', 'w') as file:
    json.dump(comparison_results_dict, file, indent=4)