In [73]:
import pandas as pd
import numpy as np
import polars as pl
from collections import Counter
import time
import polars as pl
from pympler import asizeof


In [79]:
def generate_data(size):
    if isinstance(size, list):
        pdls, plls, npls = [], [], []
        for s in size:
            data = pd.DataFrame({
                    'numerical_0': np.random.rand(s),
                    'numerical_1': np.random.rand(s),
                    'category_0': np.random.randint(5, size=s),
                    'category_1': np.random.randint(8, size=s),
                })
            pdls.append(data)
            plls.append(pl.from_dataframe(data))
            npls.append(data.values)
    return pdls, plls, npls
size = [10000, 1000000, 5000000, 10000000, 50000000]
pdls, plls, npls = generate_data(size)
category_name = ['category_0', 'category_1']
numerical_name = ['numerical_0', 'numerical_1']

size = asizeof.asizeof(pdls)
print(f"Pandas Object Size of object: {size} bytes")
size = asizeof.asizeof(plls)
print(f"Polars Object Size of object: {size} bytes")

Pandas Object Size of object: 3168521504 bytes
Polars Object Size of object: 936 bytes


## **Single Feature Transformer**

#### **1. Count**

In [3]:
### pandas
for data in pdls:
    start_time = time.time()
    temp = data[category_name[0]].value_counts()
    x = data.merge(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.002215862274169922
1000000 Sample Execution time： 0.03449082374572754
5000000 Sample Execution time： 0.17963290214538574
10000000 Sample Execution time： 0.3504457473754883
50000000 Sample Execution time： 1.621446132659912


In [4]:
### polars
for data in plls:
    start_time = time.time()
    temp = data.group_by(category_name[0]).agg([
        pl.col(category_name[0]).count().alias("counts")
    ])
    _ = data.join(temp, on=category_name[0], how="left")
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0020034313201904297
1000000 Sample Execution time： 0.009187459945678711
5000000 Sample Execution time： 0.04319405555725098
10000000 Sample Execution time： 0.08982324600219727
50000000 Sample Execution time： 0.4042835235595703


In [5]:
### numpy
for data in npls:
    start_time = time.time()
    temp = Counter(data[:, 2])
    result = np.zeros_like(data[:, 2])
    for i, val in enumerate(data[:, 2]):
        if not np.isnan(val):
            if isinstance(val, list) or isinstance(val, np.ndarray): val = val[0]
            result[i] = temp[val]
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.013328075408935547
1000000 Sample Execution time： 0.9126410484313965
5000000 Sample Execution time： 4.73424220085144
10000000 Sample Execution time： 9.089803457260132
50000000 Sample Execution time： 43.22467517852783


#### **2. Delay**

In [6]:
### pandas
for data in pdls:
    start_time = time.time()
    temp = data[numerical_name[0]].shift(10)
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0
1000000 Sample Execution time： 0.0020036697387695312
5000000 Sample Execution time： 0.006906747817993164
10000000 Sample Execution time： 0.01806044578552246
50000000 Sample Execution time： 0.04733538627624512


In [7]:
### polars
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].shift(10).alias("shift")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.012961387634277344
1000000 Sample Execution time： 0.0
5000000 Sample Execution time： 0.0
10000000 Sample Execution time： 0.0
50000000 Sample Execution time： 0.0


In [8]:
### numpy
for data in npls:
    start_time = time.time()
    delayed_col = np.roll(data[:, 0], 10)
    delayed_col = delayed_col.astype(float)
    delayed_col[:10] = np.nan  
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0005056858062744141
1000000 Sample Execution time： 0.0025081634521484375
5000000 Sample Execution time： 0.011437416076660156
10000000 Sample Execution time： 0.02332019805908203
50000000 Sample Execution time： 0.08516168594360352


#### **3. TSUM**

In [9]:
### pandas
for data in pdls:
    start_time = time.time()
    temp = data[numerical_name[0]].rolling(10).sum()
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0012521743774414062
1000000 Sample Execution time： 0.012411117553710938
5000000 Sample Execution time： 0.056128501892089844
10000000 Sample Execution time： 0.11829137802124023
50000000 Sample Execution time： 0.6118054389953613


In [10]:
### polars
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_sum(10).alias("tsum")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.013611078262329102
1000000 Sample Execution time： 0.003021717071533203
5000000 Sample Execution time： 0.022639989852905273
10000000 Sample Execution time： 0.06169581413269043
50000000 Sample Execution time： 0.3012354373931885


In [11]:
### numpy
def rolling_sum(x, window):
    sum_arr = np.full_like(x, np.nan, dtype=np.float64)
    for i in range(window - 1, len(x)):
        sum_arr[i] = np.sum(x[i - window + 1:i + 1])
    return sum_arr
for data in npls:
    start_time = time.time()
    rolling_sum(data[:, 0], 10)
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.022855520248413086
1000000 Sample Execution time： 2.1963748931884766
5000000 Sample Execution time： 10.922600984573364
10000000 Sample Execution time： 21.817306518554688
50000000 Sample Execution time： 108.69869565963745


## Intermediate Conclusion

- **Observation 1**: Numpy hasn`t plenty functions to transformer data.
- **Observation 2**: Compared to pandas or polars, the calculate time is can`t tolerate, so abandon numpy.
- **Observation 3**: Polars is faster than pandas.

#### **4. TMAX, TMEAN, TMEDIAN, TMIN, TQUANTILE, TSKEW, TSTD, TVAR**

#### According to Observation above, just test for the time series operators polars is faster than pandas

In [12]:
### pandas
pandas_spend_times = []
for data in pdls:
    # tmax
    start_time = time.time()
    data['tmax'] = data[numerical_name[0]].rolling(10).max()
    pandas_spend_times.append(time.time() - start_time)
    # tmean
    start_time = time.time()
    data['tmean'] = data[numerical_name[0]].rolling(10).mean()
    pandas_spend_times.append(time.time() - start_time)
    # tmedian
    start_time = time.time()
    data['tmedian'] = data[numerical_name[0]].rolling(10).median()
    pandas_spend_times.append(time.time() - start_time)
    # tmin
    start_time = time.time()
    data['tmin'] = data[numerical_name[0]].rolling(10).min()
    pandas_spend_times.append(time.time() - start_time)
    # tquantile
    start_time = time.time()
    data['tquantile'] = data[numerical_name[0]].rolling(10).quantile(q=0.5)
    pandas_spend_times.append(time.time() - start_time)
    # tskew
    start_time = time.time()
    data['tskew'] = data[numerical_name[0]].rolling(10).skew()
    pandas_spend_times.append(time.time() - start_time)
    # tstd
    start_time = time.time()
    data['tstd'] = data[numerical_name[0]].rolling(10).std()
    pandas_spend_times.append(time.time() - start_time)
    # tvar
    start_time = time.time()
    data['tvar'] = data[numerical_name[0]].rolling(10).var()
    pandas_spend_times.append(time.time() - start_time)

In [13]:
### polars
polars_spend_times = []
for data in plls:
    # tmax
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_max(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tmean
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_mean(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tmedian
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_median(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tmin
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_min(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tquantile
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_quantile(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tskew
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_skew(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tstd
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_std(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)
    # tvar
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_var(10).alias("tsum")
    ])
    polars_spend_times.append(time.time() - start_time)

In [14]:
op_name = ['TMAX', 'TMEAN', 'TMEDIAN', 'TMIN', 'TQUANTILE', 'TSKEW', 'TSTD', 'TVAR'] * 5
for i, (a, b) in enumerate(zip(pandas_spend_times, polars_spend_times)):
    print(f"{size[i//8]} Samples", op_name[i % len(op_name)], " Execution time difference:\t", a - b)

10000 Samples TMAX  Execution time difference:	 -0.01311182975769043
10000 Samples TMEAN  Execution time difference:	 0.0
10000 Samples TMEDIAN  Execution time difference:	 0.0020508766174316406
10000 Samples TMIN  Execution time difference:	 0.0
10000 Samples TQUANTILE  Execution time difference:	 0.001016378402709961
10000 Samples TSKEW  Execution time difference:	 -0.0117340087890625
10000 Samples TSTD  Execution time difference:	 0.0010104179382324219
10000 Samples TVAR  Execution time difference:	 0.0
1000000 Samples TMAX  Execution time difference:	 0.011887311935424805
1000000 Samples TMEAN  Execution time difference:	 0.005140066146850586
1000000 Samples TMEDIAN  Execution time difference:	 0.12295031547546387
1000000 Samples TMIN  Execution time difference:	 0.009814023971557617
1000000 Samples TQUANTILE  Execution time difference:	 0.160841703414917
1000000 Samples TSKEW  Execution time difference:	 -1.1319470405578613
1000000 Samples TSTD  Execution time difference:	 0.00792

### **Interesting thing is rolling_skew polars is slower than pandas**

- One reason is polars and pandas the result is different.
- We try the Mathematically defined skew by using numpy and scripy, the result is same with the polars.

#### This is a example in pandas
```
ser = pd.Series([1, 5, 2, 7, 15, 6])
ser.rolling(3).skew().round(6)
0         NaN
1         NaN
2    1.293343
3   -0.585583
4    0.670284
5    1.652317
dtype: float64
```
**skew([1,5,2]) should be 0.5280, but the results is 1.293343, we don`t check the pandas resource code, for Feature Engingeer, we perfer polars result**

In [15]:
pdls[0][numerical_name[0]].rolling(10).skew()[9]

0.9906771678583391

In [16]:
plls[0].select([plls[0][numerical_name[0]].rolling_skew(10).alias("tskew")])[9]

tskew
f64
0.835412


In [17]:
def calculate_skewness(data):
    n = len(data)
    mean = np.mean(data)
    std_dev = np.std(data, ddof=0)
    
    skewness = (np.sum((data - mean) ** 3) / n) / (std_dev ** 3)
    
    return skewness
x = pdls[0][numerical_name[0]].values[:10]
calculate_skewness(x)

0.8354123403619467

In [18]:
from scipy.stats import skew
skew(np.array([1, 5, 2]))

0.5280049792181881

In [19]:
calculate_skewness(np.array([1, 5, 2]))

0.528004979218188

#### **5. DELTA, TSARGMAX, TSARGMIN, STDDEV, DECAYLINEAR**
the operate is no built-in functions both in pandas and polars

polars execute take lots of time, we don`t run it all

In [20]:
### pandas
for data in pdls:
    start_time = time.time()
    temp = data[numerical_name[0]].rolling(10).apply(lambda x: x.argmin(), raw=True, engine='numba')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 1.0883605480194092
1000000 Sample Execution time： 0.2853522300720215
5000000 Sample Execution time： 0.557382345199585
10000000 Sample Execution time： 0.8201844692230225
50000000 Sample Execution time： 3.512913703918457


In [21]:
### polars
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_map(
        window_size=10, function=lambda x: int(pl.Series(x).arg_max())).alias("rolling_argmax")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.08661174774169922
1000000 Sample Execution time： 6.907378673553467
5000000 Sample Execution time： 23.615180253982544
10000000 Sample Execution time： 46.79049873352051
50000000 Sample Execution time： 236.20098090171814


In [22]:
### polars
weights = np.linspace(0.1, 1, int(10))
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        data[numerical_name[0]].rolling_map(
        window_size=10, function=lambda x: (x * weights).sum()).alias("DECAYLINEAR")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)


10000 Sample Execution time： 0.12421202659606934
1000000 Sample Execution time： 5.395846605300903
5000000 Sample Execution time： 26.17410135269165
10000000 Sample Execution time： 52.41760063171387
50000000 Sample Execution time： 263.3735523223877


### **Interesting thing is apply or map function polars is slower than pandas,**
### **on the other hand pandas+numba is better than polars for this situation.**
#### **why!**
#### **The ChatGPT`s explain is:**
1. **Numba Acceleration:** Pandas' rolling operation, combined with Numba, allows for just-in-time compilation of functions, which can significantly speed up computations with large datasets. Numba often performs faster than libraries built on interpreted languages (like Python or Rust's higher-level libraries) when handling numerical calculations.

2. **Polars Internal Implementation:** Polars' rolling_map function might not have the same just-in-time compilation capabilities as Numba. While Polars is based on Rust and generally performs excellently, in this specific case (using custom functions), it might not optimize the operation as effectively as Numba.

3. **Efficiency of Rolling Operations:** Different libraries implement rolling operations with varying underlying logic, which can lead to differences in efficiency. For instance, Polars may have more complex or additional steps when handling rolling_map, which could slow down the process compared to Pandas.

#### **6.Sine, Cosine, Softmax**

In [23]:
### pandas
pandas_spend_times = []
def softmax(x):
    e_x = np.exp(x - np.max(x))  # 对输入值进行缩放以避免数值不稳定性
    return e_x / e_x.sum()
for data in pdls:
    # sine
    start_time = time.time()
    data['sin'] = np.sin(data[numerical_name[0]])
    pandas_spend_times.append(time.time() - start_time)
    # cos
    start_time = time.time()
    data['cos'] = np.cos(data[numerical_name[0]])
    pandas_spend_times.append(time.time() - start_time)
    # softmax
    start_time = time.time()
    data['softmax'] = softmax(data[numerical_name[0]].values)
    pandas_spend_times.append(time.time() - start_time)

In [24]:
### polars
polars_spend_times = []
def softmax(x):
    e_x = np.exp(x - np.max(x))  # 对输入值进行缩放以避免数值不稳定性
    return e_x / e_x.sum()
for data in plls:
    # sine
    start_time = time.time()
    temp = data.with_columns([
        pl.col(numerical_name[0]).sin().alias('sine')
    ])
    polars_spend_times.append(time.time() - start_time)
    # cos
    start_time = time.time()
    temp = data.with_columns([
        pl.col(numerical_name[0]).sin().alias('cos')
    ])
    polars_spend_times.append(time.time() - start_time)
    # softmax
    start_time = time.time()
    temp = data.with_columns([
        pl.Series(softmax(data[numerical_name[0]].to_numpy())).alias("softmax_values")
    ])
    polars_spend_times.append(time.time() - start_time)

In [25]:
op_name = ['Sine', 'Cosine', 'Softmax']
for i, (a, b) in enumerate(zip(pandas_spend_times, polars_spend_times)):
    print(f"{size[i//3]} Samples", op_name[i % len(op_name)], " Execution time difference:\t", a - b)

10000 Samples Sine  Execution time difference:	 -0.016792774200439453
10000 Samples Cosine  Execution time difference:	 0.0
10000 Samples Softmax  Execution time difference:	 -0.0010035037994384766
10000 Samples Sine  Execution time difference:	 0.003069639205932617
10000 Samples Cosine  Execution time difference:	 -0.0009696483612060547
10000 Samples Softmax  Execution time difference:	 0.0010933876037597656
10000 Samples Sine  Execution time difference:	 0.00511932373046875
10000 Samples Cosine  Execution time difference:	 0.006043195724487305
1000000 Samples Softmax  Execution time difference:	 0.009743690490722656
1000000 Samples Sine  Execution time difference:	 0.008233070373535156
1000000 Samples Cosine  Execution time difference:	 -0.0029811859130859375
1000000 Samples Softmax  Execution time difference:	 0.007904767990112305
1000000 Samples Sine  Execution time difference:	 0.04054903984069824
1000000 Samples Cosine  Execution time difference:	 -0.08980989456176758
1000000 Sam

#### Polars is not better than pandas, that is interesting, but the running time is accepetable

## **Double Feature Transformer**
'add', 'divide', 'multiply', 'subtract', 'bigger', 'smaller', 'equal', 'minimize', 'maximize', 'tcorvariance', 'tcorrelation', 'aggregate', 'crosscount', 'nunique'

**Before testing, We feel the speed of operators that can be directly calculated will be very close. Let`s check and see the result of aggregate and rolling.**

#### **7.'add', 'divide', 'multiply', 'subtract', 'bigger', 'smaller', 'equal', 'minimize', 'maximize'**

In [27]:
### pandas
pandas_spend_times = []
for data in pdls:
    # add
    start_time = time.time()
    data['add'] = data[numerical_name[0]] + data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # divide
    start_time = time.time()
    data['divide'] = data[numerical_name[0]] / data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # multiply
    start_time = time.time()
    data['multiply'] = data[numerical_name[0]] * data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # subtract
    start_time = time.time()
    data['subtract'] = data[numerical_name[0]] - data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # bigger
    start_time = time.time()
    data['bigger'] = data[numerical_name[0]] > data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # smaller
    start_time = time.time()
    data['smaller'] = data[numerical_name[0]] < data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # equal
    start_time = time.time()
    data['equal'] = data[numerical_name[0]] == data[numerical_name[1]]
    pandas_spend_times.append(time.time() - start_time)
    # minimize
    start_time = time.time()
    data['minimize'] = np.min([data[numerical_name[0]], data[numerical_name[1]]], axis=0)
    pandas_spend_times.append(time.time() - start_time)
    # maximize
    start_time = time.time()
    data['maximize'] = np.max([data[numerical_name[0]], data[numerical_name[1]]], axis=0)
    pandas_spend_times.append(time.time() - start_time)

In [32]:
### polars
polars_spend_times = []
for data in plls:
    # add
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) + pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # divide
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) / pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # multiply
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) * pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # subtract
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) - pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # bigger
    start_time = time.time()
    dtemp = data.with_columns([
        (pl.col(numerical_name[0]) > pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # smaller
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) < pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # equal
    start_time = time.time()
    temp = data.with_columns([
        (pl.col(numerical_name[0]) == pl.col(numerical_name[1])).alias('add')
    ])
    polars_spend_times.append(time.time() - start_time)
    # minimize
    start_time = time.time()
    temp = data.with_columns([
        pl.when(pl.col(numerical_name[0]) < pl.col(numerical_name[1])).then(pl.col(numerical_name[0]))
        .otherwise(pl.col(numerical_name[1]))
        .alias("smller_value")
    ])
    polars_spend_times.append(time.time() - start_time)
    # maximize
    start_time = time.time()
    temp = data.with_columns([
        pl.when(pl.col(numerical_name[0]) > pl.col(numerical_name[1])).then(pl.col(numerical_name[0]))
        .otherwise(pl.col(numerical_name[1]))
        .alias("greater_value")
    ])
    polars_spend_times.append(time.time() - start_time)

In [36]:
op_name = ['add', 'divide', 'multiply', 'subtract', 'bigger', 'smaller', 'equal', 'minimize', 'maximize']
for i, (a, b) in enumerate(zip(pandas_spend_times, polars_spend_times)):
    print(f"{size[i//9]} Samples", op_name[i % len(op_name)], " Execution time difference:\t", a - b)

10000 Samples add  Execution time difference:	 -0.0010025501251220703
10000 Samples divide  Execution time difference:	 0.0010051727294921875
10000 Samples multiply  Execution time difference:	 0.0
10000 Samples subtract  Execution time difference:	 0.0
10000 Samples bigger  Execution time difference:	 0.0
10000 Samples smaller  Execution time difference:	 0.0
10000 Samples equal  Execution time difference:	 0.0
10000 Samples minimize  Execution time difference:	 -0.009007453918457031
10000 Samples maximize  Execution time difference:	 0.0
1000000 Samples add  Execution time difference:	 -6.9141387939453125e-06
1000000 Samples divide  Execution time difference:	 -0.0002696514129638672
1000000 Samples multiply  Execution time difference:	 0.002007007598876953
1000000 Samples subtract  Execution time difference:	 0.001003265380859375
1000000 Samples bigger  Execution time difference:	 0.0010037422180175781
1000000 Samples smaller  Execution time difference:	 0.001003265380859375
1000000 

#### **8.'tcorvariance', 'tcorrelation'**

In [45]:
for data in pdls:
    # tcorvariance
    start_time = time.time()
    data['tcorvariance'] = data[numerical_name[0]].rolling(10).corr(data[numerical_name[1]])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0010640621185302734
1000000 Sample Execution time： 0.08747148513793945
5000000 Sample Execution time： 0.40174102783203125
10000000 Sample Execution time： 0.8168542385101318
50000000 Sample Execution time： 4.162893295288086


In [47]:
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        pl.rolling_corr(numerical_name[0], numerical_name[1], window_size=10).alias("tcorvariance")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0298616886138916
1000000 Sample Execution time： 0.03975105285644531
5000000 Sample Execution time： 0.20784521102905273
10000000 Sample Execution time： 0.40465521812438965
50000000 Sample Execution time： 2.06021785736084


In [51]:
for data in pdls:
    # tcorvariance
    start_time = time.time()
    data['tcorvariance'] = data[numerical_name[0]].rolling(10).cov(data[numerical_name[1]])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0010018348693847656
1000000 Sample Execution time： 0.0770108699798584
5000000 Sample Execution time： 0.270263671875
10000000 Sample Execution time： 0.5352280139923096
50000000 Sample Execution time： 2.7887940406799316


In [52]:
for data in plls:
    start_time = time.time()
    temp = data.with_columns([
        pl.rolling_cov(numerical_name[0], numerical_name[1], window_size=10).alias("tcorvariance")
    ])
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.0035583972930908203
1000000 Sample Execution time： 0.026164770126342773
5000000 Sample Execution time： 0.19764947891235352
10000000 Sample Execution time： 0.3112671375274658
50000000 Sample Execution time： 1.4020442962646484


#### **9.'aggregate', 'crosscount', 'nunique'**

In [62]:
# pandas
for data in pdls:
    # aggmean
    start_time = time.time()
    temp = data.groupby([category_name[0]])[numerical_name[0]].agg('sum').rename('aggmean')
    _ = data.merge(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.39626193046569824
1000000 Sample Execution time： 0.09432721138000488
5000000 Sample Execution time： 0.48881030082702637
10000000 Sample Execution time： 0.9089641571044922
50000000 Sample Execution time： 5.315707683563232


In [63]:
# polars
for data in plls:
    start_time = time.time()
    temp = data.group_by(category_name[0]).agg(pl.sum(numerical_name[0]).alias('aggmean'))
    result = data.join(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.027961015701293945
1000000 Sample Execution time： 0.0075075626373291016
5000000 Sample Execution time： 0.026412487030029297
10000000 Sample Execution time： 0.056580305099487305
50000000 Sample Execution time： 0.35039710998535156


In [64]:
# pandas
for data in pdls:
    # aggmean
    start_time = time.time()
    temp = data.groupby([category_name[0]])[numerical_name[0]].nunique().rename('nunique')
    _ = data.merge(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.014368057250976562
1000000 Sample Execution time： 0.1665959358215332
5000000 Sample Execution time： 1.256286859512329
10000000 Sample Execution time： 2.826300859451294
50000000 Sample Execution time： 15.91285490989685


In [66]:
# polars
for data in plls:
    start_time = time.time()
    temp = data.group_by(category_name[0]).agg(pl.col(numerical_name[0]).n_unique().alias('nunique'))
    result = data.join(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.018143177032470703
1000000 Sample Execution time： 0.01438140869140625
5000000 Sample Execution time： 0.07217621803283691
10000000 Sample Execution time： 0.1534109115600586
50000000 Sample Execution time： 0.6692502498626709


In [67]:
# pandas
for data in pdls:
    # aggmean
    start_time = time.time()
    temp = data.groupby([category_name[0]])[numerical_name[0]].count().rename('nunique')
    _ = data.merge(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.33628177642822266
1000000 Sample Execution time： 0.10019993782043457
5000000 Sample Execution time： 0.47657275199890137
10000000 Sample Execution time： 1.0682213306427002
50000000 Sample Execution time： 5.361292362213135


In [68]:
# polars
for data in plls:
    start_time = time.time()
    temp = data.group_by(category_name[0]).agg(pl.col(numerical_name[0]).count().alias('nunique'))
    result = data.join(temp, on=category_name[0], how='left')
    print(f"{len(data)} Sample Execution time：", time.time() - start_time)

10000 Sample Execution time： 0.051223039627075195
1000000 Sample Execution time： 0.008140087127685547
5000000 Sample Execution time： 0.03714179992675781
10000000 Sample Execution time： 0.10254335403442383
50000000 Sample Execution time： 0.4471867084503174


### **Just like we say before, the speed of operators that can be directly calculated is very close. For aggregate operators, polars is better.**

# Conclusion

- **Memory**: polars uses much less memory than pandas. If you have the memory problem, just use polars.
- **Speed**: In fact, the operation speed of pandas and polars is similar, but polars does not have a ready-made interface for custom functions of rolling, and the speed is unacceptable. Thanks to numba, pandas' custom rolling functions are very fast. For aggregate, polars does not find a 'map' or 'apply' function to use custom functions. For other operations, polars is slightly faster than pandas overall.
- **Conclusion**: You can use polars and Use pandas+numba for custom function. Except for custom conversion functions, using polars is the best solution. 