## **Polars vs. Pandas: A Performance Comparison**

In [1]:
import pandas as pd
import polars as pl
import time
from scipy import stats

### Information about the dataset  

In [56]:
df =  pd.read_csv("customers-100000.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Index              100000 non-null  int64 
 1   Customer Id        100000 non-null  object
 2   First Name         100000 non-null  object
 3   Last Name          100000 non-null  object
 4   Company            100000 non-null  object
 5   City               100000 non-null  object
 6   Country            100000 non-null  object
 7   Phone 1            100000 non-null  object
 8   Phone 2            100000 non-null  object
 9   Email              100000 non-null  object
 10  Subscription Date  100000 non-null  object
 11  Website            100000 non-null  object
dtypes: int64(1), object(11)
memory usage: 9.2+ MB


### **Reading Dataset**

### Measure and Store Execution Times for Pandas Read Dataset using 20 iterations

In [3]:
pandas_read_times = []

for _ in range(20):
    start_time = time.time()
    pd.read_csv("customers-100000.csv")
    end_time = time.time()
    pandas_read_times.append(end_time - start_time)

In [4]:
print(pandas_read_times)

[0.3856484889984131, 0.31491827964782715, 0.3032500743865967, 0.3113117218017578, 0.2687714099884033, 0.3482484817504883, 0.3178408145904541, 0.2963731288909912, 0.28725433349609375, 0.28554701805114746, 0.30215001106262207, 0.2966485023498535, 0.31543540954589844, 0.31882381439208984, 0.29779982566833496, 0.2851996421813965, 0.29770469665527344, 0.2870197296142578, 0.2797548770904541, 0.30231666564941406]


### Measure and Store Execution Times for Polars Read Dataset using 20 iterations

In [5]:
polars_read_times = []

for _ in range(20):
    start_time = time.time()
    pl.read_csv("customers-100000.csv")
    end_time = time.time()
    polars_read_times.append(end_time - start_time)

In [6]:
print(polars_read_times)

[0.1504812240600586, 0.0177152156829834, 0.013877391815185547, 0.013391733169555664, 0.0023415088653564453, 0.015124320983886719, 0.017832517623901367, 0.0009720325469970703, 0.020899534225463867, 0.010874032974243164, 0.011015653610229492, 0.005217790603637695, 0.016530513763427734, 0.010032415390014648, 0.012157678604125977, 0.010001659393310547, 0.0, 0.019686222076416016, 0.002538442611694336, 0.01967620849609375]


In [60]:
from tabulate import tabulate

read_data_df = pd.DataFrame({
    "Pandas Read Time(s)": pandas_read_times,
    "Polars Read Time(s)": polars_read_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(read_data_df)

+-----------------------+-----------------------+
|  Pandas Read Time(s)  |  Polars Read Time(s)  |
|-----------------------+-----------------------|
|    0.385648488998     |    0.150481224060     |
|    0.314918279648     |    0.017715215683     |
|    0.303250074387     |    0.013877391815     |
|    0.311311721802     |    0.013391733170     |
|    0.268771409988     |    0.002341508865     |
|    0.348248481750     |    0.015124320984     |
|    0.317840814590     |    0.017832517624     |
|    0.296373128891     |    0.000972032547     |
|    0.287254333496     |    0.020899534225     |
|    0.285547018051     |    0.010874032974     |
|    0.302150011063     |    0.011015653610     |
|    0.296648502350     |    0.005217790604     |
|    0.315435409546     |    0.016530513763     |
|    0.318823814392     |    0.010032415390     |
|    0.297799825668     |    0.012157678604     |
|    0.285199642181     |    0.010001659393     |
|    0.297704696655     |    0.000000000000     |


In [66]:
pandas_mean = read_data_df["Pandas Read Time(s)"].mean()
pandas_std = read_data_df["Pandas Read Time(s)"].std()
pandas_variance = read_data_df["Pandas Read Time(s)"].var()

polars_mean = read_data_df["Polars Read Time(s)"].mean()
polars_std = read_data_df["Polars Read Time(s)"].std()
polars_variance = read_data_df["Polars Read Time(s)"].var()

stat_summary = pd.DataFrame({
    "Mean (s)": [pandas_mean, polars_mean],
    "Standard Deviation (s)": [pandas_std,polars_std],
    "Variance (s)": [pandas_variance, polars_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.305100846291 |      0.025755547767      | 0.000663348241 |
| Polars | 0.018518304825 |      0.031731991224      | 0.001006919267 |
+--------+----------------+--------------------------+----------------+


### Performing T-Test on Reading times

In [7]:
t_stat_read , p_val_read = stats.ttest_ind(polars_read_times, pandas_read_times, alternative='less')

print("T-Test for Read Times")
print("t-statistic: ", t_stat_read)
print("p-value: ", p_val_read)

T-Test for Read Times
t-statistic:  -31.359686412656586
p-value:  4.538378429396734e-29


### **Write Data to CSV file**

In [67]:
pandas_df = pd.read_csv("customers-100000.csv")

In [68]:
polars_df = pl.read_csv("customers-100000.csv")

### Measure and Store Execution Times for Pandas Write Dataset to CSV file using 20 iterations

In [77]:
pandas_write_to_csv_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df.to_csv('pandas_output.csv',index=False)
    end_time = time.time()
    pandas_write_to_csv_times.append(end_time - start_time)

In [78]:
print(pandas_write_to_csv_times)

[0.432431697845459, 0.45446276664733887, 0.4810812473297119, 0.47162389755249023, 0.45369625091552734, 0.459505558013916, 0.5128598213195801, 0.445955753326416, 0.45746946334838867, 0.48566651344299316, 0.4668152332305908, 0.4652435779571533, 0.49457216262817383, 0.597353458404541, 0.6302549839019775, 0.6551485061645508, 0.6008255481719971, 0.5718770027160645, 0.4561188220977783, 0.45444703102111816]


### Measure and Store Execution Times for Polars Write Dataset to CSV file using 20 iterations

In [79]:
polars_write_to_csv_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.write_csv('polars_output.csv', separator=",")
    end_time = time.time()
    polars_write_to_csv_times.append(end_time - start_time)

In [80]:
print(polars_write_to_csv_times)

[0.03309798240661621, 0.028300762176513672, 0.02667069435119629, 0.024573326110839844, 0.024127960205078125, 0.02414536476135254, 0.024581193923950195, 0.03577733039855957, 0.026615381240844727, 0.0256803035736084, 0.017366409301757812, 0.030039310455322266, 0.023212432861328125, 0.0302274227142334, 0.034825801849365234, 0.028035879135131836, 0.02636098861694336, 0.02729058265686035, 0.03333640098571777, 0.02897787094116211]


In [83]:
write_data_to_csv_df = pd.DataFrame({
    "Pandas Write to CSV Time(s)": pandas_write_to_csv_times,
    "Polars Write to CSV Time(s)": polars_write_to_csv_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(write_data_to_csv_df)

+-------------------------------+-------------------------------+
|  Pandas Write to CSV Time(s)  |  Polars Write to CSV Time(s)  |
|-------------------------------+-------------------------------|
|        0.432431697845         |        0.033097982407         |
|        0.454462766647         |        0.028300762177         |
|        0.481081247330         |        0.026670694351         |
|        0.471623897552         |        0.024573326111         |
|        0.453696250916         |        0.024127960205         |
|        0.459505558014         |        0.024145364761         |
|        0.512859821320         |        0.024581193924         |
|        0.445955753326         |        0.035777330399         |
|        0.457469463348         |        0.026615381241         |
|        0.485666513443         |        0.025680303574         |
|        0.466815233231         |        0.017366409302         |
|        0.465243577957         |        0.030039310455         |
|        0

In [86]:
pandas_write_to_csv_mean = write_data_to_csv_df["Pandas Write to CSV Time(s)"].mean()
pandas_write_to_csv_std = write_data_to_csv_df["Pandas Write to CSV Time(s)"].std()
pandas_write_to_csv_variance = write_data_to_csv_df["Pandas Write to CSV Time(s)"].var()

polars_write_to_csv_mean = write_data_to_csv_df["Polars Write to CSV Time(s)"].mean()
polars_write_to_csv_std = write_data_to_csv_df["Polars Write to CSV Time(s)"].std()
polars_write_to_csv_variance = write_data_to_csv_df["Polars Write to CSV Time(s)"].var()

stat_summary = pd.DataFrame({
    "Mean (s)": [pandas_write_to_csv_mean, polars_write_to_csv_mean],
    "Standard Deviation (s)": [pandas_write_to_csv_std,polars_write_to_csv_std],
    "Variance (s)": [pandas_write_to_csv_variance, polars_write_to_csv_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.502370464802 |      0.068349213039      | 0.004671614923 |
| Polars | 0.027662169933 |      0.004418625420      | 0.000019524251 |
+--------+----------------+--------------------------+----------------+


### Performing T-Test on Write to CSV times

In [87]:
t_stat_write_to_csv , p_val_write_to_csv = stats.ttest_ind(polars_write_to_csv_times, pandas_write_to_csv_times, alternative='less')

print("T-Test for Write Data to CSV file Times")
print("t-statistic: ", t_stat_write_to_csv)
print("p-value: ", p_val_write_to_csv)

T-Test for Write Data to CSV file Times
t-statistic:  -30.995786411460706
p-value:  6.957580967944935e-29


### **Write Data to Parquet file**

### Measure and Store Execution Times for Pandas Write Dataset to Parquet file using 20 iterations

In [90]:
pandas_write_to_parquet_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df.to_parquet('pandas_output.parquet')
    end_time = time.time()
    pandas_write_to_parquet_times.append(end_time - start_time)

In [91]:
print(pandas_write_to_parquet_times)

[0.32501959800720215, 0.3165011405944824, 0.3393990993499756, 0.3629286289215088, 0.31961822509765625, 0.30922818183898926, 0.3326303958892822, 0.30814290046691895, 0.334226131439209, 0.3284883499145508, 0.29613208770751953, 0.29943346977233887, 0.3098869323730469, 0.30683064460754395, 0.3180882930755615, 0.3166513442993164, 0.2978370189666748, 0.30652928352355957, 0.3046247959136963, 0.3073997497558594]


### Measure and Store Execution Times for Polars Write Dataset to Parquet file using 20 iterations

In [92]:
polars_write_to_parquet_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.write_parquet('polars_output.parquet')
    end_time = time.time()
    polars_write_to_parquet_times.append(end_time - start_time)

In [93]:
print(polars_write_to_parquet_times)

[0.5004165172576904, 0.3854656219482422, 0.4469780921936035, 0.20106029510498047, 0.26842761039733887, 0.22554302215576172, 0.20602035522460938, 0.21454930305480957, 0.23242402076721191, 0.25059008598327637, 0.24349188804626465, 0.2320857048034668, 0.24013233184814453, 0.3035876750946045, 0.24606537818908691, 0.22883176803588867, 0.22032880783081055, 0.2376854419708252, 0.2939438819885254, 0.27007532119750977]


In [94]:
write_data_to_parquet_df = pd.DataFrame({
    "Pandas Write to Parquet Time(s)": pandas_write_to_parquet_times,
    "Polars Write to Parquet Time(s)": polars_write_to_parquet_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(write_data_to_parquet_df)

+-----------------------------------+-----------------------------------+
|  Pandas Write to Parquet Time(s)  |  Polars Write to Parquet Time(s)  |
|-----------------------------------+-----------------------------------|
|          0.325019598007           |          0.500416517258           |
|          0.316501140594           |          0.385465621948           |
|          0.339399099350           |          0.446978092194           |
|          0.362928628922           |          0.201060295105           |
|          0.319618225098           |          0.268427610397           |
|          0.309228181839           |          0.225543022156           |
|          0.332630395889           |          0.206020355225           |
|          0.308142900467           |          0.214549303055           |
|          0.334226131439           |          0.232424020767           |
|          0.328488349915           |          0.250590085983           |
|          0.296132087708           | 

In [95]:
pandas_write_to_parquet_mean = write_data_to_parquet_df["Pandas Write to Parquet Time(s)"].mean()
pandas_write_to_parquet_std = write_data_to_parquet_df["Pandas Write to Parquet Time(s)"].std()
pandas_write_to_parquet_variance = write_data_to_parquet_df["Pandas Write to Parquet Time(s)"].var()

polars_write_to_parquet_mean = write_data_to_parquet_df["Polars Write to Parquet Time(s)"].mean()
polars_write_to_parquet_std = write_data_to_parquet_df["Polars Write to Parquet Time(s)"].std()
polars_write_to_parquet_variance = write_data_to_parquet_df["Polars Write to Parquet Time(s)"].var()

stat_summary_write_parquet = pd.DataFrame({
    "Mean (s)": [pandas_write_to_parquet_mean, polars_write_to_parquet_mean],
    "Standard Deviation (s)": [pandas_write_to_parquet_std,polars_write_to_parquet_std],
    "Variance (s)": [pandas_write_to_parquet_variance, polars_write_to_parquet_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_write_parquet)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.316979813576 |      0.016409216349      | 0.000269262381 |
| Polars | 0.272385156155 |      0.080731147319      | 0.006517518148 |
+--------+----------------+--------------------------+----------------+


### Performing T-Test on Write to Parquet times

In [97]:
t_stat_write_to_parquet , p_val_write_to_parquet = stats.ttest_ind(polars_write_to_parquet_times, pandas_write_to_parquet_times, alternative='less')

print("T-Test for Write Data to Parquet file Times")
print("t-statistic: ", t_stat_write_to_parquet)
print("p-value: ", p_val_write_to_parquet)

T-Test for Write Data to Parquet file Times
t-statistic:  -2.420839111310295
p-value:  0.010185307033551751


### **Write Data to JSON file**

### Measure and Store Execution Times for Pandas Write Dataset to JSON file using 20 iterations

In [99]:
pandas_write_to_json_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df.to_json('pandas_output.json', orient="index")
    end_time = time.time()
    pandas_write_to_json_times.append(end_time - start_time)

In [100]:
print(pandas_write_to_json_times)

[0.23279547691345215, 0.2716064453125, 0.244675874710083, 0.2401883602142334, 0.24387836456298828, 0.2461385726928711, 0.25565266609191895, 0.25196051597595215, 0.227736234664917, 0.25084376335144043, 0.2649712562561035, 0.2676582336425781, 0.24881649017333984, 0.2500128746032715, 0.24962782859802246, 0.266965389251709, 0.2648475170135498, 0.25253725051879883, 0.2664835453033447, 0.25122785568237305]


### Measure and Store Execution Times for Polars Write Dataset to JSON file using 20 iterations

In [101]:
polars_write_to_json_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.write_json('polars_output.json')
    end_time = time.time()
    polars_write_to_json_times.append(end_time - start_time)

In [102]:
print(polars_write_to_json_times)

[0.057068824768066406, 0.06760048866271973, 0.06700563430786133, 0.04719877243041992, 0.07580709457397461, 0.05890679359436035, 0.0621793270111084, 0.05191373825073242, 0.06731724739074707, 0.059154510498046875, 0.06137394905090332, 0.06372404098510742, 0.0751657485961914, 0.05871844291687012, 0.06691694259643555, 0.049376726150512695, 0.06612825393676758, 0.06907081604003906, 0.06366753578186035, 0.0522921085357666]


In [103]:
write_data_to_json_df = pd.DataFrame({
    "Pandas Write to Json Time(s)": pandas_write_to_json_times,
    "Polars Write to Json Time(s)": polars_write_to_json_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(write_data_to_json_df)

+--------------------------------+--------------------------------+
|  Pandas Write to Json Time(s)  |  Polars Write to Json Time(s)  |
|--------------------------------+--------------------------------|
|         0.232795476913         |         0.057068824768         |
|         0.271606445312         |         0.067600488663         |
|         0.244675874710         |         0.067005634308         |
|         0.240188360214         |         0.047198772430         |
|         0.243878364563         |         0.075807094574         |
|         0.246138572693         |         0.058906793594         |
|         0.255652666092         |         0.062179327011         |
|         0.251960515976         |         0.051913738251         |
|         0.227736234665         |         0.067317247391         |
|         0.250843763351         |         0.059154510498         |
|         0.264971256256         |         0.061373949051         |
|         0.267658233643         |         0.063

In [104]:
pandas_write_to_json_mean = write_data_to_json_df["Pandas Write to Json Time(s)"].mean()
pandas_write_to_json_std = write_data_to_json_df["Pandas Write to Json Time(s)"].std()
pandas_write_to_json_variance = write_data_to_json_df["Pandas Write to Json Time(s)"].var()

polars_write_to_json_mean = write_data_to_json_df["Polars Write to Json Time(s)"].mean()
polars_write_to_json_std = write_data_to_json_df["Polars Write to Json Time(s)"].std()
polars_write_to_json_variance = write_data_to_json_df["Polars Write to Json Time(s)"].var()

stat_summary_write_json = pd.DataFrame({
    "Mean (s)": [pandas_write_to_json_mean, polars_write_to_json_mean],
    "Standard Deviation (s)": [pandas_write_to_json_std,polars_write_to_json_std],
    "Variance (s)": [pandas_write_to_json_variance, polars_write_to_json_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_write_json)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.252431225777 |      0.011874822584      | 0.000141011411 |
| Polars | 0.062029349804 |      0.007857922342      | 0.000061746944 |
+--------+----------------+--------------------------+----------------+


### Performing T-Test on Write to Json times

In [105]:
t_stat_write_to_json , p_val_write_to_json = stats.ttest_ind(polars_write_to_json_times, pandas_write_to_json_times, alternative='less')

print("T-Test for Write Data to JSON file Times")
print("t-statistic: ", t_stat_write_to_json)
print("p-value: ", p_val_write_to_json)

T-Test for Write Data to JSON file Times
t-statistic:  -59.79940205138711
p-value:  1.6769762280264946e-39


### **Write Data to Excel file**

### Measure and Store Execution Times for Pandas Write Dataset to Excel file using 20 iterations

In [106]:
pandas_write_to_excel_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df.to_excel('pandas_output.xlsx')
    end_time = time.time()
    pandas_write_to_excel_times.append(end_time - start_time)

In [107]:
print(pandas_write_to_excel_times)

[22.542454719543457, 21.749833583831787, 21.291157245635986, 21.776752948760986, 21.85891103744507, 21.829519033432007, 22.24268126487732, 21.405731201171875, 21.525912046432495, 21.692293882369995, 22.00721001625061, 25.737661361694336, 23.556265592575073, 21.92091155052185, 22.52869486808777, 21.742626428604126, 21.847245454788208, 21.774834394454956, 21.713831663131714, 21.470564126968384]


### Measure and Store Execution Times for Polars Write Dataset to Excel file using 20 iterations

In [109]:
polars_write_to_excel_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.write_excel('polars_output.xlsx')
    end_time = time.time()
    polars_write_to_excel_times.append(end_time - start_time)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [110]:
print(polars_write_to_excel_times)

[14.78085446357727, 15.382524490356445, 14.228124618530273, 13.129313707351685, 13.31192398071289, 13.428699970245361, 13.446179866790771, 13.67948317527771, 13.497566938400269, 13.076212644577026, 13.429007768630981, 13.27914547920227, 13.245184898376465, 13.37810468673706, 13.288607835769653, 14.28550386428833, 27.444095611572266, 21.740318775177002, 13.31204628944397, 13.095399856567383]


In [111]:
write_data_to_excel_df = pd.DataFrame({
    "Pandas Write to Excel Time(s)": pandas_write_to_excel_times,
    "Polars Write to Excel Time(s)": polars_write_to_excel_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(write_data_to_excel_df)

+---------------------------------+---------------------------------+
|  Pandas Write to Excel Time(s)  |  Polars Write to Excel Time(s)  |
|---------------------------------+---------------------------------|
|         22.542454719543         |         14.780854463577         |
|         21.749833583832         |         15.382524490356         |
|         21.291157245636         |         14.228124618530         |
|         21.776752948761         |         13.129313707352         |
|         21.858911037445         |         13.311923980713         |
|         21.829519033432         |         13.428699970245         |
|         22.242681264877         |         13.446179866791         |
|         21.405731201172         |         13.679483175278         |
|         21.525912046432         |         13.497566938400         |
|         21.692293882370         |         13.076212644577         |
|         22.007210016251         |         13.429007768631         |
|         25.7376613

In [112]:
pandas_write_to_excel_mean = write_data_to_excel_df["Pandas Write to Excel Time(s)"].mean()
pandas_write_to_excel_std = write_data_to_excel_df["Pandas Write to Excel Time(s)"].std()
pandas_write_to_excel_variance = write_data_to_excel_df["Pandas Write to Excel Time(s)"].var()

polars_write_to_excel_mean = write_data_to_excel_df["Polars Write to Excel Time(s)"].mean()
polars_write_to_excel_std = write_data_to_excel_df["Polars Write to Excel Time(s)"].std()
polars_write_to_excel_variance = write_data_to_excel_df["Polars Write to Excel Time(s)"].var()

stat_summary_write_excel = pd.DataFrame({
    "Mean (s)": [pandas_write_to_excel_mean, polars_write_to_excel_mean],
    "Standard Deviation (s)": [pandas_write_to_excel_std,polars_write_to_excel_std],
    "Variance (s)": [pandas_write_to_excel_variance, polars_write_to_excel_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_write_excel)

+--------+-----------------+--------------------------+-----------------+
|        |    Mean (s)     |  Standard Deviation (s)  |  Variance (s)   |
|--------+-----------------+--------------------------+-----------------|
| Pandas | 22.110754621029 |      0.990072762602      | 0.980244075246  |
| Polars | 14.722914946079 |      3.550624870426      | 12.606936970486 |
+--------+-----------------+--------------------------+-----------------+


### Performing T-Test on Write to Excel times

In [113]:
t_stat_write_to_excel , p_val_write_to_excel = stats.ttest_ind(polars_write_to_excel_times, pandas_write_to_excel_times, alternative='less')

print("T-Test for Write Data to Excel file Times")
print("t-statistic: ", t_stat_write_to_excel)
print("p-value: ", p_val_write_to_excel)

T-Test for Write Data to Excel file Times
t-statistic:  -8.963296830140994
p-value:  3.262885994705023e-11


### **Read Data from Parquet file**

### Measure and Store Execution Times for Pandas Read Dataset from Parquet file using 20 iterations

In [114]:
pandas_read_parquet_times = []

for _ in range(20):
    start_time = time.time()
    pd.read_parquet('pandas_output.parquet')
    end_time = time.time()
    pandas_read_parquet_times.append(end_time - start_time)

In [115]:
print(pandas_read_parquet_times)

[0.5599982738494873, 0.25945329666137695, 0.2505967617034912, 0.2910025119781494, 0.26952672004699707, 0.24594402313232422, 0.2548079490661621, 0.26158666610717773, 0.23918628692626953, 0.24702787399291992, 0.2403092384338379, 0.24022936820983887, 0.24710822105407715, 0.23569893836975098, 0.24969244003295898, 0.27739977836608887, 0.25595951080322266, 0.29984617233276367, 0.2611579895019531, 0.23891305923461914]


### Measure and Store Execution Times for Polars Read Dataset from Parquet file using 20 iterations

In [116]:
polars_read_parquet_times = []

for _ in range(20):
    start_time = time.time()
    pl.read_parquet('polars_output.parquet')
    end_time = time.time()
    polars_read_parquet_times.append(end_time - start_time)

In [117]:
print(polars_read_parquet_times)

[0.09163355827331543, 0.01964426040649414, 0.016509294509887695, 0.014467477798461914, 0.01964735984802246, 0.01744556427001953, 0.020291566848754883, 0.017819643020629883, 0.027399063110351562, 0.01838374137878418, 0.014432430267333984, 0.01560211181640625, 0.018390178680419922, 0.021684885025024414, 0.018649578094482422, 0.014604568481445312, 0.015662670135498047, 0.008955240249633789, 0.015677690505981445, 0.0160675048828125]


In [118]:
read_parquet_df = pd.DataFrame({
    "Pandas Read Parquet Time(s)": pandas_read_parquet_times,
    "Polars Read Parquet Time(s)": polars_read_parquet_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(read_parquet_df)

+-------------------------------+-------------------------------+
|  Pandas Read Parquet Time(s)  |  Polars Read Parquet Time(s)  |
|-------------------------------+-------------------------------|
|        0.559998273849         |        0.091633558273         |
|        0.259453296661         |        0.019644260406         |
|        0.250596761703         |        0.016509294510         |
|        0.291002511978         |        0.014467477798         |
|        0.269526720047         |        0.019647359848         |
|        0.245944023132         |        0.017445564270         |
|        0.254807949066         |        0.020291566849         |
|        0.261586666107         |        0.017819643021         |
|        0.239186286926         |        0.027399063110         |
|        0.247027873993         |        0.018383741379         |
|        0.240309238434         |        0.014432430267         |
|        0.240229368210         |        0.015602111816         |
|        0

In [119]:
pandas_read_parquet_mean = read_parquet_df["Pandas Read Parquet Time(s)"].mean()
pandas_read_parquet_std = read_parquet_df["Pandas Read Parquet Time(s)"].std()
pandas_read_parquet_variance = read_parquet_df["Pandas Read Parquet Time(s)"].var()

polars_read_parquet_mean = read_parquet_df["Polars Read Parquet Time(s)"].mean()
polars_read_parquet_std = read_parquet_df["Polars Read Parquet Time(s)"].std()
polars_read_parquet_variance = read_parquet_df["Polars Read Parquet Time(s)"].var()

stat_summary_read_parquet = pd.DataFrame({
    "Mean (s)": [pandas_read_parquet_mean, polars_read_parquet_mean],
    "Standard Deviation (s)": [pandas_read_parquet_std,polars_read_parquet_std],
    "Variance (s)": [pandas_read_parquet_variance, polars_read_parquet_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_read_parquet)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.271272253990 |      0.070120663202      | 0.004916907408 |
| Polars | 0.021148419380 |      0.016980494320      | 0.000288337187 |
+--------+----------------+--------------------------+----------------+


### Performing T-Test on Read from Parquet times

In [121]:
t_stat_read_parquet , p_val_read_parquet = stats.ttest_ind(polars_read_parquet_times, pandas_read_parquet_times, alternative='less')

print("T-Test for Read Data from Parquet file Times")
print("t-statistic: ", t_stat_read_parquet)
print("p-value: ", p_val_read_parquet)

T-Test for Read Data from Parquet file Times
t-statistic:  -15.504205105882303
p-value:  2.5473297340829475e-18


### **Read Data from JSON file**

### Measure and Store Execution Times for Pandas Read Dataset from Json file using 20 iterations

In [122]:
pandas_read_json_times = []

for _ in range(20):
    start_time = time.time()
    pd.read_json('pandas_output.json')
    end_time = time.time()
    pandas_read_json_times.append(end_time - start_time)

In [123]:
print(pandas_read_json_times)

[22.186375617980957, 20.41320490837097, 21.97781538963318, 20.77087664604187, 20.712918996810913, 49.439499378204346, 20.758979320526123, 13.668343782424927, 14.57879114151001, 13.116662502288818, 14.858015060424805, 13.062777280807495, 14.745757341384888, 13.478694915771484, 17.69547176361084, 13.326582670211792, 13.045320749282837, 13.445399761199951, 12.529601335525513, 12.949241399765015]


### Measure and Store Execution Times for Polars Read Dataset from Json file using 20 iterations

In [124]:
polars_read_json_times = []

for _ in range(20):
    start_time = time.time()
    pl.read_json('polars_output.json')
    end_time = time.time()
    polars_read_json_times.append(end_time - start_time)

In [125]:
print(polars_read_json_times)

[0.3717777729034424, 0.2878262996673584, 0.27316927909851074, 0.2879066467285156, 0.30022764205932617, 0.3028092384338379, 0.3138234615325928, 0.33548974990844727, 0.33128881454467773, 0.3323802947998047, 0.3548860549926758, 0.3443148136138916, 0.34447216987609863, 0.32320594787597656, 0.34910154342651367, 0.3377983570098877, 0.3288712501525879, 0.31801652908325195, 0.3641853332519531, 0.3194553852081299]


In [128]:
read_json_df = pd.DataFrame({
    "Pandas Read Json Time(s)": pandas_read_json_times,
    "Polars Read Json Time(s)": polars_read_json_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(read_json_df)

+----------------------------+----------------------------+
|  Pandas Read Json Time(s)  |  Polars Read Json Time(s)  |
|----------------------------+----------------------------|
|      22.186375617981       |       0.371777772903       |
|      20.413204908371       |       0.287826299667       |
|      21.977815389633       |       0.273169279099       |
|      20.770876646042       |       0.287906646729       |
|      20.712918996811       |       0.300227642059       |
|      49.439499378204       |       0.302809238434       |
|      20.758979320526       |       0.313823461533       |
|      13.668343782425       |       0.335489749908       |
|      14.578791141510       |       0.331288814545       |
|      13.116662502289       |       0.332380294800       |
|      14.858015060425       |       0.354886054993       |
|      13.062777280807       |       0.344314813614       |
|      14.745757341385       |       0.344472169876       |
|      13.478694915771       |       0.3

In [129]:
pandas_read_json_mean = read_json_df["Pandas Read Json Time(s)"].mean()
pandas_read_json_std = read_json_df["Pandas Read Json Time(s)"].std()
pandas_read_json_variance = read_json_df["Pandas Read Json Time(s)"].var()

polars_read_json_mean = read_json_df["Polars Read Json Time(s)"].mean()
polars_read_json_std = read_json_df["Polars Read Json Time(s)"].std()
polars_read_json_variance = read_json_df["Polars Read Json Time(s)"].var()

stat_summary_read_json = pd.DataFrame({
    "Mean (s)": [pandas_read_json_mean, polars_read_json_mean],
    "Standard Deviation (s)": [pandas_read_json_std,polars_read_json_std],
    "Variance (s)": [pandas_read_json_variance, polars_read_json_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_read_json)

+--------+-----------------+--------------------------+-----------------+
|        |    Mean (s)     |  Standard Deviation (s)  |  Variance (s)   |
|--------+-----------------+--------------------------+-----------------|
| Pandas | 17.838016498089 |      8.246201581275      | 67.999840519016 |
| Polars | 0.326050329208  |      0.026226424321      | 0.000687825333  |
+--------+-----------------+--------------------------+-----------------+


### Performing T-Test on Read from Json times

In [130]:
t_stat_read_json , p_val_read_json = stats.ttest_ind(polars_read_json_times, pandas_read_json_times, alternative='less')

print("T-Test for Read Data from Json file Times")
print("t-statistic: ", t_stat_read_json)
print("p-value: ", p_val_read_json)

T-Test for Read Data from Json file Times
t-statistic:  -9.497160200497358
p-value:  7.019549800660768e-12


### **Read Data from Excel file**

### Measure and Store Execution Times for Pandas Read Dataset from Excel file using 20 iterations

In [131]:
pandas_read_excel_times = []

for _ in range(20):
    start_time = time.time()
    pd.read_excel('pandas_output.xlsx')
    end_time = time.time()
    pandas_read_excel_times.append(end_time - start_time)

In [132]:
print(pandas_read_excel_times)

[31.789873361587524, 31.80777931213379, 31.27123475074768, 31.972824573516846, 31.68897032737732, 31.856446504592896, 31.748987436294556, 32.34937334060669, 31.51480007171631, 31.427732229232788, 33.065611600875854, 31.669410705566406, 32.156553745269775, 31.408061027526855, 31.88960909843445, 32.00014901161194, 31.261844635009766, 31.79026961326599, 31.3572039604187, 31.97360348701477]


### Measure and Store Execution Times for Polars Read Dataset from Excel file using 20 iterations

In [134]:
polars_read_excel_times = []

for _ in range(20):
    start_time = time.time()
    pl.read_excel('polars_output.xlsx')
    end_time = time.time()
    polars_read_excel_times.append(end_time - start_time)

In [135]:
print(polars_read_excel_times)

[2.878387689590454, 2.098186492919922, 2.0792691707611084, 2.0442306995391846, 2.0875420570373535, 2.0272626876831055, 2.137852191925049, 1.9830899238586426, 2.085430383682251, 1.982229471206665, 2.016312599182129, 2.032428026199341, 2.1134681701660156, 2.0651166439056396, 2.016740322113037, 2.016263008117676, 2.0094127655029297, 1.9884428977966309, 1.9834575653076172, 1.9992616176605225]


In [136]:
read_excel_df = pd.DataFrame({
    "Pandas Read Excel Time(s)": pandas_read_excel_times,
    "Polars Read Excel Time(s)": polars_read_excel_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(read_excel_df)

+-----------------------------+-----------------------------+
|  Pandas Read Excel Time(s)  |  Polars Read Excel Time(s)  |
|-----------------------------+-----------------------------|
|       31.789873361588       |       2.878387689590        |
|       31.807779312134       |       2.098186492920        |
|       31.271234750748       |       2.079269170761        |
|       31.972824573517       |       2.044230699539        |
|       31.688970327377       |       2.087542057037        |
|       31.856446504593       |       2.027262687683        |
|       31.748987436295       |       2.137852191925        |
|       32.349373340607       |       1.983089923859        |
|       31.514800071716       |       2.085430383682        |
|       31.427732229233       |       1.982229471207        |
|       33.065611600876       |       2.016312599182        |
|       31.669410705566       |       2.032428026199        |
|       32.156553745270       |       2.113468170166        |
|       

In [138]:
pandas_read_excel_mean = read_excel_df["Pandas Read Excel Time(s)"].mean()
pandas_read_excel_std = read_excel_df["Pandas Read Excel Time(s)"].std()
pandas_read_excel_variance = read_excel_df["Pandas Read Excel Time(s)"].var()

polars_read_excel_mean = read_excel_df["Polars Read Excel Time(s)"].mean()
polars_read_excel_std = read_excel_df["Polars Read Excel Time(s)"].std()
polars_read_excel_variance = read_excel_df["Polars Read Excel Time(s)"].var()

stat_summary_read_excel = pd.DataFrame({
    "Mean (s)": [pandas_read_excel_mean, polars_read_excel_mean],
    "Standard Deviation (s)": [pandas_read_excel_std,polars_read_excel_std],
    "Variance (s)": [pandas_read_excel_variance, polars_read_excel_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_read_excel)

+--------+-----------------+--------------------------+----------------+
|        |    Mean (s)     |  Standard Deviation (s)  |  Variance (s)  |
|--------+-----------------+--------------------------+----------------|
| Pandas | 31.800016939640 |      0.417577495971      | 0.174370965141 |
| Polars | 2.082219219208  |      0.193176988780      | 0.037317348994 |
+--------+-----------------+--------------------------+----------------+


### Performing T-Test on Read from Excel times

In [139]:
t_stat_read_excel , p_val_read_excel = stats.ttest_ind(polars_read_excel_times, pandas_read_excel_times, alternative='less')

print("T-Test for Read Data from Excel file Times")
print("t-statistic: ", t_stat_read_excel)
print("p-value: ", p_val_read_excel)

T-Test for Read Data from Excel file Times
t-statistic:  -288.85718724545825
p-value:  2.062651067160632e-65


### **Select columns from DataFrame**

In [149]:
pandas_column_selection_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df[['First Name', 'Last Name', 'Country', 'Email', 'Phone 1']]
    end_time = time.time()
    pandas_column_selection_times.append(end_time - start_time)

In [150]:
print(pandas_column_selection_times)

[0.012253761291503906, 0.01804804801940918, 0.01895594596862793, 0.012786626815795898, 0.008208274841308594, 0.012540578842163086, 0.008560895919799805, 0.010679483413696289, 0.008788347244262695, 0.006304502487182617, 0.016321420669555664, 0.010601997375488281, 0.004426002502441406, 0.012724637985229492, 0.005577802658081055, 0.014722824096679688, 0.008519887924194336, 0.012165307998657227, 0.008699655532836914, 0.002504110336303711]


In [153]:
polars_column_selection_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.select(['First Name', 'Last Name', 'Country', 'Email', 'Phone 1'])
    end_time = time.time()
    polars_column_selection_times.append(end_time - start_time)

In [154]:
print(polars_column_selection_times)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0037996768951416016, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004414081573486328, 0.0, 0.0]


In [155]:
column_selection_df = pd.DataFrame({
    "Pandas Column Selection Time(s)": pandas_column_selection_times,
    "Polars Column Selection Time(s)": polars_column_selection_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(column_selection_df)

+-----------------------------------+-----------------------------------+
|  Pandas Column Selection Time(s)  |  Polars Column Selection Time(s)  |
|-----------------------------------+-----------------------------------|
|          0.012253761292           |          0.000000000000           |
|          0.018048048019           |          0.000000000000           |
|          0.018955945969           |          0.000000000000           |
|          0.012786626816           |          0.000000000000           |
|          0.008208274841           |          0.000000000000           |
|          0.012540578842           |          0.000000000000           |
|          0.008560895920           |          0.003799676895           |
|          0.010679483414           |          0.000000000000           |
|          0.008788347244           |          0.000000000000           |
|          0.006304502487           |          0.000000000000           |
|          0.016321420670           | 

In [156]:
pandas_column_selection_mean = column_selection_df["Pandas Column Selection Time(s)"].mean()
pandas_column_selection_std = column_selection_df["Pandas Column Selection Time(s)"].std()
pandas_column_selection_variance = column_selection_df["Pandas Column Selection Time(s)"].var()

polars_column_selection_mean = column_selection_df["Polars Column Selection Time(s)"].mean()
polars_column_selection_std = column_selection_df["Polars Column Selection Time(s)"].std()
polars_column_selection_variance = column_selection_df["Polars Column Selection Time(s)"].var()

stat_summary_column_selection = pd.DataFrame({
    "Mean (s)": [pandas_column_selection_mean, polars_column_selection_mean],
    "Standard Deviation (s)": [pandas_column_selection_std,polars_column_selection_std],
    "Variance (s)": [pandas_column_selection_variance, polars_column_selection_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_column_selection)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.010669505596 |      0.004362626363      | 0.000019032509 |
| Polars | 0.000410687923 |      0.001267994049      | 0.000001607809 |
+--------+----------------+--------------------------+----------------+


In [157]:
t_stat_column_selection , p_val_column_selection = stats.ttest_ind(polars_column_selection_times, pandas_column_selection_times, alternative='less')

print("T-Test for Column Selection Times")
print("t-statistic: ", t_stat_column_selection)
print("p-value: ", p_val_column_selection)

T-Test for Column Selection Times
t-statistic:  -10.09843606775373
p-value:  1.2992885822302755e-12


### **Filter Rows 'First Name' starts with 'J'**

In [170]:
pandas_filter_columns_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df[pandas_df['First Name'].str.startswith('J')]
    end_time = time.time()
    pandas_filter_columns_times.append(end_time - start_time)

In [171]:
print(pandas_filter_columns_times)

[0.011123180389404297, 0.0343475341796875, 0.017560720443725586, 0.011625051498413086, 0.032480478286743164, 0.019592761993408203, 0.01621699333190918, 0.01761174201965332, 0.030920028686523438, 0.019261598587036133, 0.015596389770507812, 0.016259431838989258, 0.03314018249511719, 0.018849611282348633, 0.016485929489135742, 0.030353069305419922, 0.018327713012695312, 0.017594099044799805, 0.014818906784057617, 0.03322792053222656]


In [172]:
polars_filter_columns_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.filter(pl.col('First Name').str.starts_with('J'))
    end_time = time.time()
    polars_filter_columns_times.append(end_time - start_time)

In [173]:
print(polars_filter_columns_times)

[0.01566147804260254, 0.0019261837005615234, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015261650085449219, 0.0010197162628173828, 0.0016796588897705078, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01563262939453125, 0.00028824806213378906]


In [174]:
filter_columns_df = pd.DataFrame({
    "Pandas Filter Columns Time(s)": pandas_filter_columns_times,
    "Polars Filter Columns Time(s)": polars_filter_columns_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(filter_columns_df)

+---------------------------------+---------------------------------+
|  Pandas Filter Columns Time(s)  |  Polars Filter Columns Time(s)  |
|---------------------------------+---------------------------------|
|         0.011123180389          |         0.015661478043          |
|         0.034347534180          |         0.001926183701          |
|         0.017560720444          |         0.000000000000          |
|         0.011625051498          |         0.000000000000          |
|         0.032480478287          |         0.000000000000          |
|         0.019592761993          |         0.000000000000          |
|         0.016216993332          |         0.000000000000          |
|         0.017611742020          |         0.000000000000          |
|         0.030920028687          |         0.015261650085          |
|         0.019261598587          |         0.001019716263          |
|         0.015596389771          |         0.001679658890          |
|         0.01625943

In [175]:
pandas_filter_columns_mean = filter_columns_df["Pandas Filter Columns Time(s)"].mean()
pandas_filter_columns_std = filter_columns_df["Pandas Filter Columns Time(s)"].std()
pandas_filter_columns_variance = filter_columns_df["Pandas Filter Columns Time(s)"].var()

polars_filter_columns_mean = filter_columns_df["Polars Filter Columns Time(s)"].mean()
polars_filter_columns_std = filter_columns_df["Polars Filter Columns Time(s)"].std()
polars_filter_columns_variance = filter_columns_df["Polars Filter Columns Time(s)"].var()

stat_summary_filter_columns = pd.DataFrame({
    "Mean (s)": [pandas_filter_columns_mean, polars_filter_columns_mean],
    "Standard Deviation (s)": [pandas_filter_columns_std,polars_filter_columns_std],
    "Variance (s)": [pandas_filter_columns_variance, polars_filter_columns_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_filter_columns)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.021269667149 |      0.007818824948      | 0.000061134024 |
| Polars | 0.002573478222 |      0.005609097414      | 0.000031461974 |
+--------+----------------+--------------------------+----------------+


In [176]:
t_stat_filter_columns , p_val_filter_columns = stats.ttest_ind(polars_filter_columns_times, pandas_filter_columns_times, alternative='less')

print("T-Test for Filter Columns Times")
print("t-statistic: ", t_stat_filter_columns)
print("p-value: ", p_val_filter_columns)

T-Test for Filter Columns Times
t-statistic:  -8.689043648463777
p-value:  7.284171420597395e-11


### **Sort DataFrame in to Ascending Order by 'First Name'**

In [181]:
pandas_sort_times = []

for _ in range(20):
    start_time = time.time()
    pandas_df.sort_values(by='First Name' , ascending=True)
    end_time = time.time()
    pandas_sort_times.append(end_time - start_time)

In [182]:
print(pandas_sort_times)

[0.08464479446411133, 0.10045456886291504, 0.08374691009521484, 0.09940791130065918, 0.09525775909423828, 0.08938741683959961, 0.09503364562988281, 0.08944988250732422, 0.0798799991607666, 0.10150957107543945, 0.09653925895690918, 0.08466434478759766, 0.08543777465820312, 0.09902000427246094, 0.08512210845947266, 0.10107421875, 0.07989048957824707, 0.08203530311584473, 0.10224103927612305, 0.0811774730682373]


In [183]:
polars_sort_times = []

for _ in range(20):
    start_time = time.time()
    polars_df.sort(by='First Name',descending=False)
    end_time = time.time()
    polars_sort_times.append(end_time - start_time)

In [184]:
print(polars_sort_times)

[0.03941774368286133, 0.022505998611450195, 0.02213287353515625, 0.0211789608001709, 0.01267695426940918, 0.028377056121826172, 0.02193903923034668, 0.013821125030517578, 0.020429611206054688, 0.01949596405029297, 0.02742147445678711, 0.019939899444580078, 0.02275538444519043, 0.01933431625366211, 0.017663240432739258, 0.019445180892944336, 0.015291929244995117, 0.022209644317626953, 0.020047903060913086, 0.022115707397460938]


In [185]:
sort_df = pd.DataFrame({
    "Pandas Sort Time(s)": pandas_sort_times,
    "Polars Sort Time(s)": polars_sort_times
})

def print_df(df):
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False, floatfmt=".12f", numalign="center"))

print_df(sort_df)

+-----------------------+-----------------------+
|  Pandas Sort Time(s)  |  Polars Sort Time(s)  |
|-----------------------+-----------------------|
|    0.084644794464     |    0.039417743683     |
|    0.100454568863     |    0.022505998611     |
|    0.083746910095     |    0.022132873535     |
|    0.099407911301     |    0.021178960800     |
|    0.095257759094     |    0.012676954269     |
|    0.089387416840     |    0.028377056122     |
|    0.095033645630     |    0.021939039230     |
|    0.089449882507     |    0.013821125031     |
|    0.079879999161     |    0.020429611206     |
|    0.101509571075     |    0.019495964050     |
|    0.096539258957     |    0.027421474457     |
|    0.084664344788     |    0.019939899445     |
|    0.085437774658     |    0.022755384445     |
|    0.099020004272     |    0.019334316254     |
|    0.085122108459     |    0.017663240433     |
|    0.101074218750     |    0.019445180893     |
|    0.079890489578     |    0.015291929245     |


In [186]:
pandas_sort_mean = sort_df["Pandas Sort Time(s)"].mean()
pandas_sort_std = sort_df["Pandas Sort Time(s)"].std()
pandas_sort_variance = sort_df["Pandas Sort Time(s)"].var()

polars_sort_mean = sort_df["Polars Sort Time(s)"].mean()
polars_sort_std = sort_df["Polars Sort Time(s)"].std()
polars_sort_variance = sort_df["Polars Sort Time(s)"].var()

stat_summary_sort = pd.DataFrame({
    "Mean (s)": [pandas_sort_mean, polars_sort_mean],
    "Standard Deviation (s)": [pandas_sort_std,polars_sort_std],
    "Variance (s)": [pandas_sort_variance, polars_sort_variance]
}, index=["Pandas", "Polars"])

def print_df2(df):
    print(tabulate(df, headers='keys', tablefmt='psql', floatfmt=".12f", numalign="center"))

print_df2(stat_summary_sort)

+--------+----------------+--------------------------+----------------+
|        |    Mean (s)    |  Standard Deviation (s)  |  Variance (s)  |
|--------+----------------+--------------------------+----------------|
| Pandas | 0.090798723698 |      0.008125746584      | 0.000066027758 |
| Polars | 0.021410000324 |      0.005694398667      | 0.000032426176 |
+--------+----------------+--------------------------+----------------+


In [188]:
t_stat_sort , p_val_sort = stats.ttest_ind(polars_sort_times, pandas_sort_times, alternative='less')

print("T-Test for Sort DataFrame Times")
print("t-statistic: ", t_stat_sort)
print("p-value: ", p_val_sort)

T-Test for Sort DataFrame Times
t-statistic:  -31.274282775655703
p-value:  5.0149807367894536e-29
