## Libraries

In [15]:
import os
import time
import tracemalloc as tm

import polars as pl
import pandas as pd
import duckdb

import plotly.express as px
import matplotlib.pyplot as plt

## Raw Files [Path]

In [16]:
raw_main_data = r'Data_extracted\student_habits_performance.csv'

## Test Functions

In [None]:
def read_files(path: str, reader: str) -> dict:
    finalTime = None
    df = None
    startTime = time.time()
    reader = reader.lower()

    if reader  == 'duckdb':

        global con_duck
        con_duck = duckdb.connect(database = ":memory:")
        con_duck.execute(f'CREATE TABLE datatable AS FROM read_csv_auto("{raw_main_data}")')
        df = con_duck.execute("SELECT * FROM datatable").fetchdf()

    elif reader  == 'pandas':
        df = pd.read_csv(path)
        
    elif reader  == 'polars':
        df = pl.read_csv(path, ignore_errors=True)

    endTime = time.time()
    finalTime = endTime - startTime
    assert len(df) > 0
    
    return {
            'reader':reader,
            'reading_time': finalTime,
            'data': df
            }

def data_populating(df, reader: str, register_multipliyer: int) -> dict:
    finalTime = None
    df_concated = None
    startTime = time.time()
    reader = reader.lower()

    if reader == 'duckdb':
        con_duck.register("my_data", df)

        union_query = " UNION ALL ".join(["SELECT * FROM my_data"] * register_multipliyer)

        df_concated = con_duck.execute(union_query).fetchdf()

    elif reader == 'pandas':
        df_concated = pd.concat([df]* register_multipliyer, ignore_index=True)

    elif reader == 'polars':

        df_concated = pl.concat([df] * register_multipliyer)

    endTime = time.time()
    
    finalTime = endTime - startTime

    assert len(df_concated) == register_multipliyer*len(df)
    
    return {
            'reader':reader,
            'reading_time': finalTime,
            'data' : df_concated
            }

def data_grouping(data, reader: str, grouping_columns: list, agg_columns: list) -> dict:
    finalTime = None
    df_concated = None
    agg_columns = [f'SUM({col})' for col in agg_columns] if reader.lower() == 'duckdb' else agg_columns
    startTime = time.time()
    reader = reader.lower()

    if reader == 'duckdb':            
        df_grouped = con_duck.execute(f"SELECT {','.join(grouping_columns)},{','.join(agg_columns)} FROM datatable GROUP BY {','.join(grouping_columns)}").fetchdf()

    elif reader == 'pandas':
        df_grouped = data.groupby(by=grouping_columns)[agg_columns].sum()

    elif reader == 'polars':

        agg_exprs = [pl.col(col).sum() for col in agg_columns]
        
        df_grouped = data.group_by(grouping_columns).agg(agg_exprs)

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reader':reader,
            'reading_time': finalTime,
            'data' : df_grouped
            }

def data_filtering(data, reader: str, filter_column: list) -> dict:
    finalTime = None
    df_filtered = None
    startTime = time.time()
    reader = reader.lower()

    if reader == 'duckdb':  

        df_filtered = con_duck.execute(f"SELECT * FROM datatable WHERE {filter_column} > (SELECT AVG({filter_column}) FROM datatable)").fetchdf()

    elif reader == 'pandas':

        df_filtered = data[data[filter_column] > data[filter_column].mean()]

    elif reader == 'polars':
        
        df_filtered = data.filter(data[filter_column] > data[filter_column].mean())

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reader':reader,
            'reading_time': finalTime,
            'data' : df_filtered
            }

def column_sorting(data, reader: str, sort_columns: list):
    finalTime = None
    df_sorted = None
    startTime = time.time()
    reader = reader.lower()

    if reader == 'duckdb':            
        df_sorted = con_duck.execute(f"SELECT * FROM datatable ORDER BY {','.join(sort_columns)}").fetchdf()

    elif reader == 'pandas':
        df_sorted = data.sort_values(sort_columns)

    elif reader == 'polars':
        df_sorted = data.sort(sort_columns)
        

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reader':reader,
            'reading_time': finalTime,
            'data' : df_sorted
            }

def memory_usage():

    return

def format_to_parquet(df, reader: str, saving_path: str):
    finalTime = None
    startTime = time.time()
    reader = reader.lower()

    if 'duckdb':
        df.to_parquet()
    elif 'pandas':
        df.to_arrow(f'{saving_path}_polars.parquet')
    elif 'polars':
        df.write_parquet(f'{saving_path}_polars.parquet')

    endTime = time.time()

    finalTime = endTime - startTime

    return {
            'reader':reader,
            'reading_time': finalTime,
            }

## Reading Speed

In [18]:
readers = ['polars','pandas','duckDB']
reading_results = {}

for reader in readers:
    reading_results[reader] = read_files(path = raw_main_data, reader = reader)

graph_results = {}
for reader in readers:
    graph_results[reader] = reading_results[reader]["reading_time"]
    print(f'{reader}: {reading_results[reader]["reading_time"]}') 

polars: 0.002010345458984375
pandas: 0.004754543304443359
duckDB: 0.034986257553100586


In [19]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()))


## Register Increment Speed

In [20]:
increment_results = {}
list_memory_usage = {}

for reader in readers:
    tm.start()
    increment_results[reader] = data_populating(df = reading_results[reader]["data"], reader = reader, register_multipliyer = 600)
    current_memory, peak_memory = tm.get_traced_memory()
    tm.stop()
    list_memory_usage[reader] = {'current_memory': current_memory, 'peak_memory': peak_memory}


graph_results = {}
for reader in readers:
    graph_results[reader] = increment_results[reader]["reading_time"]
    print(f'{reader}: {increment_results[reader]["reading_time"]}') 
    print(f'{reader}: {len(increment_results[reader]["data"])}') 


polars: 0.05161285400390625
polars: 600000
pandas: 0.2831759452819824
pandas: 600000
duckDB: 10.751887798309326
duckDB: 600000


In [21]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()))

## Data Grouping Speed

In [22]:
grouping_columns = ['age','diet_quality']
agg_columns = ['study_hours_per_day','social_media_hours']

grouping_results = {}

for reader in readers:
    grouping_results[reader] = data_grouping(data = increment_results[reader]["data"], reader = reader, grouping_columns = grouping_columns, agg_columns = agg_columns)

graph_results = {}
for reader in readers:
    graph_results[reader] = grouping_results[reader]["reading_time"]
    print(f'{reader}: {grouping_results[reader]["reading_time"]}') 
    print(f'{reader}: {len(grouping_results[reader]["data"])}') 

polars: 0.16416430473327637
polars: 24
pandas: 0.048146963119506836
pandas: 24
duckDB: 0.012113571166992188
duckDB: 24


In [23]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()))

## Column Sorting Speed

In [24]:
sort_columns = ['age','study_hours_per_day']

column_sorting_results = {}

for reader in readers:
    column_sorting_results[reader] = column_sorting(data = increment_results[reader]["data"], reader = reader, sort_columns = sort_columns)

graph_results = {}
for reader in readers:
    graph_results[reader] = column_sorting_results[reader]["reading_time"]
    print(f'{reader}: {column_sorting_results[reader]["reading_time"]}') 
    print(f'{reader}: {len(column_sorting_results[reader]["data"])}') 

polars: 0.07132577896118164
polars: 600000
pandas: 0.10311269760131836
pandas: 600000
duckDB: 0.0058934688568115234
duckDB: 1000


In [25]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()))

## Data Filtering Speed

For the filtering column selection it needs to full the following requirements:
- Be a *numeric* column
- Uses a single column to make a single condition filter

And the internal parameters for it to filter are:
- It will only check for values greater than the mean of the column

In [27]:
column_to_filter = 'age'

filter_columns_results = {}

for reader in readers:
    filter_columns_results[reader] = data_filtering(data = increment_results[reader]["data"], reader = reader, filter_column = column_to_filter)

graph_results = {}
for reader in readers:
    graph_results[reader] = filter_columns_results[reader]["reading_time"]
    print(f'{reader}: {filter_columns_results[reader]["reading_time"]}') 
    print(f'{reader}: {len(filter_columns_results[reader]["data"])}') 

polars: 0.040396928787231445
polars: 294600
pandas: 0.03633403778076172
pandas: 294600
duckDB: 0.005646944046020508


TypeError: object of type 'NoneType' has no len()

## Memory Usage

In [None]:
data_for_plot = []
for lib, mems in list_memory_usage.items():
    for mem_type, value in mems.items():
        data_for_plot.append({
            'library': lib,
            'memory_type': 'Current' if mem_type == 'current_memory' else 'Peak',
            'memory': value
        })

In [None]:
fig = px.bar(
    data_for_plot,
    x='library',
    y='memory',
    color='memory_type',
    barmode='group',
    labels={'memory': 'Memory Usage (bytes)', 'library': 'Library', 'memory_type': 'Memory Type'},
    title='Memory Usage by Library'
)
fig.update_yaxes(type='log')

fig.show()

## Data Offload Speed

In [None]:
path = 'Data_Offload'
data_offload_results = {}

for reader in readers:
    data_offload_results[reader] = format_to_parquet(data = increment_results[reader]["data"], reader = reader, saving_path = path)

graph_results = {}
for reader in readers:
    graph_results[reader] = data_offload_results[reader]["reading_time"]
    print(f'{reader}: {data_offload_results[reader]["reading_time"]}') 
    print(f'{reader}: {len(data_offload_results[reader]["data"])}') 