## Libraries

In [3]:
import os
import time
import tracemalloc as tm

import polars as pl
import pandas as pd
import duckdb

import plotly.express as px
import matplotlib.pyplot as plt

## Raw Files [Path]

In [4]:
raw_main_data = r'Data_extracted\student_habits_performance.csv'

## Test Functions

In [30]:
def read_files_duckdb(path: str) -> dict:
    global con_duck
    con_duck = duckdb.connect(database = ":memory:")
    con_duck.execute(f'CREATE TABLE datatable AS FROM read_csv_auto("{raw_main_data}")')

    finalTime = None
    df = None
    startTime = time.time()

    df = con_duck.execute("SELECT * FROM datatable")

    endTime = time.time()

    finalTime = endTime - startTime
    
    df = df.fetch_df()

    assert len(df) > 0
    
    return {
            'reading_time': finalTime,
            'data': df
            }

def read_files_pandas(path: str) -> dict:

    finalTime = None
    df = None
    startTime = time.time()

    df = pd.read_csv(path)

    endTime = time.time()
    finalTime = endTime - startTime
    
    assert len(df) > 0

    return {
            'reading_time': finalTime,
            'data': df
            }

def read_files_polars(path: str) -> dict:

    finalTime = None
    df = None
    startTime = time.time()

    df = pl.read_csv(path, ignore_errors=True)

    assert len(df) > 0

    endTime = time.time()
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data': df
            }

def data_populating_duckdb(data, register_multipliyer: int) -> dict:

    finalTime = None
    df_concated = None

    con_duck.register("my_data", data)

    startTime = time.time()

    tm.start()

    union_query = f"""
        REPEAT {register_multipliyer} AS i DO
            COPY FROM DATABASE my_data TO datatable;
        END;
    """

    df_concated = con_duck.execute(union_query)

    current_memory, peak_memory = tm.get_traced_memory()
    tm.stop()

    endTime = time.time()
    
    finalTime = endTime - startTime

    df_concated = df_concated.fetch_df()

    assert len(df_concated) == register_multipliyer*len(data)
    
    return {
            'reading_time': finalTime,
            'data' : df_concated,
            'memory_usage': (current_memory,peak_memory)
            }

def data_populating_pandas(data, register_multipliyer: int) -> dict:

    finalTime = None
    df_concated = None
    startTime = time.time()

    tm.start()

    df_concated = pd.concat([data]* register_multipliyer, ignore_index=True)

    current_memory, peak_memory = tm.get_traced_memory()
    tm.stop()

    endTime = time.time()
    
    finalTime = endTime - startTime

    assert len(df_concated) == register_multipliyer*len(data)
    
    return {
            'reading_time': finalTime,
            'data' : df_concated,
            'memory_usage': (current_memory,peak_memory)
            }

def data_populating_polars(data, register_multipliyer: int) -> dict:

    finalTime = None
    df_concated = None
    startTime = time.time()

    tm.start()

    df_concated = pl.concat([data] * register_multipliyer)

    current_memory, peak_memory = tm.get_traced_memory()
    tm.stop()

    endTime = time.time()
    
    finalTime = endTime - startTime

    assert len(df_concated) == register_multipliyer*len(data)
    
    return {
            'reading_time': finalTime,
            'data' : df_concated,
            'memory_usage': (current_memory,peak_memory)
            }

def data_grouping_duckdb(data, grouping_columns: list, agg_columns: list) -> dict:
    finalTime = None
    agg_columns = [f'SUM({col})' for col in agg_columns]
    startTime = time.time()
      
    df_grouped = con_duck.execute(f"SELECT {','.join(grouping_columns)},{','.join(agg_columns)} FROM datatable GROUP BY {','.join(grouping_columns)}")

    endTime = time.time()

    finalTime = endTime - startTime

    df_grouped = df_grouped.fetch_df()

    return {
            'reading_time': finalTime,
            'data' : df_grouped
            }

def data_grouping_pandas(data, grouping_columns: list, agg_columns: list) -> dict:
    finalTime = None
    df_concated = None
    startTime = time.time()

    df_grouped = data.groupby(by=grouping_columns)[agg_columns].sum()

    endTime = time.time()
    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_grouped
            }

def data_grouping_polars(data, grouping_columns: list, agg_columns: list) -> dict:

    finalTime = None
    df_concated = None
    startTime = time.time()

        
    df_grouped = data.group_by(grouping_columns).agg(pl.col(agg_columns).sum())
    
    endTime = time.time()
    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_grouped
            }

def data_filtering_duckdb(data, filter_column: list) -> dict:
    finalTime = None
    df_filtered = None
    startTime = time.time()

    df_filtered = con_duck.execute(f"SELECT * FROM datatable WHERE {filter_column} > (SELECT AVG({filter_column}) FROM datatable)")

    endTime = time.time()

    finalTime = endTime - startTime

    df_filtered.fetch_df()

    return {
            'reading_time': finalTime,
            'data' : df_filtered
            }

def data_filtering_pandas(data, filter_column: list) -> dict:

    finalTime = None
    df_filtered = None
    startTime = time.time()

    df_filtered = data[data[filter_column] > data[filter_column].mean()]

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_filtered
            }

def data_filtering_polars(data, filter_column: list) -> dict:

    finalTime = None
    df_filtered = None
    startTime = time.time()

    df_filtered = data.filter(data[filter_column] > data[filter_column].mean())

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_filtered
            }

def column_sorting_duckdb(data, sort_columns: list):
    finalTime = None
    df_sorted = None
    startTime = time.time()
      
    df_sorted = con_duck.execute(f"SELECT * FROM datatable ORDER BY {','.join(sort_columns)}")

    endTime = time.time()

    
    finalTime = endTime - startTime

    df_sorted.fetch_df()
    
    return {
            'reading_time': finalTime,
            'data' : df_sorted
            }

def column_sorting_pandas(data, sort_columns: list):
    finalTime = None
    df_sorted = None
    startTime = time.time()

    df_sorted = data.sort_values(sort_columns)

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_sorted
            }
    
def column_sorting_polars(data, sort_columns: list):
    finalTime = None
    df_sorted = None
    startTime = time.time()

    df_sorted = data.sort(sort_columns)

    endTime = time.time()

    
    finalTime = endTime - startTime

    return {
            'reading_time': finalTime,
            'data' : df_sorted
            }

def format_to_parquet_duckdb(data, saving_path: str):
    finalTime = None
    startTime = time.time()

    con_duck.execute(
        f"""
        COPY (
            SELECT * FROM datatable
        )
        TO '{saving_path}_duckdb.parquet' (FORMAT 'parquet');
    """)

    endTime = time.time()

    finalTime = endTime - startTime

    return {
            'reading_time': finalTime
            }

def format_to_parquet_pandas(data, saving_path: str):
    finalTime = None
    startTime = time.time()


    data.to_parquet(f'{saving_path}_polars.parquet')

    endTime = time.time()

    finalTime = endTime - startTime

    return {
            'reading_time': finalTime
            }

def format_to_parquet_polars(data, saving_path: str):
    finalTime = None
    startTime = time.time()

    data.write_parquet(f'{saving_path}_polars.parquet')

    endTime = time.time()

    finalTime = endTime - startTime

    return {
            'reading_time': finalTime
            }

## Reading Speed

In [25]:
reading_results = {
    'duckdb':read_files_duckdb(raw_main_data),
    'pandas':read_files_pandas(raw_main_data),
    'polar':read_files_polars(raw_main_data)
}

graph_results = {}
for reader in reading_results.keys():
    graph_results[reader] = reading_results[reader]["reading_time"]
    print(f'{reader}: {reading_results[reader]["reading_time"]}') 

duckdb: 0.0010006427764892578
pandas: 0.002245187759399414
polar: 0.0010066032409667969


In [26]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Reading Speed')


## Register Increment Speed

In [31]:
increment_results = {
    'duckdb':data_populating_duckdb(data = reading_results['duckdb']['data'], register_multipliyer= 6000),
    'pandas':data_populating_pandas(data = reading_results['pandas']['data'], register_multipliyer= 6000),
    'polar':data_populating_polars(data = reading_results['polars']['data'], register_multipliyer= 6000)
}
list_memory_usage = None

graph_results = {}
for reader in reading_results.keys():
    graph_results[reader] = increment_results[reader]["reading_time"]
    print(f'{reader}: {increment_results[reader]["reading_time"]}') 
    list_memory_usage[reader] = {'current_memory': increment_results[reader]["memory_usage"][0], 'peak_memory': increment_results[reader]["memory_usage"][1]}

ParserException: Parser Error: syntax error at or near "REPEAT"

In [27]:
fig = px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Data Population Speed')

fig.update_yaxes(type='log')

fig.show()

## Data Grouping Speed

In [None]:
grouping_columns = ['age','diet_quality']
agg_columns = ['study_hours_per_day','social_media_hours']

grouping_results = {}

for  in s:
    grouping_results[] = data_grouping(data = increment_results[]["data"],  = , grouping_columns = grouping_columns, agg_columns = agg_columns)

graph_results = {}
for  in s:
    graph_results[] = grouping_results[]["reading_time"]
    print(f'{}: {grouping_results[]["reading_time"]}') 
    print(f'{}: {len(grouping_results[]["data"])}') 

polars: 0.13136792182922363
polars: 24
pandas: 0.03264284133911133
pandas: 24
duckDB: 0.0060460567474365234
duckDB: 24


In [29]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Data Grouping Speed')

## Column Sorting Speed

In [None]:
sort_columns = ['age','study_hours_per_day']

column_sorting_results = {}

for  in s:
    column_sorting_results[] = column_sorting(data = increment_results[]["data"],  = , sort_columns = sort_columns)

graph_results = {}
for  in s:
    graph_results[] = column_sorting_results[]["reading_time"]
    print(f'{}: {column_sorting_results[]["reading_time"]}') 
    print(f'{}: {len(column_sorting_results[]["data"])}') 

polars: 0.061162710189819336
polars: 600000
pandas: 0.07922077178955078
pandas: 600000
duckDB: 0.0015959739685058594
duckDB: 1000


In [31]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Sorting Speed')

## Data Filtering Speed

For the filtering column selection it needs to full the following requirements:
- Be a *numeric* column
- Uses a single column to make a single condition filter

And the internal parameters for it to filter are:
- It will only check for values greater than the mean of the column

In [None]:
column_to_filter = 'age'

filter_columns_results = {}

for  in s:
    filter_columns_results[] = data_filtering(data = increment_results[]["data"],  = , filter_column = column_to_filter)

graph_results = {}
for  in s:
    graph_results[] = filter_columns_results[]["reading_time"]
    print(f'{}: {filter_columns_results[]["reading_time"]}') 
    print(f'{}: {len(filter_columns_results[]["data"])}') 

polars: 0.01764988899230957
polars: 294600
pandas: 0.0246579647064209
pandas: 294600
duckDB: 0.0021669864654541016
duckDB: 491


In [33]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Filtering Speed')

## Memory Usage

In [34]:
data_for_plot = []
for lib, mems in list_memory_usage.items():
    for mem_type, value in mems.items():
        data_for_plot.append({
            'library': lib,
            'memory_type': 'Current' if mem_type == 'current_memory' else 'Peak',
            'memory': value
        })

In [35]:
fig = px.bar(
    data_for_plot,
    x='library',
    y='memory',
    color='memory_type',
    barmode='group',
    labels={'memory': 'Memory Usage (bytes)', 'library': 'Library', 'memory_type': 'Memory Type'},
    title='Memory Usage by Library'
)
fig.update_yaxes(type='log')

fig.show()

## Data Offload Speed

In [None]:
path = './Data_Offload/data'
data_offload_results = {}

for  in s:
    data_offload_results[] = format_to_parquet(data = increment_results[]["data"],  = , saving_path = path)

graph_results = {}
for  in s:
    graph_results[] = data_offload_results[]["reading_time"]
    print(f'{}: {data_offload_results[]["reading_time"]}') 

polars: 0.10742354393005371
pandas: 0.42351818084716797
duckDB: 0.007433652877807617


In [37]:
px.bar(x=list(graph_results.keys()),y=list(graph_results.values()), labels={'x':'Library', 'y':'Time (s)'}, title='Data Offload Speed')