# Demo Apache Arrow

In [1]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.csv as csv  
import pyarrow.compute as pc 
from io import StringIO 
import time
import sys
import random

## Cargar desde CSV

In [2]:
# Data locations, expressed relative to the notebook's working directory.
DATA_DIR = "../data"
# CSV input read by both the pandas loader and the Arrow loader below.
CSV_FILE = os.path.join(DATA_DIR, "learning_traces.csv")
# Sibling path with an .arrow extension; not referenced by the cells visible
# in this part of the notebook.
ARROW_FILE = os.path.join(DATA_DIR, "learning_traces.arrow") 

### En Pandas

In [3]:
# Start the wall-clock timer for the pandas CSV load; it is stopped in a later
# cell. NOTE(review): because start and stop live in different cells, the
# measurement also includes any delay between executing those cells.
start_time = time.time()

In [4]:
# Load the whole CSV into a pandas DataFrame, asking pandas to parse the user
# and language columns as categoricals.
df_pandas = pd.read_csv(
    CSV_FILE,
    dtype=dict.fromkeys(
        ("user_id", "learning_language", "ui_language"),
        "category",
    ),
)

In [5]:
# Stop the timer started before the pandas load and report the row count and
# the elapsed wall-clock seconds.
pandas_load_time = time.time() - start_time
print(f"Datos cargados en Pandas ({len(df_pandas)} filas). Tiempo: {pandas_load_time:.4f}s")

Datos cargados en Pandas (12854226 filas). Tiempo: 6.1576s


### En Arrow

In [6]:
# Restart the wall-clock timer, this time for the Arrow CSV load. This reuses
# (and overwrites) the `start_time` name used for the pandas measurement.
start_time = time.time()

In [7]:
# Read the same CSV with pyarrow.csv using its default options (no explicit
# column types are passed here, unlike the pandas load above).
tabla_arrow = csv.read_csv(CSV_FILE)

In [8]:
# Stop the timer started before the Arrow load and report the row count and
# the elapsed wall-clock seconds.
arrow_load_time = time.time() - start_time
print(f"Datos cargados en Arrow ({tabla_arrow.num_rows} filas). Tiempo: {arrow_load_time:.4f}s")


Datos cargados en Arrow (12854226 filas). Tiempo: 0.8691s


#### Esquema de los datos

In [9]:
# Show the column names and the types the Arrow table ended up with.
print(tabla_arrow.schema)

p_recall: double
timestamp: int64
delta: int64
user_id: string
learning_language: string
ui_language: string
lexeme_id: string
lexeme_string: string
history_seen: int64
history_correct: int64
session_seen: int64
session_correct: int64


#### Primeros 5 registros

In [10]:
# Preview the first five rows. Only the 5-row slice is converted to pandas, so
# the conversion stays cheap. The DataFrame is left as the cell's last
# expression (instead of being passed to print) so Jupyter renders it as a
# table rather than as wrapped plain text.
tabla_arrow.slice(0, 5).to_pandas()

   p_recall   timestamp     delta user_id learning_language ui_language  \
0       1.0  1362076081  27649635    u:FO                de          en   
1       0.5  1362076081  27649635    u:FO                de          en   
2       1.0  1362076081  27649635    u:FO                de          en   
3       0.5  1362076081  27649635    u:FO                de          en   
4       1.0  1362076081  27649635    u:FO                de          en   

                          lexeme_id                     lexeme_string  \
0  76390c1350a8dac31186187e2fe1e178  lernt/lernen<vblex><pri><p3><sg>   
1  7dfd7086f3671685e2cf1c1da72796d7     die/die<det><def><f><sg><nom>   
2  35a54c25a2cda8127343f6a82e6f6b7d          mann/mann<n><m><sg><nom>   
3  0cf63ffe3dda158bc3dbd55682b355ae          frau/frau<n><f><sg><nom>   
4  84920990d78044db53c1b012f5bf9ab5    das/das<det><def><nt><sg><nom>   

   history_seen  history_correct  session_seen  session_correct  
0             6                4            

## Consumo de Memoria

In [11]:
def get_mem_size(obj):
    """Return the in-memory size of ``obj`` expressed in units of 1024*1024 bytes.

    Args:
        obj: A ``pandas.DataFrame`` (measured with ``memory_usage(deep=True)``)
            or a ``pyarrow.Table`` (measured with ``nbytes``).

    Returns:
        The size divided by 1024*1024, or ``0`` when ``obj`` is neither of
        the two supported types.
    """
    bytes_per_unit = 1024 * 1024

    if isinstance(obj, pd.DataFrame):
        # deep=True is passed so pandas inspects object contents as well.
        total_bytes = obj.memory_usage(deep=True).sum()
    elif isinstance(obj, pa.Table):
        total_bytes = obj.nbytes
    else:
        # Unsupported type: report zero rather than raising.
        return 0

    return total_bytes / bytes_per_unit

In [12]:
# Measure both in-memory representations of the same dataset with the helper
# defined above (values are in units of 1024*1024 bytes).
pandas_size = get_mem_size(df_pandas)
arrow_size = get_mem_size(tabla_arrow)

In [13]:
# Report both sizes and how many times larger the pandas copy is.
# NOTE(review): get_mem_size divides by 1024*1024, so the figures are MiB
# although the labels say MB.
# NOTE(review): the "(Filas)" label suggests pandas is row-oriented; pandas
# DataFrames are stored per column as well, so confirm the intended wording.
print(f"Tamaño Pandas (Filas): {pandas_size:.2f} MB")
print(f"Tamaño Arrow (Columnas): {arrow_size:.2f} MB")
print(f"Ratio de Compresión Lógica (Pandas/Arrow): {pandas_size / arrow_size:.2f}x")

Tamaño Pandas (Filas): 2872.69 MB
Tamaño Arrow (Columnas): 1738.39 MB
Ratio de Compresión Lógica (Pandas/Arrow): 1.65x


## Consultas

In [14]:
# Number of times each query is repeated; the benchmark cells divide the total
# elapsed time by this value to get a per-execution average.
NUM_ITERACIONES = 50

### Arrow

In [15]:
# Benchmark the Arrow query: mean of p_recall grouped by learning_language,
# repeated NUM_ITERACIONES times and averaged per execution.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring short intervals; time.time() can
# jump if the system clock is adjusted.
start_arrow = time.perf_counter()
for _ in range(NUM_ITERACIONES):
    _ = tabla_arrow.group_by('learning_language').aggregate([('p_recall', 'mean')])
arrow_query_time = (time.perf_counter() - start_arrow) / NUM_ITERACIONES

In [16]:
# Report the average seconds per execution of the Arrow group-by query.
print(f"  Tiempo promedio Arrow (Columnas): {arrow_query_time:.6f} segundos/ejecución")

  Tiempo promedio Arrow (Columnas): 0.021147 segundos/ejecución


### Pandas

In [17]:
# Benchmark the equivalent pandas query: mean of p_recall grouped by
# learning_language, repeated NUM_ITERACIONES times and averaged per execution.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring short intervals.
# learning_language is loaded as a categorical, and grouping by a categorical
# without `observed` makes recent pandas versions emit a FutureWarning (the
# default is changing). Passing observed=True states the intent explicitly and
# keeps the benchmark output free of warning noise.
start_pandas = time.perf_counter()
for _ in range(NUM_ITERACIONES):
    _ = df_pandas.groupby('learning_language', observed=True)['p_recall'].mean()
pandas_query_time = (time.perf_counter() - start_pandas) / NUM_ITERACIONES

  _ = df_pandas.groupby('learning_language')['p_recall'].mean()


In [18]:
# Report the average seconds per execution of the pandas group-by query.
# NOTE(review): the "(Filas)" label implies row-oriented storage; confirm that
# wording is intended for pandas.
print(f"  Tiempo promedio Pandas (Filas): {pandas_query_time:.6f} segundos/ejecución")

  Tiempo promedio Pandas (Filas): 0.081699 segundos/ejecución


In [19]:
# Report how many times slower the pandas query was than the Arrow query
# (average pandas time divided by average Arrow time).
print(f"  Ratio de Rendimiento (Pandas/Arrow): {pandas_query_time / arrow_query_time:.2f}x")

  Ratio de Rendimiento (Pandas/Arrow): 3.86x
