In [19]:
# %pip installs into the environment of the running kernel; `!pip3` would use
# whichever pip3 happens to be first on the shell PATH.
%pip install pyarrow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
# %pip installs into the environment of the running kernel; `!pip3` would use
# whichever pip3 happens to be first on the shell PATH.
%pip install fastavro


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
# %pip upgrades the packages in the environment of the running kernel; `!pip3`
# would use whichever pip3 happens to be first on the shell PATH.
%pip install --upgrade pyarrow fastavro


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [22]:
import pyarrow as pa
import pyarrow.csv as csv
import pyarrow.parquet as pq
import pyarrow.orc as orc
from fastavro.schema import parse_schema
from fastavro import writer
import pyarrow.compute as pc
import time
import os

In [23]:
def extract_pyarrow(file_path):
    """Read a CSV file with ``csv.read_csv`` and time the read.

    Args:
        file_path: Location of the CSV file; passed unchanged to
            ``csv.read_csv``.

    Returns:
        tuple: ``(table, duration)`` where ``table`` is the object returned
        by ``csv.read_csv`` and ``duration`` is the elapsed time of that
        call in seconds (float).
    """
    # perf_counter is monotonic and high-resolution, so the measurement
    # cannot be distorted (or turn negative) by a system clock adjustment.
    started = time.perf_counter()
    table = csv.read_csv(file_path)
    duration = time.perf_counter() - started
    return table, duration

In [24]:
def transform_pyarrow_rename_columns(table):
    """Append ``_new`` to every column name of ``table`` and time the rename.

    Args:
        table: Object exposing ``column_names`` and ``rename_columns``
            (a ``pyarrow.Table`` in this notebook).

    Returns:
        tuple: ``(renamed_table, duration)`` where ``renamed_table`` is the
        result of ``table.rename_columns`` and ``duration`` is the elapsed
        time in seconds (float).
    """
    # Monotonic timer: unaffected by system clock adjustments.
    started = time.perf_counter()
    # Only the names change, so derive them straight from the existing
    # column names instead of building a throwaway schema just to read the
    # names back out of it.
    renamed_names = [f"{name}_new" for name in table.column_names]
    renamed_table = table.rename_columns(renamed_names)
    duration = time.perf_counter() - started
    return renamed_table, duration

In [25]:
def get_file_size(path):
    """Return the size of the file at ``path`` in MiB, rounded to 2 decimals."""
    bytes_per_mib = 1024 * 1024
    size_in_bytes = os.stat(path).st_size
    return round(size_in_bytes / bytes_per_mib, 2)

In [26]:
def _timed_write(write_fn, table, output_path):
    """Call ``write_fn(table, output_path)`` and measure its cost.

    Args:
        write_fn: Callable taking ``(table, output_path)`` that writes the
            table to disk.
        table: The table to write; passed unchanged to ``write_fn``.
        output_path: Destination file; also passed to ``get_file_size``.

    Returns:
        tuple: ``(duration, file_size)`` where ``duration`` is the elapsed
        time of the write in seconds (float) and ``file_size`` is the value
        returned by ``get_file_size(output_path)``.
    """
    # Monotonic timer: unaffected by system clock adjustments.
    started = time.perf_counter()
    write_fn(table, output_path)
    duration = time.perf_counter() - started

    # Measured after the timer stops so the size lookup is not counted as
    # part of the write.
    file_size = get_file_size(output_path)

    return duration, file_size


def load_pyarrow_csv(table, output_path):
    """Write ``table`` to ``output_path`` with ``csv.write_csv``.

    Returns:
        tuple: ``(duration, file_size)`` — see ``_timed_write``.
    """
    return _timed_write(csv.write_csv, table, output_path)


def load_pyarrow_parquet(table, output_path):
    """Write ``table`` to ``output_path`` with ``pq.write_table``.

    Returns:
        tuple: ``(duration, file_size)`` — see ``_timed_write``.
    """
    return _timed_write(pq.write_table, table, output_path)


def load_pyarrow_orc(table, output_path):
    """Write ``table`` to ``output_path`` with ``orc.write_table``.

    Returns:
        tuple: ``(duration, file_size)`` — see ``_timed_write``.
    """
    return _timed_write(orc.write_table, table, output_path)

In [27]:
# Input CSV files to benchmark; each entry is passed as-is to extract_pyarrow.
datasets = ['transactions_data.csv', 'titanic.csv', 'reviews.csv', 'locations.csv']

# Every file produced by this cell is written below this directory. Create it
# up front so the writes below do not depend on the folder already existing.
output_dir = "pyarrow_analysis"
os.makedirs(output_dir, exist_ok=True)

# Rows of the two result tables: one descriptive row and one metrics row per
# dataset, linked through the 1-based id assigned in the loop.
dim_datasets = []
fact_metrics = []

for primary_key, dataset in enumerate(datasets, start=1):
    # Dataset name without directory or extension. splitext removes only the
    # final extension, so entries such as "./data/x.csv" or "a.b.csv" keep a
    # usable name (splitting on the first "." would give "" and "a").
    path = os.path.splitext(os.path.basename(dataset))[0]

    # Extract: read the source CSV.
    table_raw, extract_time = extract_pyarrow(dataset)

    # Transform: rename every column.
    table_transformed, transform_rename_columns_time = transform_pyarrow_rename_columns(table_raw)

    # Load: write the transformed table once per output format.
    load_time_csv, file_size_csv = load_pyarrow_csv(table_transformed, f"{output_dir}/{path}.csv")
    load_time_parquet, file_size_parquet = load_pyarrow_parquet(table_transformed, f"{output_dir}/{path}.parquet")
    load_time_orc, file_size_orc = load_pyarrow_orc(table_transformed, f"{output_dir}/{path}.orc")

    dim_datasets.append({
        "id": primary_key,
        "dataset_name": path,
        "number_of_rows": table_raw.num_rows
    })

    # Durations are rounded to 2 decimals here; file sizes are stored exactly
    # as returned by the load_* helpers.
    fact_metrics.append({
        "dataset_id": primary_key,
        "extract_time": round(extract_time, 2),
        "transform_rename_columns_time": round(transform_rename_columns_time, 2),
        "load_time_csv": round(load_time_csv, 2),
        "file_size_csv_mb": file_size_csv,
        "load_time_parquet": round(load_time_parquet, 2),
        "file_size_parquet_mb": file_size_parquet,
        "load_time_orc": round(load_time_orc, 2),
        "file_size_orc_mb": file_size_orc
    })


# Persist the collected results as CSV next to the converted datasets.
dim_datasets_table = pa.Table.from_pylist(dim_datasets)
fact_metrics_table = pa.Table.from_pylist(fact_metrics)

csv.write_csv(dim_datasets_table, f"{output_dir}/dim_datasets.csv")
csv.write_csv(fact_metrics_table, f"{output_dir}/fact_metrics_pyarrow.csv")

print("Process finished successfully!")

  from pandas.core import (


Process finished successfully!
