In [41]:
!pip3 install polars


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [42]:
!pip3 install fastavro


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [43]:
import polars as pl
from fastavro.schema import parse_schema
from fastavro import writer
import time
import os

In [44]:
def extract_polars(file_path):
    """Read a CSV file into a polars DataFrame and time the read.

    Args:
        file_path: Location of the CSV file; the first row is treated as the header.

    Returns:
        A ``(df, duration)`` tuple: the loaded DataFrame and the elapsed
        read time in seconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measurement is
    # not distorted by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    df = pl.read_csv(file_path, has_header=True)
    duration = time.perf_counter() - start_time
    return df, duration

In [45]:
def transform_polars_rename_columns(df):
    """Time how long it takes to rename every column of ``df``.

    Each column ``name`` is renamed to ``name_new``. The renamed frame is
    discarded; only the timing is of interest.

    Args:
        df: Frame exposing ``columns`` and ``rename(mapping)``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution, which matters here
    # because the rename usually completes in a few milliseconds.
    start_time = time.perf_counter()
    dict_cols = {col: f"{col}_new" for col in df.columns}
    # Keep the result bound until after the timer stops so that freeing it
    # is not part of the measured interval.
    renamed = df.rename(dict_cols)
    duration = time.perf_counter() - start_time
    return duration

def transform_polars_drop_na(df):
    """Time how long it takes to drop the rows of ``df`` that contain nulls.

    The filtered frame is discarded; only the timing is of interest.

    Args:
        df: Frame exposing ``drop_nulls()``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    # Keep the result bound until after the timer stops so that freeing it
    # is not part of the measured interval.
    without_nulls = df.drop_nulls()
    duration = time.perf_counter() - start_time
    return duration

In [46]:
def get_file_size(path):
    """Return the size of the file at ``path`` in MiB, rounded to two decimals."""
    bytes_per_mib = 1024 * 1024
    size_in_bytes = os.path.getsize(path)
    return round(size_in_bytes / bytes_per_mib, 2)

In [47]:
def load_polars_csv(df, output_path):
    """Write ``df`` to ``output_path`` as CSV and report write time and file size.

    Args:
        df: Frame exposing ``write_csv(path)``.
        output_path: Destination file path.

    Returns:
        A ``(duration, file_size)`` tuple: elapsed write time in seconds and
        the size of the written file as reported by ``get_file_size``.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    df.write_csv(output_path)
    duration = time.perf_counter() - start_time

    # The size lookup happens after the timer stops so it is not measured.
    file_size = get_file_size(output_path)

    return duration, file_size

def load_polars_parquet(df, output_path):
    """Write ``df`` to ``output_path`` as Parquet and report write time and file size.

    Args:
        df: Frame exposing ``write_parquet(path)``.
        output_path: Destination file path.

    Returns:
        A ``(duration, file_size)`` tuple: elapsed write time in seconds and
        the size of the written file as reported by ``get_file_size``.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    df.write_parquet(output_path)
    duration = time.perf_counter() - start_time

    # The size lookup happens after the timer stops so it is not measured.
    file_size = get_file_size(output_path)

    return duration, file_size

def load_polars_avro(df, output_path):
    """Write ``df`` to ``output_path`` as Avro and report write time and file size.

    An Avro record schema is derived from the frame's column names and dtypes.
    Every field is declared as a ``["null", <type>]`` union so null values are
    accepted. Columns whose dtype has no entry in the mapping below are
    declared as ``"string"`` and their non-null values are converted with
    ``str()`` so that they match the declared type.

    The measured time covers schema generation, row iteration and the write.

    Args:
        df: polars DataFrame to serialise.
        output_path: Destination file path.

    Returns:
        A ``(duration, file_size)`` tuple: elapsed time in seconds and the
        size of the written file as reported by ``get_file_size``.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()

    dtype_to_avro = {
        pl.Utf8: "string",
        pl.Boolean: "boolean",
        pl.Int8: "int",
        pl.Int16: "int",
        pl.Int32: "int",
        pl.Int64: "long",
        pl.UInt8: "int",
        pl.UInt16: "int",
        pl.UInt32: "long",
        pl.Float32: "float",
        pl.Float64: "double",
    }

    fields = []
    # Columns that fall back to "string" and therefore need value conversion.
    stringified_cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        avro_type = dtype_to_avro.get(dtype)
        if avro_type is None:
            avro_type = "string"
            stringified_cols.append(col)
        fields.append({"name": col, "type": ["null", avro_type]})  # Allow nulls

    schema = {
        "type": "record",
        "name": "AutoGeneratedSchema",
        "fields": fields
    }
    parsed_schema = parse_schema(schema)

    def _iter_records():
        # Stream rows one at a time instead of materialising (and then
        # copying) the whole dataset as a list of dicts.
        for row in df.iter_rows(named=True):
            for col in stringified_cols:
                value = row[col]
                if value is not None:
                    row[col] = str(value)
            yield row

    with open(output_path, "wb") as avro_file:
        writer(avro_file, parsed_schema, _iter_records())

    duration = time.perf_counter() - start_time

    # The size lookup happens after the timer stops so it is not measured.
    file_size = get_file_size(output_path)

    return duration, file_size

In [48]:
# Benchmark driver: for every dataset, time the extract / transform / load
# steps with polars and collect the results into a small star schema
# (one dimension table describing the datasets, one fact table of metrics).
datasets = ['transactions_data.csv', 'titanic.csv', 'reviews.csv', 'locations.csv']
OUTPUT_DIR = "polars_analysis"

# Create the output folder up front so that the writers below do not fail
# when the notebook is run from a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

dim_datasets = []
fact_metrics = []

for primary_key, dataset in enumerate(datasets, start=1):
    # Dataset name = file name without directory or extension. splitext keeps
    # dotted names (e.g. "sales.2024.csv") distinct from one another.
    path = os.path.splitext(os.path.basename(dataset))[0]

    df_raw, extract_time = extract_polars(dataset)

    # Both transforms are timed against the raw frame; their results are discarded.
    transform_rename_columns = transform_polars_rename_columns(df_raw)
    transform_dropna = transform_polars_drop_na(df_raw)

    load_time_csv, file_size_csv = load_polars_csv(df_raw, f"{OUTPUT_DIR}/{path}.csv")
    load_time_parquet, file_size_parquet = load_polars_parquet(df_raw, f"{OUTPUT_DIR}/{path}.parquet")
    load_time_avro, file_size_avro = load_polars_avro(df_raw, f"{OUTPUT_DIR}/{path}.avro")

    dim_datasets.append({
        "id": primary_key,
        "dataset_name": path,
        "number_of_rows": len(df_raw)
    })

    fact_metrics.append({
        "dataset_id": primary_key,
        "extract_time": round(extract_time, 2),
        "transform_rename_columns_time": round(transform_rename_columns, 2),
        "transform_dropna_time": round(transform_dropna, 2),
        "load_time_csv": round(load_time_csv, 2),
        "file_size_csv_mb": file_size_csv,
        "load_time_parquet": round(load_time_parquet, 2),
        "file_size_parquet_mb": file_size_parquet,
        "load_time_avro": round(load_time_avro, 2),
        "file_size_avro_mb": file_size_avro
    })


dim_datasets_df = pl.DataFrame(dim_datasets)
fact_metrics_df = pl.DataFrame(fact_metrics)

dim_datasets_df.write_csv(f"{OUTPUT_DIR}/dim_datasets.csv", separator=';')
fact_metrics_df.write_csv(f"{OUTPUT_DIR}/fact_metrics_polars.csv", separator=';')

print("Process finished successfully!")

Process finished successfully!


In [49]:
# Reload the dataset dimension table written by the benchmark cell and show it.
dim_csv_path = "polars_analysis/dim_datasets.csv"
df_dim = pl.read_csv(dim_csv_path, separator=';')
print(df_dim)

shape: (4, 3)
┌─────┬───────────────────┬────────────────┐
│ id  ┆ dataset_name      ┆ number_of_rows │
│ --- ┆ ---               ┆ ---            │
│ i64 ┆ str               ┆ i64            │
╞═════╪═══════════════════╪════════════════╡
│ 1   ┆ transactions_data ┆ 13305915       │
│ 2   ┆ titanic           ┆ 891            │
│ 3   ┆ reviews           ┆ 703796         │
│ 4   ┆ locations         ┆ 844            │
└─────┴───────────────────┴────────────────┘


In [50]:
# Reload the metrics fact table written by the benchmark cell and show it.
fact_csv_path = "polars_analysis/fact_metrics_polars.csv"
df_fact = pl.read_csv(fact_csv_path, separator=';')
print(df_fact)

shape: (4, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ dataset_i ┆ extract_t ┆ transform ┆ transform ┆ … ┆ load_time ┆ file_size ┆ load_time ┆ file_siz │
│ d         ┆ ime       ┆ _rename_c ┆ _dropna_t ┆   ┆ _parquet  ┆ _parquet_ ┆ _avro     ┆ e_avro_m │
│ ---       ┆ ---       ┆ olumns_ti ┆ ime       ┆   ┆ ---       ┆ mb        ┆ ---       ┆ b        │
│ i64       ┆ f64       ┆ me        ┆ ---       ┆   ┆ f64       ┆ ---       ┆ f64       ┆ ---      │
│           ┆           ┆ ---       ┆ f64       ┆   ┆           ┆ f64       ┆           ┆ f64      │
│           ┆           ┆ f64       ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1         ┆ 12.11     ┆ 0.01      ┆ 1.2       ┆ … ┆ 5.49      ┆ 191.7     ┆ 775.61    ┆ 1133.18  │
│ 2         ┆ 0.05      ┆ 0.0       ┆ 0.06      ┆ … ┆ 0.01      ┆ 0.03      