In [10]:
!pip3 install fastavro


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import pandas as pd
from fastavro.schema import parse_schema
from fastavro import writer
import time
import os

In [12]:
def extract_pandas(file_path):
    start_time = time.time()
    df = pd.read_csv(file_path, header=1)
    duration = time.time() - start_time
    return df, duration

In [13]:
def transform_pandas_rename_columns(df):
    start_time = time.time()
    col_names = df.columns
    dict_cols = {col: f"{col}_new" for col in col_names}
    df = df.rename(columns=dict_cols)
    duration = time.time() - start_time
    return duration

def transform_pandas_drop_na(df):
    start_time = time.time()
    df = df.dropna()
    duration = time.time() - start_time
    return duration

In [14]:
# def get_file_size(path):
#   return round(sum(os.path.getsize(f) for f in os.listdir('.') if os.path.isfile(f)) / (1024 * 1024),2)

def get_file_size(path):
    return round(os.path.getsize(path) / (1024 * 1024), 2)

In [15]:
def load_pandas_csv(df, output_path):
    start_time = time.time()
    df.to_csv(output_path, index=False)
    duration = time.time() - start_time

    file_size = get_file_size(output_path)

    return duration, file_size


def load_pandas_parquet(df, output_path):
    start_time = time.time()
    df.to_parquet(output_path, index=False)
    duration = time.time() - start_time

    file_size = get_file_size(output_path)

    return duration, file_size

def load_pandas_avro(df, output_path):
    start_time = time.time()

    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    dtype_to_avro = {
        "object": "string",
        "int64": "string",
        "float64": "string",
        "bool": "string"
    }

    for col in df.columns:
        df[col] = df[col].fillna("").astype(str)

    fields = [
        {"name": col, "type": dtype_to_avro[str(dtype)]}
        for col, dtype in zip(df.columns, df.dtypes)
    ]

    schema = {
        "type": "record",
        "name": "AutoGeneratedSchema",
        "fields": fields
    }

    parsed_schema = parse_schema(schema)

    records = df.to_dict(orient="records")

    with open(output_path, "wb") as avro_file:
        writer(avro_file, parsed_schema, records)

    duration = time.time() - start_time

    file_size = get_file_size(output_path)

    return duration, file_size

def load_pandas_orc(df, output_path):
    start_time = time.time()
    df.to_orc(output_path, index=False)
    duration = time.time() - start_time

    file_size = get_file_size(output_path)

    return duration, file_size


In [16]:
datasets = ['transactions_data.csv', 'titanic.csv', 'reviews.csv', 'locations.csv']

dim_datasets = []
fact_metrics = []

for index, dataset in enumerate(datasets):
    primary_key = index + 1
    path = dataset.split('.')[0]

    df_raw, extract_time = extract_pandas(dataset)

    transform_rename_columns = transform_pandas_rename_columns(df_raw)
    transform_dropna = transform_pandas_drop_na(df_raw)

    load_time_csv, file_size_csv = load_pandas_csv(df_raw, f"pandas_analysis/{path}.csv")
    load_time_parquet, file_size_parquet = load_pandas_parquet(df_raw, f"pandas_analysis/{path}.parquet")
    load_time_avro, file_size_avro = load_pandas_avro(df_raw, f"pandas_analysis/{path}.avro")
    load_time_orc, file_size_orc = load_pandas_orc(df_raw, f"pandas_analysis/{path}.orc")

    dim_datasets.append({
        "id": primary_key,
        "dataset_name": path,
        "number_of_rows": len(df_raw)
    })

    fact_metrics.append({
        "dataset_id": primary_key,
        "extract_time": round(extract_time,2),
        "transform_rename_columns_time": round(transform_rename_columns,2),
        "transform_dropna_time": round(transform_dropna,2),
        "load_time_csv": round(load_time_csv,2),
        "file_size_csv_mb": file_size_csv,
        "load_time_parquet": round(load_time_parquet,2),
        "file_size_parquet_mb": file_size_parquet,
        "load_time_avro": round(load_time_avro,2),
        "file_size_avro_mb": file_size_avro,
        "load_time_orc": round(load_time_orc,2),
        "file_size_orc_mb": file_size_orc
    })


dim_datasets_df = pd.DataFrame(dim_datasets)
fact_metrics_df = pd.DataFrame(fact_metrics)

dim_datasets_df.to_csv("pandas_analysis/dim_datasets.csv", header=True, mode='w', sep=';', index = False)
fact_metrics_df.to_csv("pandas_analysis/fact_metrics_pandas.csv", header=True, mode='w', sep=';', index = False)

print("Process finished successfully!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Process finished successfully!


In [17]:
df_dim = pd.read_csv("pandas_analysis/dim_datasets.csv", sep=';')
df_dim

Unnamed: 0,id,dataset_name,number_of_rows
0,1,transactions_data,13305914
1,2,titanic,890
2,3,reviews,703795
3,4,locations,843


In [18]:
df_fact = pd.read_csv("pandas_analysis/fact_metrics_pandas.csv", sep=';')
df_fact

Unnamed: 0,dataset_id,extract_time,transform_rename_columns_time,transform_dropna_time,load_time_csv,file_size_csv_mb,load_time_parquet,file_size_parquet_mb,load_time_avro,file_size_avro_mb,load_time_orc,file_size_orc_mb
0,1,35.97,2.04,6.11,98.41,1200.23,19.09,243.48,587.66,1185.7,22.21,942.83
1,2,0.03,0.0,0.0,0.01,0.06,0.02,0.04,0.05,0.06,0.01,0.05
2,3,0.3,0.01,0.05,1.21,14.56,0.25,1.64,6.11,14.58,0.17,7.3
3,4,0.02,0.0,0.0,0.03,0.35,0.03,0.2,0.1,0.35,0.04,0.33
