# Init

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
%run ./utils

In [0]:
# Root of the Backblaze drive data in Azure storage, addressed as an abfss:// URI.
# The Spark dataset list below is built from this prefix.
BASE_ABFSS_PATH = "abfss://backblaze@diplomadata.dfs.core.windows.net/data/backblaze_drive"
# /dbfs/mnt path used to build the Polars dataset list below.
# NOTE(review): presumably a mount/copy of the same data as BASE_ABFSS_PATH — the
# Polars loop swaps one prefix for the other when calling get_data_info; confirm.
BASE_DBFS_PATH = "/dbfs/mnt/backblaze"
# Not referenced by any cell visible in this notebook; presumably read by the
# helpers loaded via %run (e.g. as a repeat count) — TODO confirm.
MAX_EXECUTION_COUNT = 1

In [0]:
# Capture the cluster description once; the Polars benchmark loop below joins it
# onto every result table. get_cluster_info is not defined in this notebook —
# presumably it comes from the ./utils notebook run above (TODO confirm).
cluster_info_df = get_cluster_info()
display(cluster_info_df)

# Benchmarks

# Polars

In [0]:
# Dormant helper: recursive copy of the 2023/Q4 folder from the abfss:// location
# to dbfs:/mnt/backblaze. Left commented out so a full run does not repeat the copy;
# uncomment only when the DBFS-side folder needs to be (re)populated.
# dbutils.fs.cp(
#     "abfss://backblaze@diplomadata.dfs.core.windows.net/data/backblaze_drive/2023/Q4",
#     "dbfs:/mnt/backblaze/2023/Q4",
#     recurse=True
# )

In [0]:
# Polars benchmark inputs, addressed through the DBFS path.
# Each entry carries:
#   data_path    - single path/glob handed to the group_by task
#   data_folders - list of paths/globs handed to the join and union tasks
# The data_path selections widen from entry to entry: ten-ish days of January,
# all of January, the whole Q1 folder, the whole 2023 folder.
datasets = [
    dict(
        data_path=f"{BASE_DBFS_PATH}/2023/Q1/2023-01-1*.csv",
        data_folders=[
            f"{BASE_DBFS_PATH}/2023/Q1/2023-01-11.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-02-12.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-03-13.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_DBFS_PATH}/2023/Q1/2023-01-*.csv",
        data_folders=[
            f"{BASE_DBFS_PATH}/2023/Q1/2023-01-0*.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-02-1*.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-03-2*.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-03-3*.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_DBFS_PATH}/2023/Q1/",
        data_folders=[
            f"{BASE_DBFS_PATH}/2023/Q1/2023-01-*.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-02-*.csv",
            f"{BASE_DBFS_PATH}/2023/Q1/2023-03-*.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_DBFS_PATH}/2023/",
        data_folders=[
            f"{BASE_DBFS_PATH}/2023/Q1/",
            f"{BASE_DBFS_PATH}/2023/Q2/",
            f"{BASE_DBFS_PATH}/2023/Q3/",
            f"{BASE_DBFS_PATH}/2023/Q4/",
        ],
    ),
]

In [0]:
%run ./polars_actions

In [0]:
# Benchmark object whose group_by_model / join_adjacent_days / union_and_aggregate
# methods are timed in the loop below. PolarsBenchmark is not defined in this
# notebook — presumably it comes from the ./polars_actions notebook run above.
pb = PolarsBenchmark()

In [0]:
# Run the three Polars benchmark tasks against every dataset and collect one
# result table per dataset.
# NOTE: despite its name, result_df is a plain Python list of DataFrames.
result_df = []
for dataset in datasets:
    result = []
    data_path = dataset['data_path']
    data_folders = dataset['data_folders']
    print(f"Processing dataset {data_path}")

    # group_by works on the single path/glob; join and union take the list of inputs.
    result.extend(run_benchmark("group_by", pb.group_by_model, data_path))
    result.extend(run_benchmark("join", pb.join_adjacent_days, data_folders))
    result.extend(run_benchmark("union", pb.union_and_aggregate, data_folders))
    print(result)
    print("Data processing")
    # get_data_info is handed the abfss:// form of the path, so the DBFS prefix
    # is swapped for the ABFSS one.
    data_info_df = get_data_info(data_path.replace(BASE_DBFS_PATH, BASE_ABFSS_PATH))
    print("Result processing")
    # No join key is given: assuming these are Spark DataFrames, every benchmark
    # row is paired with the cluster and data-info rows (TODO confirm both are
    # single-row frames).
    # The third column is named "success" so this table has the same schema as
    # the one produced by the Spark benchmark loop.
    res_df = (
        cluster_info_df
        .join(data_info_df)
        .join(spark.createDataFrame(result, ["task_name", "duration", "success"]))
    )
    result_df.append(res_df)
    display(res_df)

In [0]:
# Scratch check: builds a small results table from hard-coded
# (task_name, duration, success) tuples and displays it. The third column is
# spelled "success" to match the schema used by the Spark benchmark loop.
display(
    spark.createDataFrame(
        [('join', 1.11, True), ('join', 1.01, True), ('join', 0.1, True)],
        ["task_name", "duration", "success"]
    )
)

# Spark

In [0]:
%run ./spark_actions

In [0]:
# Benchmark object built around the active `spark` session; its group_by_model /
# join_adjacent_days / union_and_aggregate methods are timed in the loop below.
# SparkBenchmark is not defined in this notebook — presumably it comes from the
# ./spark_actions notebook run above.
sb = SparkBenchmark(spark)

In [0]:
# Spark benchmark inputs, addressed through abfss:// URIs.
# This rebinds `datasets`, replacing the Polars list defined earlier.
# Each entry carries:
#   data_path    - single glob handed to the group_by task
#   data_folders - list of globs handed to the join and union tasks
datasets = [
    dict(
        data_path=f"{BASE_ABFSS_PATH}/2023/Q1/2023-01-1*.csv",
        data_folders=[
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-01-11.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-02-12.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-03-13.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_ABFSS_PATH}/2023/Q1/2023-01-*.csv",
        data_folders=[
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-01-0*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-02-1*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-03-2*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-03-3*.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_ABFSS_PATH}/2023/Q1/*.csv",
        data_folders=[
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-01-*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-02-*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q1/2023-03-*.csv",
        ],
    ),
    dict(
        data_path=f"{BASE_ABFSS_PATH}/2023/*/*.csv",
        data_folders=[
            f"{BASE_ABFSS_PATH}/2023/Q1/*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q2/*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q3/*.csv",
            f"{BASE_ABFSS_PATH}/2023/Q4/*.csv",
        ],
    ),
]

In [0]:
# Time the three Spark tasks on every dataset and keep one result table per
# dataset. result_df is a plain list of DataFrames, rebuilt from scratch here.
result_df = []
for dataset in datasets:
    data_path, data_folders = dataset['data_path'], dataset['data_folders']
    print(f"Processing dataset {data_path}")

    # Tasks run in the listed order; whatever each run_benchmark call yields is
    # appended to the flat `result` list.
    result = [
        row
        for label, action, task_input in (
            ("group_by", sb.group_by_model, data_path),
            ("join", sb.join_adjacent_days, data_folders),
            ("union", sb.union_and_aggregate, data_folders),
        )
        for row in run_benchmark(label, action, task_input)
    ]

    # Data-info lookup is switched off for the Spark run:
    # print("Data processing")
    # data_info_df = get_data_info(data_path)
    print("Result processing")
    res_df = spark.createDataFrame(
        result,
        schema=["task_name", "duration", "success"],
    )
    result_df.append(res_df)
    display(res_df)