# 1. Unzip the raw files found in 'raw-data/Rebrickable' into 'raw-data/Unzipped Rebrickable'

In [0]:
# Azure storage account details
# `account_name` is interpolated into the Spark config key and into every abfss:// path below;
# `account_key` is the value handed to spark.conf.set for that key.
# NOTE(review): the key is a hardcoded literal in the notebook source. Prefer loading it from a
# secret store (e.g. a Databricks secret scope) so it never lands in version control or exports
# -- scope/key names are not visible here, confirm with the workspace owner.
account_name = "***********"
account_key = "******************"

In [0]:
from pyspark.sql import SparkSession
from datetime import datetime
import gzip
import io
from pyspark.sql.functions import col
import pandas as pd

# Create the Spark session first (getOrCreate reuses an already-running session if there is one)
# so that the spark.conf.set call below does not rely on a `spark` global injected by the runtime.
spark = SparkSession.builder \
    .appName("ADLS Integration") \
    .getOrCreate()

# Register the storage account key so abfss:// paths on this account can be read and written.
spark.conf.set(f"fs.azure.account.key.{account_name}.dfs.core.windows.net", account_key)

# Capture today's date once and derive the zero-padded partition components from that single
# value, so Year=/Month=/Day= always describe the same day.
# NOTE(review): datetime.today() uses the driver's local clock -- confirm it matches the
# timezone used by whatever job writes the Year=/Month=/Day= source folders.
today = datetime.today()
year = today.strftime('%Y')
month = today.strftime('%m')
day = today.strftime('%d')

In [0]:
def read_and_unzip_csv(file_path):
    """
    Read a gzip-compressed CSV file and return its contents as a Spark DataFrame.

    The raw bytes are fetched through Spark's ``binaryFile`` source, decompressed
    on the driver, parsed with pandas and converted to a Spark DataFrame. In this
    notebook it is called for files under raw-data/Rebrickable/..., whose unzipped
    copies are then saved into raw-data/Unzipped Rebrickable/...

    Parameters
    ----------
    file_path : str
        Path of a single ``.csv.gz`` file that Spark can load.

    Returns
    -------
    pyspark.sql.DataFrame
        The parsed CSV content (first CSV line is used as the header).

    Raises
    ------
    FileNotFoundError
        If the load succeeds but yields no file row for ``file_path``.
    """
    # Only the `content` column of one row is needed; avoid collecting anything else.
    rows = spark.read.format("binaryFile").load(file_path).select("content").take(1)
    if not rows:
        raise FileNotFoundError(f"No file found at {file_path}")
    compressed_data = rows[0].content

    # Hand the gzip stream straight to pandas instead of materialising the decompressed
    # bytes, the decoded string and a StringIO copy of it one after another.
    # NOTE(review): pandas infers column dtypes here, so e.g. an integer column that
    # contains blanks is parsed as float -- confirm that is acceptable for a pure
    # "unzip" step, or pass dtype=str if values must be kept verbatim.
    with gzip.GzipFile(fileobj=io.BytesIO(compressed_data)) as gz:
        df = pd.read_csv(gz, encoding="utf-8")

    return spark.createDataFrame(df)

In [0]:

# Datasets to process. Each name is used both as the folder name and as the file stem.
# ("elements" was previously listed twice, which made the loop process it twice.)
dataset_names = ["themes", "colors", "part_categories", "parts", "part_relationships", "elements", "sets",
                 "minifigs", "inventories", "inventory_parts", "inventory_sets", "inventory_minifigs"]

# Datasets that could not be read or saved; reported once the loop has finished.
failed_datasets = []

# Loop through each dataset: read the gzip-compressed CSV, write it back uncompressed,
# and rename Spark's part file to <dataset>.csv
for dataset in dataset_names:
    # Define the file path for the compressed CSV file
    file_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Rebrickable/Lego/{dataset}/Year={year}/Month={month}/Day={day}/{dataset}.csv.gz"

    # Define the target directory for saving unzipped CSV files
    target_dir = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/{dataset}/Year={year}/Month={month}/Day={day}"

    # Define the final output path for the renamed file
    final_output_path = f"{target_dir}/{dataset}.csv"

    # Spark writes a directory of part files, so write there first and move the single part out
    temp_dir = f"{target_dir}/temp"

    print(f"Attempting to read: {file_path}")

    try:
        df = read_and_unzip_csv(file_path)

        # coalesce(1) so that exactly one part file is produced in the temporary directory
        df.coalesce(1).write.mode("overwrite").csv(temp_dir, header=True)

        # Find the part file in the temporary directory and rename it
        part_files = [f.path for f in dbutils.fs.ls(temp_dir) if f.name.startswith("part-")]
        if not part_files:
            raise FileNotFoundError(f"No part file was written to {temp_dir}")
        dbutils.fs.mv(part_files[0], final_output_path)

        print(f"Successfully saved {dataset} DataFrame as {final_output_path}")

    except Exception as e:
        # Best-effort: one failing dataset must not stop the remaining ones.
        print(f"Error reading or saving {file_path}: {e}")
        failed_datasets.append(dataset)

    finally:
        # Always remove the temporary directory, including after a failed write or move,
        # so no stray temp/ folder is left next to the output file.
        try:
            dbutils.fs.rm(temp_dir, True)
        except Exception as cleanup_error:
            print(f"Warning: could not remove {temp_dir}: {cleanup_error}")

# Make failures visible in one place instead of leaving them scattered through the log.
# NOTE(review): raise here instead if a scheduled run should be marked as failed.
if failed_datasets:
    print(f"Failed datasets ({len(failed_datasets)}/{len(dataset_names)}): {', '.join(failed_datasets)}")


Attempting to read: abfss://raw-data@atomicatraining.dfs.core.windows.net/Rebrickable/Lego/themes/Year=2024/Month=11/Day=08/themes.csv.gz
Successfully saved themes DataFrame as abfss://raw-data@atomicatraining.dfs.core.windows.net/Unzipped Rebrickable/Lego/themes/Year=2024/Month=11/Day=08/themes.csv
Attempting to read: abfss://raw-data@atomicatraining.dfs.core.windows.net/Rebrickable/Lego/colors/Year=2024/Month=11/Day=08/colors.csv.gz
Successfully saved colors DataFrame as abfss://raw-data@atomicatraining.dfs.core.windows.net/Unzipped Rebrickable/Lego/colors/Year=2024/Month=11/Day=08/colors.csv
Attempting to read: abfss://raw-data@atomicatraining.dfs.core.windows.net/Rebrickable/Lego/part_categories/Year=2024/Month=11/Day=08/part_categories.csv.gz
Successfully saved part_categories DataFrame as abfss://raw-data@atomicatraining.dfs.core.windows.net/Unzipped Rebrickable/Lego/part_categories/Year=2024/Month=11/Day=08/part_categories.csv
Attempting to read: abfss://raw-data@atomicatraining