In [0]:
# Mount the ADLS Gen2 "landingzone2" container at /mnt/ecomdata1.
# The mount is only attempted when the mount point is not already listed,
# so re-running this cell is safe.
mount_point = "/mnt/ecomdata1"

if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    print(f"Mount point {mount_point} already exists.")
else:
    # Service-principal credentials are read from a Databricks secret scope
    # instead of being pasted into the notebook, so they never end up in the
    # notebook source, its revision history, or exported copies.
    # NOTE(review): the scope and key names below are placeholders — replace
    # them with the ones configured for this workspace.
    secret_scope = "your-secret-scope"
    client_id = dbutils.secrets.get(scope=secret_scope, key="your-client-id-key")
    client_secret = dbutils.secrets.get(scope=secret_scope, key="your-client-secret-key")
    tenant_id = dbutils.secrets.get(scope=secret_scope, key="your-tenant-id-key")

    # OAuth client-credentials configuration for the ABFS driver.
    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": client_id,
        "fs.azure.account.oauth2.client.secret": client_secret,
        "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    }

    dbutils.fs.mount(
        source="abfss://landingzone2@ecommerceversion007.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs
    )
    print("Storage mounted successfully.")


Mount point /mnt/ecomdata1 already exists.


In [0]:
%fs ls 'mnt/ecomdata1/'

path,name,size,modificationTime
dbfs:/mnt/ecomdata1/buyers_raw_2/,buyers_raw_2/,0,1737860552000
dbfs:/mnt/ecomdata1/countries_raw_2/,countries_raw_2/,0,1737860564000
dbfs:/mnt/ecomdata1/sellers_raw_2/,sellers_raw_2/,0,1737860541000
dbfs:/mnt/ecomdata1/users_raw_2/,users_raw_2/,0,1737860532000


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import * 

In [0]:
# Obtain the Spark session for this notebook, labelled "ecomdata";
# getOrCreate() hands back an existing session when there is one.
session_builder = SparkSession.builder.appName("ecomdata")
spark = session_builder.getOrCreate()

In [0]:
# Bare expression: displays the session object as the cell output.
spark

In [0]:
# Read the raw users parquet files from the /mnt/ecomdata1/users_raw_2 folder.
# Parquet files carry their own schema, so the CSV-only 'inferSchema' and
# 'header' options have no effect on this reader and are not set.
userDF = spark.read.format('parquet').load('/mnt/ecomdata1/users_raw_2')

# Placeholder for transformations, should any be needed before loading to Bronze.
# Example: userDF = userDF.withColumn("processed_at", current_timestamp())

In [0]:
# Peek at the first two user rows.
userDF.show(n=2)

+--------------------+----+----------+--------+-----------------+---------------+-------------------+--------------+------------+----------------+--------------+--------------+------+----------------+-------------+---------+-------------+---------+-----------------+------------------+---------+-----------------+----------------+-----------+
|      identifierHash|type|   country|language|socialNbFollowers|socialNbFollows|socialProductsLiked|productsListed|productsSold|productsPassRate|productsWished|productsBought|gender|civilityGenderId|civilityTitle|hasAnyApp|hasAndroidApp|hasIosApp|hasProfilePicture|daysSinceLastLogin|seniority|seniorityAsMonths|seniorityAsYears|countryCode|
+--------------------+----+----------+--------+-----------------+---------------+-------------------+--------------+------------+----------------+--------------+--------------+------+----------------+-------------+---------+-------------+---------+-----------------+------------------+---------+-----------------+-

In [0]:
# Print the column names, types and nullability of the users DataFrame.
# NOTE(review): the captured output lists the visible columns as string,
# including numeric-looking ones such as socialNbFollowers — presumably they
# need casting in a later stage; confirm.
userDF.printSchema()

root
 |-- identifierHash: string (nullable = true)
 |-- type: string (nullable = true)
 |-- country: string (nullable = true)
 |-- language: string (nullable = true)
 |-- socialNbFollowers: string (nullable = true)
 |-- socialNbFollows: string (nullable = true)
 |-- socialProductsLiked: string (nullable = true)
 |-- productsListed: string (nullable = true)
 |-- productsSold: string (nullable = true)
 |-- productsPassRate: string (nullable = true)
 |-- productsWished: string (nullable = true)
 |-- productsBought: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- civilityGenderId: string (nullable = true)
 |-- civilityTitle: string (nullable = true)
 |-- hasAnyApp: string (nullable = true)
 |-- hasAndroidApp: string (nullable = true)
 |-- hasIosApp: string (nullable = true)
 |-- hasProfilePicture: string (nullable = true)
 |-- daysSinceLastLogin: string (nullable = true)
 |-- seniority: string (nullable = true)
 |-- seniorityAsMonths: string (nullable = true)
 |-- senio

In [0]:
def _read_raw_parquet(folder_name):
    """Load one raw landing-zone folder under /mnt/ecomdata1 as a DataFrame.

    Args:
        folder_name: Name of the folder directly under /mnt/ecomdata1.

    Returns:
        The DataFrame read from the parquet files in that folder.
    """
    # No 'inferSchema'/'header' options: those are CSV reader settings and
    # have no effect on parquet, whose files carry their own schema.
    return spark.read.format('parquet').load(f'/mnt/ecomdata1/{folder_name}')


countriesDF = _read_raw_parquet('countries_raw_2')
sellersDF = _read_raw_parquet('sellers_raw_2')
buyersDF = _read_raw_parquet('buyers_raw_2')

In [0]:
from delta.tables import DeltaTable

# Storage location of the Bronze-layer users table.
bronze_users_path = '/mnt/delta/bronze/tables/users'

if not DeltaTable.isDeltaTable(spark, bronze_users_path):
    # Nothing to merge into yet: write the current users data as a new Delta table.
    userDF.write.format("delta").mode("overwrite").save(bronze_users_path)
    print("Target Delta table created successfully.")
else:
    # The table already exists: upsert the current users data into it,
    # matching rows on identifierHash.
    bronze_table = DeltaTable.forPath(spark, bronze_users_path)
    users_upsert = (
        bronze_table.alias("target")
        .merge(userDF.alias("source"), "target.identifierHash = source.identifierHash")
        .whenMatchedUpdateAll()      # hash already present: overwrite all columns from source
        .whenNotMatchedInsertAll()   # hash not present: insert the source row
    )
    users_upsert.execute()

    print("Merge operation completed successfully.")

Merge operation completed successfully.


In [0]:
# List the files that make up the Bronze users Delta table.
display(dbutils.fs.ls('/mnt/delta/bronze/tables/users'))

path,name,size,modificationTime
dbfs:/mnt/delta/bronze/tables/users/_delta_log/,_delta_log/,0,1737877583000
dbfs:/mnt/delta/bronze/tables/users/part-00000-91d03433-7d0e-492b-b0d5-533f6664b824.c000.snappy.parquet,part-00000-91d03433-7d0e-492b-b0d5-533f6664b824.c000.snappy.parquet,479600,1737918883000
dbfs:/mnt/delta/bronze/tables/users/part-00000-a1695dba-7d9b-4797-97b5-2ace33e01492.c000.snappy.parquet,part-00000-a1695dba-7d9b-4797-97b5-2ace33e01492.c000.snappy.parquet,479600,1737937294000
dbfs:/mnt/delta/bronze/tables/users/part-00000-ceaa8b3d-bf9c-4700-b436-434e60ade99b.c000.snappy.parquet,part-00000-ceaa8b3d-bf9c-4700-b436-434e60ade99b.c000.snappy.parquet,479600,1737877585000


In [0]:
# Write each remaining raw DataFrame to its Bronze Delta location,
# replacing whatever is currently stored there.
bronze_targets = (
    (buyersDF, '/mnt/delta/bronze/tables/buyers'),
    (sellersDF, '/mnt/delta/bronze/tables/sellers'),
    (countriesDF, '/mnt/delta/bronze/tables/countries'),
)
for raw_df, bronze_path in bronze_targets:
    raw_df.write.format('delta').mode('overwrite').save(bronze_path)