In [0]:
%sql
-- Create the catalog used by this notebook; IF NOT EXISTS makes the cell safe to re-run.
CREATE CATALOG IF NOT EXISTS catalog_for_sales;

-- Make it the current catalog so the unqualified schema names below are created inside it.
USE CATALOG catalog_for_sales;

-- Create one schema per layer (bronze, silver, gold); each statement is a no-op if the schema already exists.
CREATE SCHEMA IF NOT EXISTS bronze_schema;
CREATE SCHEMA IF NOT EXISTS silver_schema;
CREATE SCHEMA IF NOT EXISTS gold_schema;


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, concat, lit, lpad, split, to_date, when

In [0]:
# Load the raw Parquet data from the bronze container.
# Parquet files embed their own schema, so no schema-inference option is
# passed: `inferSchema` is a CSV/JSON reader option and has no effect here.
df = (
    spark.read.format("parquet")
    .load("abfss://bronze@projectandstorage.dfs.core.windows.net/rawdata")
)

In [0]:
# Preview the first five rows of the loaded data.
df.show(n=5)

# Print the column names and types as read from the source files.
df.printSchema()

In [0]:
# Cast the measure and date-part columns to explicit numeric types.
# The chain is wrapped in parentheses instead of using backslash
# continuations, so no trailing "\" can silently swallow the next line.
# NOTE(review): values that cannot be parsed as numbers become NULL or raise,
# depending on the cluster's ANSI setting — confirm which is expected here.
df = (
    df.withColumn("Revenue", col("Revenue").cast(DoubleType()))
    .withColumn("Units_Sold", col("Units_Sold").cast(IntegerType()))
    .withColumn("Day", col("Day").cast(IntegerType()))
    .withColumn("Month", col("Month").cast(IntegerType()))
    .withColumn("Year", col("Year").cast(IntegerType()))
)

# Confirm the new column types.
df.printSchema()

# Render the casted DataFrame in the notebook.
display(df)

In [0]:
# Group on every column: any group holding more than one row is a set of
# exact duplicate records.
duplicates = df.groupBy(*df.columns).count().where("count>1")

if duplicates.count() == 0:
    print("No duplicates found")
else:
    print("Duplicates found:")
    duplicates.show()

# Keep a single copy of each fully identical row.
df = df.dropDuplicates()

In [0]:
# Flag nulls per cell (True where the value is null), one flag column per
# original column.
null_counts = df.select([col(c).isNull().alias(c) for c in df.columns])

# Total the flags for every column in ONE aggregation, so the data is scanned
# once instead of launching a separate Spark job per column.
# groupBy() with no keys is a global aggregate and always returns one row;
# sum() with no arguments sums every numeric column in schema order.
null_totals = (
    null_counts
    .select([col(c).cast("int").alias(c) for c in null_counts.columns])
    .groupBy()
    .sum()
    .first()
)

for column, null_count in zip(null_counts.columns, null_totals):
    # SUM over an empty DataFrame is NULL (None); treat that as "no nulls".
    if null_count is not None and null_count > 0:
        print(f"Null values found in column {column}: {null_count}")

# Largest fraction of a row's columns that may be null before the row is dropped.
MAX_NULL_FRACTION = 0.5
threshold = int(len(df.columns) * MAX_NULL_FRACTION)

# Count the nulls in each row and keep only rows at or below the threshold.
# `sum` here is Python's built-in: it adds the per-column Column expressions
# together into a single row-wise expression.
df = (
    df.withColumn("null_count", sum(col(c).isNull().cast("int") for c in df.columns))
    .filter(col("null_count") <= threshold)
    .drop("null_count")
)

In [0]:
# Build a Date column from the Year / Month / Day parts. Month and Day are
# left-padded to two digits so the string matches the "yyyy-MM-dd" pattern.
df = df.withColumn(
    "Date",
    to_date(
        concat(
            col("Year"),
            lit("-"),
            lpad(col("Month"), 2, "0"),
            lit("-"),
            lpad(col("Day"), 2, "0"),
        ),
        "yyyy-MM-dd",
    ),
)

# Model_Category is the portion of Model_ID before the first "-".
df = df.withColumn("Model_Category", split(col("Model_ID"), "-")[0])

# Revenue per unit. The division is guarded so rows with zero (or null)
# Units_Sold get NULL instead of raising a divide-by-zero error when the
# cluster runs with ANSI SQL mode enabled.
units_sold = col("Units_Sold")
df = df.withColumn("RevperUnit", when(units_sold != 0, col("Revenue") / units_sold))

# Render the enriched DataFrame in the notebook.
display(df)

In [0]:
# Write the cleaned data to the silver container in Delta format,
# replacing any data previously written to that location.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.save("abfss://silver@projectandstorage.dfs.core.windows.net/carsales")