# Data reading

In [0]:
# Declare the "p_ingestion_date" text widget (blank default) and read its current
# value; later cells use it to pick the bronze folder and to stamp each row.
dbutils.widgets.text("p_ingestion_date", "")
v_ingest_date = dbutils.widgets.get("p_ingestion_date")

In [0]:
# Load the raw Parquet data for the requested ingestion date from the bronze container.
# Parquet files store their own schema, so no schema-inference option is passed
# ("inferSchema" is a CSV/JSON reader setting and has no effect on Parquet).
df = spark.read.format("parquet") \
              .load(f"abfss://bronze@cardeprojectdl.dfs.core.windows.net/rawdata/{v_ingest_date}")

In [0]:
# Render the freshly loaded bronze data.
display(df)

# Data transformation

In [0]:
from pyspark.sql.functions import split,col,count,sum,lit
from pyspark.sql.types import StringType

In [0]:
# model_category is the first element of Model_ID after splitting it on '-'.
df = df.withColumn(
    'model_category',
    split(col("Model_ID"), '-').getItem(0),
)

In [0]:
# NOTE: the DataFrame returned by withColumn is not assigned back to `df`, so this
# cell only previews Units_Sold cast to string; `df` itself keeps its original
# Units_Sold type for every later cell.
df.withColumn('Units_Sold', col('Units_Sold').cast(StringType()))

DataFrame[Branch_ID: string, Dealer_ID: string, Model_ID: string, Revenue: bigint, Units_Sold: string, Date_ID: string, Day: int, Month: int, Year: int, BranchName: string, DealerName: string, model_category: string]

In [0]:
# Print the current schema of `df` as a tree (column name, type, nullability).
df.printSchema()

root
 |-- Branch_ID: string (nullable = true)
 |-- Dealer_ID: string (nullable = true)
 |-- Model_ID: string (nullable = true)
 |-- Revenue: long (nullable = true)
 |-- Units_Sold: long (nullable = true)
 |-- Date_ID: string (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- BranchName: string (nullable = true)
 |-- DealerName: string (nullable = true)
 |-- model_category: string (nullable = true)



In [0]:
# Add RevPerUnit = Revenue / Units_Sold for every row.
# NOTE(review): rows with Units_Sold = 0 presumably yield NULL (or raise when
# spark.sql.ansi.enabled is true) — confirm this is acceptable for the silver layer.
df = df.withColumn(
    'RevPerUnit',
    col('Revenue') / col('Units_Sold'),
)

In [0]:
# Inspect the data with the new RevPerUnit column.
display(df)

In [0]:
# Stamp every row with the ingestion date taken from the notebook widget, as a
# constant literal column.
df = df.withColumn(
    "ingestion_date",
    lit(v_ingest_date),
)

In [0]:
# Inspect the data with the ingestion_date column added.
display(df)

# Data aggregation

In [0]:
# Total units sold per (Year, BranchName), ordered by Year ascending and, within
# each year, by Total_Units descending.
# `sum` here is pyspark.sql.functions.sum from the import cell, not the Python builtin.
display(
    df.groupBy("Year", 'BranchName')
      .agg(sum('Units_Sold').alias('Total_Units'))
      .sort('Year', 'Total_Units', ascending=[True, False])
)

# Data Writing

In [0]:
# %sql
# DROP TABLE cars_catalog.silver.silver_table;

In [0]:
# if spark.catalog.tableExists("cars_catalog.silver.silver_table"):
#   silver_df = spark.read.format("parquet") \
#               .load(f"abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales")
#   df = df.union(silver_df)

In [0]:
# Append the transformed batch to cars_catalog.silver.silver_table, stored as Parquet
# at the silver container path below.
# NOTE: mode("append") adds rows on every execution, so re-running this cell for the
# same batch writes the same rows again.
(
    df.write
      .format("parquet")
      .mode("append")
      .option("path", 'abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales')
      .saveAsTable("cars_catalog.silver.silver_table")
)

In [0]:
# Show the batch that was just written to the silver table.
display(df)

In [0]:
# df.write.format("parquet") \
#     .mode("overwrite") \
#     .option("path", 'abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales') \
#     .save()

In [0]:
# from delta.tables import DeltaTable
# # spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

# if spark.catalog.tableExists(f"cars_catalog.silver.silver_table"):
#     # deltaTable = DeltaTable.forPath(spark, "cars_catalog.silver.silver_table")
#     # target_table = spark.read.format("")
#     target_table = spark.read.format("parquet") \
#               .load(f"abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales")

#     target_table.alias("tgt").merge(
#             df.alias("src"),
#         "tgt.Branch_ID = src.Branch_ID and tgt.Dealer_ID = src.Dealer_ID and tgt.Model_ID = src.Model_ID and tgt.Date_ID = src.Date_ID"
#     ) \
#         .whenMatchedUpdateAll() \
#         .whenNotMatchedInsertAll() \
#         .execute()
# else:
#     df.write.mode("overwrite").format("parquet").option("path", 'abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales').saveAsTable("cars_catalog.silver.silver_table")

# Querying Silver Data

In [0]:
%sql
-- Query the silver Parquet files directly by storage path.
SELECT
  *
FROM
  PARQUET.`abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales`

In [0]:
%sql
-- Query the same silver data through its catalog table name.
SELECT
  *
FROM
  cars_catalog.silver.silver_table

In [0]:
# %sql
# DROP TABLE cars_catalog.silver.silver_table;