In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

#### Data Reading from Silver Layer

In [0]:
# Load the cleaned (silver-layer) dataset from its Delta location and render it
# for a quick visual check.
df = spark.read.load(
    "/Volumes/netflix_catalog/netflix_schema/cleaned",
    format="delta",
)
display(df)

In [0]:
# Number of rows in `df`; as the last expression it is shown as the cell output.
df.count()

#### Creating Dimension Table

In [0]:
# Fully-qualified name of the "listed_in" dimension table, kept in one place so
# the existence check, the create, and both ALTER statements cannot drift apart.
DIM_LISTED_IN_TABLE = "netflix_catalog.netflix_schema.netflix_dim_listed_in"

# One-time bootstrap: when the dimension table is missing, create it empty and
# attach its constraint and key metadata. When it already exists, this cell
# does nothing.
if not spark.catalog.tableExists(DIM_LISTED_IN_TABLE):
    print("Creating Empty Dimension Table")

    # filter("1=0") keeps the two-column schema of `df` but selects no rows,
    # so saveAsTable creates a schema-only table.
    df_dim = df.select("show_id", "listed_in").filter("1=0")
    df_dim.write.saveAsTable(DIM_LISTED_IN_TABLE)

    # CHECK constraint requiring both columns to be non-NULL.
    spark.sql(f"""
        ALTER TABLE {DIM_LISTED_IN_TABLE}
        ADD CONSTRAINT all_not_nulls CHECK (
            show_id IS NOT NULL AND
            listed_in IS NOT NULL
        )
    """)

    # Record the intended composite key (show_id, listed_in) as a table property.
    # NOTE(review): this is SET TBLPROPERTIES, not ADD CONSTRAINT ... PRIMARY KEY,
    # so it only stores a metadata tag and does not declare a key constraint.
    # Uniqueness of (show_id, listed_in) has to be guaranteed by the load logic.
    spark.sql(f"""
        ALTER TABLE {DIM_LISTED_IN_TABLE}
        SET TBLPROPERTIES (
            primaryKey = 'show_id, listed_in'
        )
    """)

In [0]:
%sql
-- Inspect the dimension table's columns and its extended metadata.
DESCRIBE TABLE EXTENDED netflix_catalog.netflix_schema.netflix_dim_listed_in;

#### Loading Data into Dimension Table

In [0]:
# Project `df` down to the two columns the dimension table stores.
df_dim = df.select("show_id", "listed_in")

In [0]:
# Incremental load: collect the show_ids already stored in the dimension table,
# then keep only the rows of `df_dim` whose show_id is not among them.
existing = (
    spark.table("netflix_catalog.netflix_schema.netflix_dim_listed_in")
    .select("show_id")
)
new_df = df_dim.join(existing, "show_id", "left_anti")

In [0]:
# Preview the rows that are about to be loaded.
display(new_df)

In [0]:
# Turn the comma-separated `listed_in` string into one row per (show_id, value):
#   1. split on "," and explode so each token gets its own row,
#   2. trim the whitespace left around each token,
#   3. drop empty tokens (e.g. from a trailing or doubled comma), which would
#      otherwise pass the table's NOT NULL check as meaningless "" values,
#   4. de-duplicate on (show_id, listed_in), the pair the dimension table
#      records as its key, so a repeated value or a duplicated source row
#      cannot produce duplicate key rows in the append that follows.
new_df = (
    new_df
    .withColumn("listed_in", explode(split(col("listed_in"), ",")))
    .withColumn("listed_in", trim(col("listed_in")))
    .filter(col("listed_in") != "")
    .dropDuplicates(["show_id", "listed_in"])
)

In [0]:
# Append the prepared rows to the dimension table.
new_df.write.saveAsTable(
    "netflix_catalog.netflix_schema.netflix_dim_listed_in",
    mode="append",
)

In [0]:
%sql
-- Show the rows currently stored in the dimension table.
SELECT *
FROM netflix_catalog.netflix_schema.netflix_dim_listed_in

In [0]:
%sql
-- Total number of rows in the dimension table.
SELECT
  COUNT(*) AS total_rows
FROM netflix_catalog.netflix_schema.netflix_dim_listed_in

In [0]:
%sql
-- Number of distinct shows in the dimension table. The alias says "shows"
-- because this counts distinct show_id values, not table rows.
SELECT
  COUNT(DISTINCT show_id) AS total_shows
FROM netflix_catalog.netflix_schema.netflix_dim_listed_in