### Bronze Tables

In [0]:
%sql
-- Preview: bronze name_basics.
-- NOTE(review): the first three previews are catalog-qualified (workspace.*)
-- while the remaining ones rely on the session's current catalog -- confirm
-- both forms resolve to the same schema.
SELECT *
FROM workspace.imdb_final_project.bronze_name_basics

In [0]:
%sql
-- Preview: bronze title_akas.
SELECT *
FROM workspace.imdb_final_project.bronze_title_akas

In [0]:
%sql
-- Preview: bronze title_basics.
SELECT *
FROM workspace.imdb_final_project.bronze_title_basics

In [0]:
%sql
-- Preview: bronze title_crew.
SELECT *
FROM imdb_final_project.bronze_title_crew

In [0]:
%sql
-- Preview: bronze title_episode.
SELECT *
FROM imdb_final_project.bronze_title_episode

In [0]:
%sql
-- Preview: bronze title_principals.
SELECT *
FROM imdb_final_project.bronze_title_principals

In [0]:
%sql
-- Preview: bronze title_ratings.
SELECT *
FROM imdb_final_project.bronze_title_ratings

### Row Counts for Bronze Tables

In [0]:
%sql
-- Bronze layer: one result row per table holding its row count, ordered by table name.
SELECT 'bronze_name_basics' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_name_basics
UNION ALL
SELECT 'bronze_title_akas' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_akas
UNION ALL
SELECT 'bronze_title_basics' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_basics
UNION ALL
SELECT 'bronze_title_crew' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_crew
UNION ALL
SELECT 'bronze_title_episode' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_episode
UNION ALL
SELECT 'bronze_title_principals' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_principals
UNION ALL
SELECT 'bronze_title_ratings' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.bronze_title_ratings
ORDER BY table_name;

### Row Counts for Silver Tables

In [0]:
%sql
-- Silver layer: one result row per table holding its row count, ordered by table name.
SELECT 'silver_name_basics' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_name_basics
UNION ALL
SELECT 'silver_title_akas' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_akas
UNION ALL
SELECT 'silver_title_basics' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_basics
UNION ALL
SELECT 'silver_title_crew' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_crew
UNION ALL
SELECT 'silver_title_episode' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_episode
UNION ALL
SELECT 'silver_title_principals' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_principals
UNION ALL
SELECT 'silver_title_ratings' AS table_name, COUNT(*) AS row_count
  FROM imdb_final_project.silver_title_ratings
ORDER BY table_name;

### Gold Layer

In [0]:
%sql
-- Preview: gold bridge table for title AKAs.
SELECT *
FROM imdb_final_project.gold_bridge_akas

In [0]:
%sql
-- Preview: gold bridge table for professions.
SELECT *
FROM imdb_final_project.gold_bridge_profession

In [0]:
%sql
-- Preview: gold bridge table for title crew.
SELECT *
FROM imdb_final_project.gold_bridge_title_crew

In [0]:
%sql
-- Preview: gold bridge table for title genres.
SELECT *
FROM imdb_final_project.gold_bridge_title_genre

In [0]:
%sql
-- Preview: gold crew dimension.
SELECT *
FROM imdb_final_project.gold_dim_crew

In [0]:
%sql
-- Preview: gold genre dimension.
SELECT *
FROM imdb_final_project.gold_dim_genre

In [0]:
%sql
-- Preview: gold language dimension.
SELECT *
FROM imdb_final_project.gold_dim_language

In [0]:
%sql
-- Preview: gold region dimension.
SELECT *
FROM imdb_final_project.gold_dim_region

In [0]:
%sql
-- Preview: gold name dimension.
SELECT *
FROM imdb_final_project.gold_dim_name

In [0]:
%sql
-- Preview: gold principals dimension.
SELECT *
FROM imdb_final_project.gold_dim_principals

In [0]:
%sql
-- Preview: gold profession dimension.
SELECT *
FROM imdb_final_project.gold_dim_profession

In [0]:
%sql
-- Preview: gold title dimension.
SELECT *
FROM imdb_final_project.gold_dim_title

### Fact Tables

In [0]:
%sql
-- Preview: gold episodes fact table.
SELECT *
FROM imdb_final_project.gold_fact_episodes

In [0]:
%sql
-- Preview: gold title ratings fact table.
SELECT *
FROM imdb_final_project.gold_fact_title_ratings

### Null Analysis

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Layer - Null Analysis for All Tables

# COMMAND ----------

from pyspark.sql.functions import col, count, when, isnan, sum as _sum, lit
from pyspark.sql.types import DoubleType, FloatType

def analyze_nulls(df, table_name):
    """
    Profile missing values in every column of a DataFrame.

    For each column the function counts SQL NULLs, NaN values (float and
    double columns only) and blank markers (string columns only: the empty
    string or the two-character marker backslash + N). All counters are
    evaluated in one aggregation, so the table is scanned once instead of
    once per counter per column.

    Parameters:
    - df: Spark DataFrame to profile
    - table_name: Label written to the table_name column of the result

    Returns:
    - Spark DataFrame with one row per input column and the columns
      table_name, column_name, total_rows, null_count, nan_count,
      empty_string_count, total_nullish, null_percentage (rounded to 2 dp).
      A DataFrame without columns yields an empty result with that schema.
    """
    # Aliases are index-based so they can never collide with each other,
    # whatever the source column names look like.
    counters = [count(lit(1)).alias("total_rows")]
    profile = []  # (index, column name, counts NaN?, counts blank strings?)

    for idx, field in enumerate(df.schema.fields):
        value = col(field.name)
        counts_nan = isinstance(field.dataType, (DoubleType, FloatType))
        counts_blank = field.dataType.typeName() == 'string'

        counters.append(
            _sum(when(value.isNull(), 1).otherwise(0)).alias(f"null_{idx}")
        )
        if counts_nan:
            counters.append(
                _sum(when(isnan(value), 1).otherwise(0)).alias(f"nan_{idx}")
            )
        if counts_blank:
            counters.append(
                _sum(
                    when((value == "") | (value == "\\N"), 1).otherwise(0)
                ).alias(f"empty_{idx}")
            )
        profile.append((idx, field.name, counts_nan, counts_blank))

    stats = df.agg(*counters).collect()[0]
    total_rows = stats["total_rows"]

    rows = []
    for idx, column, counts_nan, counts_blank in profile:
        # SUM over zero input rows is NULL, hence the "or 0" fallbacks.
        null_count = stats[f"null_{idx}"] or 0
        nan_count = (stats[f"nan_{idx}"] or 0) if counts_nan else 0
        empty_count = (stats[f"empty_{idx}"] or 0) if counts_blank else 0

        total_nullish = null_count + nan_count + empty_count
        null_percentage = (total_nullish / total_rows * 100) if total_rows > 0 else 0.0

        rows.append((
            table_name,
            column,
            total_rows,
            null_count,
            nan_count,
            empty_count,
            total_nullish,
            round(float(null_percentage), 2),
        ))

    # An explicit schema pins column order and types, and keeps the call valid
    # when there are no rows to infer a schema from.
    return spark.createDataFrame(
        rows,
        schema=(
            "table_name string, column_name string, total_rows long, "
            "null_count long, nan_count long, empty_string_count long, "
            "total_nullish long, null_percentage double"
        ),
    )

print("Null analysis function loaded")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Dimension Tables Null Analysis

# COMMAND ----------

def _profile_gold_table(table_name):
    """
    Print a banner, load imdb_final_project.<table_name>, run analyze_nulls on
    it and show the result.

    Returns the pair (table DataFrame, null-analysis DataFrame) so that later
    sections can reuse both.
    """
    print("=" * 80)
    print(f"ANALYZING: {table_name}")
    print("=" * 80)
    table_df = spark.read.table(f"imdb_final_project.{table_name}")
    nulls_df = analyze_nulls(table_df, table_name)
    nulls_df.show(truncate=False)
    return table_df, nulls_df

# COMMAND ----------

# 1. DIM_Region
dim_region, region_nulls = _profile_gold_table("gold_DIM_Region")

# COMMAND ----------

# 2. DIM_Language
dim_language, language_nulls = _profile_gold_table("gold_DIM_Language")

# COMMAND ----------

# 3. DIM_NAME
dim_name, name_nulls = _profile_gold_table("gold_DIM_NAME")

# COMMAND ----------

# 4. DIM_Title
dim_title, title_nulls = _profile_gold_table("gold_DIM_Title")

# COMMAND ----------

# 5. DIM_Genre
dim_genre, genre_nulls = _profile_gold_table("gold_DIM_Genre")

# COMMAND ----------

# 6. DIM_Profession
dim_profession, profession_nulls = _profile_gold_table("gold_DIM_Profession")

# COMMAND ----------

# 7. DIM_Crew
dim_crew, crew_nulls = _profile_gold_table("gold_DIM_Crew")

# COMMAND ----------

# 8. DIM_Principals
dim_principals, principals_nulls = _profile_gold_table("gold_DIM_Principals")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fact Tables Null Analysis

# COMMAND ----------

# 9. FACT_Title_Ratings
fact_ratings, ratings_nulls = _profile_gold_table("gold_FACT_Title_Ratings")

# COMMAND ----------

# 10. FACT_Episodes
fact_episodes, episodes_nulls = _profile_gold_table("gold_FACT_Episodes")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Bridge Tables Null Analysis

# COMMAND ----------

# 11. BRIDGE_TITLE_GENRE
bridge_genre, bridge_genre_nulls = _profile_gold_table("gold_BRIDGE_TITLE_GENRE")

# COMMAND ----------

# 12. BRIDGE_PROFESSION
bridge_profession, bridge_profession_nulls = _profile_gold_table("gold_BRIDGE_PROFESSION")

# COMMAND ----------

# 13. Bridge_Title_Crew
bridge_crew, bridge_crew_nulls = _profile_gold_table("gold_Bridge_Title_Crew")

# COMMAND ----------

# 14. BRIDGE_Akas
bridge_akas, bridge_akas_nulls = _profile_gold_table("gold_BRIDGE_Akas")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Consolidated Null Summary - All Tables

# COMMAND ----------

print("=" * 80)
print("CONSOLIDATED NULL ANALYSIS - ALL GOLD TABLES")
print("=" * 80)

# Combine all null analyses. unionByName matches columns by name, so the
# result does not depend on the column order of the individual frames.
per_table_nulls = [
    region_nulls,
    language_nulls,
    name_nulls,
    title_nulls,
    genre_nulls,
    profession_nulls,
    crew_nulls,
    principals_nulls,
    ratings_nulls,
    episodes_nulls,
    bridge_genre_nulls,
    bridge_profession_nulls,
    bridge_crew_nulls,
    bridge_akas_nulls,
]
all_nulls = per_table_nulls[0]
for table_nulls in per_table_nulls[1:]:
    all_nulls = all_nulls.unionByName(table_nulls)

# Keep only columns that have at least one NULL / NaN / blank value,
# worst offenders first.
columns_with_nulls = (
    all_nulls
    .filter(col("total_nullish") > 0)
    .orderBy(col("null_percentage").desc())
)

print("\n📊 COLUMNS WITH NULL VALUES (sorted by null percentage):")
columns_with_nulls.show(100, truncate=False)

# Per-table summary: column count, affected columns, and total nullish values.
print("\n📈 SUMMARY STATISTICS:")
all_nulls.groupBy("table_name").agg(
    count("*").alias("total_columns"),
    _sum(when(col("total_nullish") > 0, 1).otherwise(0)).alias("columns_with_nulls"),
    _sum("total_nullish").alias("total_null_values")
).orderBy("columns_with_nulls", ascending=False).show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Row Count Summary - All Tables

# COMMAND ----------

print("=" * 80)
print("ROW COUNT SUMMARY - ALL GOLD TABLES")
print("=" * 80)

# (label, DataFrame) pairs for every gold table loaded above.
gold_tables = [
    ("gold_DIM_Region", dim_region),
    ("gold_DIM_Language", dim_language),
    ("gold_DIM_NAME", dim_name),
    ("gold_DIM_Title", dim_title),
    ("gold_DIM_Genre", dim_genre),
    ("gold_DIM_Profession", dim_profession),
    ("gold_DIM_Crew", dim_crew),
    ("gold_DIM_Principals", dim_principals),
    ("gold_FACT_Title_Ratings", fact_ratings),
    ("gold_FACT_Episodes", fact_episodes),
    ("gold_BRIDGE_TITLE_GENRE", bridge_genre),
    ("gold_BRIDGE_PROFESSION", bridge_profession),
    ("gold_Bridge_Title_Crew", bridge_crew),
    ("gold_BRIDGE_Akas", bridge_akas),
]

# One count() per table; the validation section below reads this list.
table_counts = [
    {"table_name": name, "row_count": table_df.count()}
    for name, table_df in gold_tables
]

# Explicit schema: stable column order/types without dict-based inference.
counts_df = spark.createDataFrame(
    [(entry["table_name"], entry["row_count"]) for entry in table_counts],
    schema="table_name string, row_count long",
).orderBy(col("row_count").desc())
counts_df.show(truncate=False)

# The counts are already on the driver, so no Spark job is needed here.
empty_table_count = sum(1 for entry in table_counts if entry["row_count"] == 0)

print(f"\n✅ Total tables analyzed: {len(table_counts)}")
print(f"⚠️  Empty tables: {empty_table_count}")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Quick Validation Checks

# COMMAND ----------

print("=" * 80)
print("QUICK VALIDATION CHECKS")
print("=" * 80)

# Check for empty tables
empty_tables = [t['table_name'] for t in table_counts if t['row_count'] == 0]
if empty_tables:
    print(f"\n⚠️  WARNING: Empty tables found: {', '.join(empty_tables)}")
else:
    print("\n✅ All tables have data")

# Check for high null percentages (>50%)
high_null_columns = columns_with_nulls.filter(col("null_percentage") > 50)
high_null_count = high_null_columns.count()

if high_null_count > 0:
    print(f"\n⚠️  WARNING: {high_null_count} columns have >50% null values:")
    # Pass the count explicitly: show() would otherwise stop after 20 rows
    # and hide part of the columns announced in the warning above.
    high_null_columns.select("table_name", "column_name", "null_percentage").show(
        high_null_count, truncate=False
    )
else:
    print("\n✅ No columns with >50% null values")

# Check dimension table integrity.
# The row counts were collected into table_counts already, so they are looked
# up here instead of scanning each dimension table a second time.
row_counts_by_table = {t['table_name']: t['row_count'] for t in table_counts}
dimension_labels = [
    ("Regions", "gold_DIM_Region"),
    ("Languages", "gold_DIM_Language"),
    ("Names", "gold_DIM_NAME"),
    ("Titles", "gold_DIM_Title"),
    ("Genres", "gold_DIM_Genre"),
    ("Professions", "gold_DIM_Profession"),
]

print("\n📊 DIMENSION TABLE INTEGRITY:")
for label, dimension_table in dimension_labels:
    print(f"   {label}: {row_counts_by_table[dimension_table]} records")

print("\n✅ Null analysis complete!")

### Duplicate Analysis

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Layer - Duplicate Analysis for All Tables

# COMMAND ----------

from pyspark.sql.functions import col, count, sum as _sum, desc, lit
from pyspark.sql import Window

def analyze_duplicates(df, table_name, key_columns, description=""):
    """
    Report how many rows of a DataFrame share the same key and print a summary.

    Parameters:
    - df: DataFrame to analyze
    - table_name: Label used in the printed report and in the returned summary
    - key_columns: Column name, or list of column names, that should be unique
      together (business or surrogate key)
    - description: Optional description of what makes a record unique

    Returns:
    - None when the DataFrame is empty; otherwise a dict with the keys
      table_name, total_rows, duplicate_count (key combinations that occur
      more than once), total_duplicate_rows (all rows belonging to such a
      combination, first occurrence included) and duplicate_percentage
      (total_duplicate_rows as a percentage of total_rows, rounded to 2 dp).
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(key_columns, str):
        key_columns = [key_columns]

    total_rows = df.count()

    if total_rows == 0:
        print(f"⚠️  {table_name} is EMPTY - skipping duplicate analysis")
        return None

    # Key combinations that occur more than once
    duplicate_check = (
        df.groupBy(*key_columns)
        .agg(count("*").alias("occurrence_count"))
        .filter(col("occurrence_count") > 1)
    )

    # Number of duplicated keys and number of rows behind them, in one job.
    totals = duplicate_check.agg(
        count("*").alias("duplicate_groups"),
        _sum("occurrence_count").alias("duplicate_rows"),
    ).collect()[0]
    duplicate_count = totals["duplicate_groups"]
    # SUM over an empty set is NULL -> treat as zero duplicate rows.
    total_duplicate_rows = int(totals["duplicate_rows"] or 0)

    # total_rows is known to be > 0 here because of the early return above.
    duplicate_percentage = total_duplicate_rows / total_rows * 100

    print("=" * 100)
    print(f"TABLE: {table_name}")
    print(f"DESCRIPTION: {description}")
    print(f"KEY COLUMNS: {', '.join(key_columns)}")
    print("=" * 100)
    print(f"Total Rows: {total_rows:,}")
    print(f"Unique Key Combinations: {total_rows - total_duplicate_rows + duplicate_count:,}")
    print(f"Duplicate Key Combinations: {duplicate_count:,}")
    print(f"Total Duplicate Rows: {total_duplicate_rows:,}")
    print(f"Duplicate Percentage: {duplicate_percentage:.2f}%")

    if duplicate_count > 0:
        print(f"\n⚠️  WARNING: {duplicate_count} duplicate key combinations found!")
        print("\nTop 10 duplicates:")
        duplicate_check.orderBy(desc("occurrence_count")).show(10, truncate=False)

        # Show sample duplicate records
        print("\nSample duplicate records:")
        for row in duplicate_check.limit(3).collect():
            key_values = {k: row[k] for k in key_columns}
            filter_condition = None
            for k, v in key_values.items():
                # groupBy puts NULL keys into one group, but "col == None"
                # never evaluates to true; the null-safe comparison lets the
                # sample rows of a NULL-key group show up as well.
                condition = col(k).eqNullSafe(v)
                filter_condition = condition if filter_condition is None else filter_condition & condition

            print(f"\n  Duplicate set: {key_values}")
            df.filter(filter_condition).show(5, truncate=False)
    else:
        print("\n✅ No duplicates found - all records are unique!")

    print("\n")

    return {
        "table_name": table_name,
        "total_rows": total_rows,
        "duplicate_count": duplicate_count,
        "total_duplicate_rows": total_duplicate_rows,
        "duplicate_percentage": round(duplicate_percentage, 2)
    }

print("Duplicate analysis function loaded")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Dimension Tables Duplicate Analysis

# COMMAND ----------

def _check_table_keys(table_name, surrogate_key, surrogate_description,
                      business_key, business_description):
    """
    Load imdb_final_project.<table_name> and run analyze_duplicates twice:
    first on the surrogate key, then on the business key (reported under the
    label "<table_name> (Business Key)").

    Returns the triple (table DataFrame, surrogate-key result, business-key
    result); each result is whatever analyze_duplicates returned.
    """
    table_df = spark.read.table(f"imdb_final_project.{table_name}")
    surrogate_result = analyze_duplicates(
        table_df, table_name, surrogate_key, surrogate_description
    )
    business_result = analyze_duplicates(
        table_df, f"{table_name} (Business Key)", business_key, business_description
    )
    return table_df, surrogate_result, business_result

# COMMAND ----------

# 1. DIM_Region
dim_region, region_dups, region_dups_business = _check_table_keys(
    "gold_DIM_Region",
    ["RegionKey"],
    "Each region should have unique RegionKey (surrogate key)",
    ["RegionCode"],
    "Each region should have unique RegionCode (business key)",
)

# COMMAND ----------

# 2. DIM_Language
dim_language, language_dups, language_dups_business = _check_table_keys(
    "gold_DIM_Language",
    ["LanguageKey"],
    "Each language should have unique LanguageKey (surrogate key)",
    ["LanguageCode"],
    "Each language should have unique LanguageCode (business key)",
)

# COMMAND ----------

# 3. DIM_NAME
dim_name, name_dups, name_dups_business = _check_table_keys(
    "gold_DIM_NAME",
    ["NameKey"],
    "Each person should have unique NameKey (surrogate key)",
    ["NCONST"],
    "Each person should have unique NCONST (business key)",
)

# COMMAND ----------

# 4. DIM_Title
dim_title, title_dups, title_dups_business = _check_table_keys(
    "gold_DIM_Title",
    ["TitleKey"],
    "Each title should have unique TitleKey (surrogate key)",
    ["Tconst"],
    "Each title should have unique Tconst (business key)",
)

# COMMAND ----------

# 5. DIM_Genre
dim_genre, genre_dups, genre_dups_business = _check_table_keys(
    "gold_DIM_Genre",
    ["GenreKey"],
    "Each genre should have unique GenreKey (surrogate key)",
    ["GenreName"],
    "Each genre should have unique GenreName (business key)",
)

# COMMAND ----------

# 6. DIM_Profession
dim_profession, profession_dups, profession_dups_business = _check_table_keys(
    "gold_DIM_Profession",
    ["ProfessionKey"],
    "Each profession should have unique ProfessionKey (surrogate key)",
    ["Profession"],
    "Each profession should have unique Profession (business key)",
)

# COMMAND ----------

# 7. DIM_Crew
dim_crew, crew_dups, crew_dups_business = _check_table_keys(
    "gold_DIM_Crew",
    ["CrewKey"],
    "Each crew role should have unique CrewKey (surrogate key)",
    ["Crew_Role"],
    "Each crew role should have unique Crew_Role (business key)",
)

# COMMAND ----------

# 8. DIM_Principals
dim_principals, principals_dups, principals_dups_business = _check_table_keys(
    "gold_DIM_Principals",
    ["PrincipalKey"],
    "Each principal record should have unique PrincipalKey (surrogate key)",
    ["TitleKey", "NameKey", "ordering"],
    "Each combination of TitleKey, NameKey, and ordering should be unique",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fact Tables Duplicate Analysis

# COMMAND ----------

# 9. FACT_Title_Ratings
fact_ratings, ratings_dups, ratings_dups_business = _check_table_keys(
    "gold_FACT_Title_Ratings",
    ["RatingKey"],
    "Each rating record should have unique RatingKey (surrogate key)",
    ["TitleKey"],
    "Each title should have only one rating record",
)

# COMMAND ----------

# 10. FACT_Episodes
fact_episodes, episodes_dups, episodes_dups_business = _check_table_keys(
    "gold_FACT_Episodes",
    ["EpisodeKey"],
    "Each episode record should have unique EpisodeKey (surrogate key)",
    ["TitleKey"],
    "Each episode title should appear only once",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Bridge Tables Duplicate Analysis

# COMMAND ----------

# 11. BRIDGE_TITLE_GENRE
bridge_genre, bridge_genre_dups, bridge_genre_dups_business = _check_table_keys(
    "gold_BRIDGE_TITLE_GENRE",
    ["TitleGenreKey"],
    "Each bridge record should have unique TitleGenreKey (surrogate key)",
    ["TitleKey", "GenreKey"],
    "Each Title-Genre combination should be unique",
)

# COMMAND ----------

# 12. BRIDGE_PROFESSION
bridge_profession, bridge_profession_dups, bridge_profession_dups_business = _check_table_keys(
    "gold_BRIDGE_PROFESSION",
    ["titleProfessionKey"],
    "Each bridge record should have unique titleProfessionKey (surrogate key)",
    ["NameKey", "ProfessionKey"],
    "Each Name-Profession combination should be unique",
)

# COMMAND ----------

# 13. Bridge_Title_Crew
bridge_crew, bridge_crew_dups, bridge_crew_dups_business = _check_table_keys(
    "gold_Bridge_Title_Crew",
    ["titleCrewKey"],
    "Each bridge record should have unique titleCrewKey (surrogate key)",
    ["TitleKey", "NameKey", "CrewKey"],
    "Each Title-Name-Crew combination should be unique",
)

# COMMAND ----------

# 14. BRIDGE_Akas
bridge_akas, bridge_akas_dups, bridge_akas_dups_business = _check_table_keys(
    "gold_BRIDGE_Akas",
    ["TitleAkasKey"],
    "Each akas record should have unique TitleAkasKey (surrogate key)",
    ["TitleKey", "RegionKey", "LanguageKey", "AkasTitle"],
    "Each Title-Region-Language-AkasTitle combination should be unique",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Consolidated Duplicate Summary

# COMMAND ----------

print("=" * 100)
print("CONSOLIDATED DUPLICATE ANALYSIS - ALL GOLD TABLES")
print("=" * 100)

# Collect all results
all_duplicate_results = [
    region_dups, region_dups_business,
    language_dups, language_dups_business,
    name_dups, name_dups_business,
    title_dups, title_dups_business,
    genre_dups, genre_dups_business,
    profession_dups, profession_dups_business,
    crew_dups, crew_dups_business,
    principals_dups, principals_dups_business,
    ratings_dups, ratings_dups_business,
    episodes_dups, episodes_dups_business,
    bridge_genre_dups, bridge_genre_dups_business,
    bridge_profession_dups, bridge_profession_dups_business,
    bridge_crew_dups, bridge_crew_dups_business,
    bridge_akas_dups, bridge_akas_dups_business
]

# Filter out None values (from empty tables)
all_duplicate_results = [r for r in all_duplicate_results if r is not None]

# Create summary DataFrame. The explicit schema keeps this valid even when
# every table was empty and the list above has no entries left.
duplicate_summary_df = spark.createDataFrame(
    [
        (
            r["table_name"],
            r["total_rows"],
            r["duplicate_count"],
            r["total_duplicate_rows"],
            float(r["duplicate_percentage"]),
        )
        for r in all_duplicate_results
    ],
    schema=(
        "table_name string, total_rows long, duplicate_count long, "
        "total_duplicate_rows long, duplicate_percentage double"
    ),
)

# Checks that found at least one duplicated key
tables_with_duplicates = duplicate_summary_df.filter(col("duplicate_count") > 0)

# The per-check results are plain dicts on the driver, so the statistics are
# computed locally instead of through additional Spark jobs.
total_tables = len(all_duplicate_results)
tables_with_dups = sum(1 for r in all_duplicate_results if r["duplicate_count"] > 0)
total_rows = sum(r["total_rows"] for r in all_duplicate_results)
total_dup_rows = sum(r["total_duplicate_rows"] for r in all_duplicate_results)
overall_duplicate_rate = (total_dup_rows / total_rows * 100) if total_rows > 0 else 0.0

if tables_with_dups > 0:
    print("\n⚠️  TABLES WITH DUPLICATES:")
    tables_with_duplicates.orderBy(desc("duplicate_percentage")).show(50, truncate=False)
else:
    print("\n✅ NO DUPLICATES FOUND IN ANY TABLE!")

# Show overall summary
print("\n📊 OVERALL SUMMARY:")
duplicate_summary_df.orderBy("table_name").show(50, truncate=False)

# Statistics. Every table contributes two entries (surrogate-key check and
# business-key check), so the figures below are per check, not per table.
print(f"\n📈 STATISTICS:")
print(f"   Total key checks performed: {total_tables}")
print(f"   Checks with duplicates: {tables_with_dups}")
print(f"   Checks without duplicates: {total_tables - tables_with_dups}")
print(f"   Total rows scanned across all checks: {total_rows:,}")
print(f"   Total duplicate rows: {total_dup_rows:,}")
print(f"   Overall duplicate rate: {overall_duplicate_rate:.2f}%")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Referential Integrity Checks

# COMMAND ----------

print("=" * 100)
print("REFERENTIAL INTEGRITY CHECKS")
print("=" * 100)

# Check if all foreign keys in bridge tables reference existing dimension keys

def _orphaned_rows(child_df, parent_df, key_column):
    """
    Return the rows of child_df whose key_column value has no equal value in
    parent_df's column of the same name (left anti join). Rows with a NULL
    key never match and are therefore returned as well.
    """
    return child_df.join(
        parent_df, child_df[key_column] == parent_df[key_column], "left_anti"
    )

# 1. BRIDGE_TITLE_GENRE referential integrity
print("\n1. Checking BRIDGE_TITLE_GENRE → DIM_Title and DIM_Genre")
invalid_title_refs = _orphaned_rows(bridge_genre, dim_title, "TitleKey")
invalid_genre_refs = _orphaned_rows(bridge_genre, dim_genre, "GenreKey")
print(f"   Invalid TitleKey references: {invalid_title_refs.count()}")
print(f"   Invalid GenreKey references: {invalid_genre_refs.count()}")

# 2. BRIDGE_PROFESSION referential integrity
print("\n2. Checking BRIDGE_PROFESSION → DIM_NAME and DIM_Profession")
invalid_name_refs = _orphaned_rows(bridge_profession, dim_name, "NameKey")
invalid_prof_refs = _orphaned_rows(bridge_profession, dim_profession, "ProfessionKey")
print(f"   Invalid NameKey references: {invalid_name_refs.count()}")
print(f"   Invalid ProfessionKey references: {invalid_prof_refs.count()}")

# 3. Bridge_Title_Crew referential integrity
print("\n3. Checking Bridge_Title_Crew → DIM_Title, DIM_NAME, and DIM_Crew")
invalid_crew_title_refs = _orphaned_rows(bridge_crew, dim_title, "TitleKey")
invalid_crew_name_refs = _orphaned_rows(bridge_crew, dim_name, "NameKey")
invalid_crew_refs = _orphaned_rows(bridge_crew, dim_crew, "CrewKey")
print(f"   Invalid TitleKey references: {invalid_crew_title_refs.count()}")
print(f"   Invalid NameKey references: {invalid_crew_name_refs.count()}")
print(f"   Invalid CrewKey references: {invalid_crew_refs.count()}")

# 4. BRIDGE_Akas referential integrity
print("\n4. Checking BRIDGE_Akas → DIM_Title, DIM_Region, and DIM_Language")
# Note: -9999 is expected for missing regions/languages.
# The exclusion is written null-safe: a plain "!= -9999" filter also drops
# rows whose key is NULL, hiding them from this check although every other
# check in this section reports NULL foreign keys as invalid.
akas_with_region = bridge_akas.filter(~col("RegionKey").eqNullSafe(-9999))
akas_with_language = bridge_akas.filter(~col("LanguageKey").eqNullSafe(-9999))
invalid_akas_title_refs = _orphaned_rows(bridge_akas, dim_title, "TitleKey")
invalid_region_refs = _orphaned_rows(akas_with_region, dim_region, "RegionKey")
invalid_language_refs = _orphaned_rows(akas_with_language, dim_language, "LanguageKey")
print(f"   Invalid TitleKey references: {invalid_akas_title_refs.count()}")
print(f"   Invalid RegionKey references (excluding -9999): {invalid_region_refs.count()}")
print(f"   Invalid LanguageKey references (excluding -9999): {invalid_language_refs.count()}")

# 5. FACT_Title_Ratings referential integrity
print("\n5. Checking FACT_Title_Ratings → DIM_Title")
invalid_ratings_refs = _orphaned_rows(fact_ratings, dim_title, "TitleKey")
print(f"   Invalid TitleKey references: {invalid_ratings_refs.count()}")

# 6. FACT_Episodes referential integrity
print("\n6. Checking FACT_Episodes → DIM_Title")
invalid_episodes_refs = _orphaned_rows(fact_episodes, dim_title, "TitleKey")
print(f"   Invalid TitleKey references: {invalid_episodes_refs.count()}")

# 7. DIM_Principals referential integrity
print("\n7. Checking DIM_Principals → DIM_Title and DIM_NAME")
invalid_principals_title_refs = _orphaned_rows(dim_principals, dim_title, "TitleKey")
invalid_principals_name_refs = _orphaned_rows(dim_principals, dim_name, "NameKey")
print(f"   Invalid TitleKey references: {invalid_principals_title_refs.count()}")
print(f"   Invalid NameKey references: {invalid_principals_name_refs.count()}")

print("\n✅ Referential integrity checks complete!")

# COMMAND ----------

print("=" * 100)
print("DUPLICATE ANALYSIS COMPLETE!")
print("=" * 100)

### Entire-Row Duplicates (Excluding Primary Keys)

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Layer - Duplicate Analysis (Excluding Primary Keys)

# COMMAND ----------

from pyspark.sql.functions import (
    col, count, sum as _sum, desc, lit, md5, concat_ws, 
    coalesce, row_number  # ‚Üê ADD THESE
)
from pyspark.sql import Window

def analyze_duplicates_excluding_pk(df, table_name, primary_key_column, description=""):
    """
    Report rows that are identical in every column EXCEPT the primary key.

    Each row is reduced to an MD5 hash of its non-key columns (cast to string,
    nulls replaced by the text "NULL", joined with "||"); rows that share a hash
    are treated as duplicates of one another. A summary, the ten largest
    duplicate sets and sample rows from the three largest sets are printed.

    Parameters:
    - df: DataFrame to analyze
    - table_name: Name of the table (used in the printed report and the result)
    - primary_key_column: Primary key column to EXCLUDE from duplicate check.
      Matched exactly first, then case-insensitively.
    - description: Optional description of the table

    Returns:
    - None when the table is empty or has no column besides the primary key
    - otherwise a dict with the keys table_name, primary_key, total_rows,
      unique_rows, duplicate_sets, duplicate_rows, duplicate_percentage

    Limitations:
    - A null and the literal string "NULL", or values that contain "||", can
      yield the same hash input and therefore be reported as duplicates.
    """
    total_rows = df.count()

    if total_rows == 0:
        print(f"‚ö†Ô∏è  {table_name} is EMPTY - skipping duplicate analysis")
        return None

    # Locate the key column. An exact match wins; otherwise fall back to a
    # case-insensitive match, because Spark resolves column names without regard
    # to case by default and a key passed as e.g. "titleCrewKey" would otherwise
    # stay in the hash and hide every duplicate.
    pk_matches = [c for c in df.columns if c == primary_key_column]
    if not pk_matches:
        pk_lower = primary_key_column.lower()
        pk_matches = [c for c in df.columns if c.lower() == pk_lower]

    if not pk_matches:
        # Keep going (the original behaviour) but make the problem visible.
        print(
            f"WARNING: {table_name} has no column named '{primary_key_column}' - "
            f"ALL columns are compared, so rows that differ only by key will not be detected"
        )

    # Get all columns except primary key
    all_columns = [c for c in df.columns if c not in pk_matches]

    if len(all_columns) == 0:
        print(f"‚ö†Ô∏è  {table_name} has only primary key column - skipping")
        return None

    # Create a hash of all columns EXCEPT primary key
    hash_inputs = [coalesce(col(c).cast("string"), lit("NULL")) for c in all_columns]
    df_with_hash = df.withColumn("row_hash", md5(concat_ws("||", *hash_inputs)))

    # One row per hash value that occurs more than once (= one duplicate set)
    duplicate_check = (
        df_with_hash
        .groupBy("row_hash")
        .agg(count("*").alias("occurrence_count"))
        .filter(col("occurrence_count") > 1)
    )

    # Number of sets and number of rows inside them, fetched in a single Spark job
    dup_stats = duplicate_check.agg(
        count("*").alias("set_count"),
        _sum("occurrence_count").alias("row_count"),
    ).collect()[0]

    duplicate_count = int(dup_stats["set_count"])
    # The sum is null when there are no duplicate sets. One row per set is the
    # "original", so only the remainder counts as extra duplicate rows.
    total_duplicate_rows = int(dup_stats["row_count"] or 0) - duplicate_count

    # total_rows is known to be > 0 here because of the early return above
    duplicate_percentage = total_duplicate_rows / total_rows * 100

    print("=" * 100)
    print(f"TABLE: {table_name}")
    print(f"DESCRIPTION: {description}")
    print(f"PRIMARY KEY (EXCLUDED): {primary_key_column}")
    print(f"CHECKING: Row duplicates excluding primary key")
    print("=" * 100)
    print(f"Total Rows: {total_rows:,}")
    print(f"Unique Rows (excluding PK): {total_rows - total_duplicate_rows:,}")
    print(f"Duplicate Sets: {duplicate_count:,}")
    print(f"Extra Duplicate Rows: {total_duplicate_rows:,}")
    print(f"Duplicate Percentage: {duplicate_percentage:.2f}%")

    if duplicate_count > 0:
        print(f"\n‚ö†Ô∏è  WARNING: {duplicate_count} sets of duplicates found!")
        print(f"    (Total {total_duplicate_rows} extra rows that are duplicates)")
        print(f"    These rows have identical values in all columns EXCEPT {primary_key_column}")

        # Show top duplicates by occurrence
        print("\nTop 10 duplicate sets by occurrence count:")
        duplicate_check.orderBy(desc("occurrence_count")).show(10, truncate=False)

        # Show sample duplicate records with their primary keys
        print(f"\nSample duplicate records (showing first 3 duplicate sets):")
        print(f"Note: {primary_key_column} values are DIFFERENT, but all other columns are IDENTICAL")
        top_duplicate_hashes = (
            duplicate_check
            .orderBy(desc("occurrence_count"))
            .limit(3)
            .select("row_hash")
            .collect()
        )

        for idx, row in enumerate(top_duplicate_hashes, 1):
            hash_value = row["row_hash"]
            print(f"\n  Duplicate Set #{idx}:")
            duplicate_rows = df_with_hash.filter(col("row_hash") == hash_value).drop("row_hash")
            duplicate_rows.show(10, truncate=False)
    else:
        print(f"\n‚úÖ No duplicates found - all records are unique (excluding {primary_key_column})!")

    print("\n")

    return {
        "table_name": table_name,
        "primary_key": primary_key_column,
        "total_rows": total_rows,
        "unique_rows": total_rows - total_duplicate_rows,
        "duplicate_sets": duplicate_count,
        "duplicate_rows": total_duplicate_rows,
        "duplicate_percentage": round(duplicate_percentage, 2)
    }

print("Duplicate analysis function (excluding PK) loaded")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Dimension Tables Duplicate Analysis

# COMMAND ----------

# 1. DIM_Region
# Each cell below loads one gold dimension table and runs the duplicate analysis on it.
dim_region = spark.read.table("imdb_final_project.gold_DIM_Region")
region_dups = analyze_duplicates_excluding_pk(
    df=dim_region,
    table_name="gold_DIM_Region",
    primary_key_column="RegionKey",
    description="Region dimension with codes and descriptions",
)

# COMMAND ----------

# 2. DIM_Language
dim_language = spark.read.table("imdb_final_project.gold_DIM_Language")
language_dups = analyze_duplicates_excluding_pk(
    df=dim_language,
    table_name="gold_DIM_Language",
    primary_key_column="LanguageKey",
    description="Language dimension with codes and descriptions",
)

# COMMAND ----------

# 3. DIM_NAME
dim_name = spark.read.table("imdb_final_project.gold_DIM_NAME")
name_dups = analyze_duplicates_excluding_pk(
    df=dim_name,
    table_name="gold_DIM_NAME",
    primary_key_column="NameKey",
    description="Person dimension - actors, directors, writers",
)

# COMMAND ----------

# 4. DIM_Title
dim_title = spark.read.table("imdb_final_project.gold_DIM_Title")
title_dups = analyze_duplicates_excluding_pk(
    df=dim_title,
    table_name="gold_DIM_Title",
    primary_key_column="TitleKey",
    description="Title dimension - movies, TV shows, episodes",
)

# COMMAND ----------

# 5. DIM_Genre
dim_genre = spark.read.table("imdb_final_project.gold_DIM_Genre")
genre_dups = analyze_duplicates_excluding_pk(
    df=dim_genre,
    table_name="gold_DIM_Genre",
    primary_key_column="GenreKey",
    description="Genre dimension",
)

# COMMAND ----------

# 6. DIM_Profession
dim_profession = spark.read.table("imdb_final_project.gold_DIM_Profession")
profession_dups = analyze_duplicates_excluding_pk(
    df=dim_profession,
    table_name="gold_DIM_Profession",
    primary_key_column="ProfessionKey",
    description="Profession dimension",
)

# COMMAND ----------

# 7. DIM_Crew
dim_crew = spark.read.table("imdb_final_project.gold_DIM_Crew")
crew_dups = analyze_duplicates_excluding_pk(
    df=dim_crew,
    table_name="gold_DIM_Crew",
    primary_key_column="CrewKey",
    description="Crew role dimension (director, writer)",
)

# COMMAND ----------

# 8. DIM_Principals
dim_principals = spark.read.table("imdb_final_project.gold_DIM_Principals")
principals_dups = analyze_duplicates_excluding_pk(
    df=dim_principals,
    table_name="gold_DIM_Principals",
    primary_key_column="PrincipalKey",
    description="Principal cast and crew assignments",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fact Tables Duplicate Analysis

# COMMAND ----------

# 9. FACT_Title_Ratings
# Load the ratings fact table and look for rows that differ only in RatingKey.
fact_ratings = spark.read.table("imdb_final_project.gold_FACT_Title_Ratings")
ratings_dups = analyze_duplicates_excluding_pk(
    df=fact_ratings,
    table_name="gold_FACT_Title_Ratings",
    primary_key_column="RatingKey",
    description="Title ratings and vote counts",
)

# COMMAND ----------

# 10. FACT_Episodes
# Same analysis for the episodes fact table, ignoring EpisodeKey.
fact_episodes = spark.read.table("imdb_final_project.gold_FACT_Episodes")
episodes_dups = analyze_duplicates_excluding_pk(
    df=fact_episodes,
    table_name="gold_FACT_Episodes",
    primary_key_column="EpisodeKey",
    description="TV episode information",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Bridge Tables Duplicate Analysis

# COMMAND ----------

# 11. BRIDGE_TITLE_GENRE
bridge_genre = spark.read.table("imdb_final_project.gold_BRIDGE_TITLE_GENRE")
bridge_genre_dups = analyze_duplicates_excluding_pk(
    df=bridge_genre,
    table_name="gold_BRIDGE_TITLE_GENRE",
    primary_key_column="TitleGenreKey",
    description="Title to Genre many-to-many relationship",
)

# COMMAND ----------

# 12. BRIDGE_PROFESSION
# NOTE(review): this key name starts with a lower-case letter, unlike most of the others -
# confirm it matches the column name in the table.
bridge_profession = spark.read.table("imdb_final_project.gold_BRIDGE_PROFESSION")
bridge_profession_dups = analyze_duplicates_excluding_pk(
    df=bridge_profession,
    table_name="gold_BRIDGE_PROFESSION",
    primary_key_column="titleProfessionKey",
    description="Person to Profession many-to-many relationship",
)

# COMMAND ----------

# 13. Bridge_Title_Crew
# NOTE(review): lower-case initial in the key name here as well - confirm against the table.
bridge_crew = spark.read.table("imdb_final_project.gold_Bridge_Title_Crew")
bridge_crew_dups = analyze_duplicates_excluding_pk(
    df=bridge_crew,
    table_name="gold_Bridge_Title_Crew",
    primary_key_column="titleCrewKey",
    description="Title to Crew member many-to-many relationship",
)

# COMMAND ----------

# 14. BRIDGE_Akas
bridge_akas = spark.read.table("imdb_final_project.gold_BRIDGE_Akas")
bridge_akas_dups = analyze_duplicates_excluding_pk(
    df=bridge_akas,
    table_name="gold_BRIDGE_Akas",
    primary_key_column="TitleAkasKey",
    description="Title alternate names with regional variations",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Consolidated Duplicate Summary

# COMMAND ----------

print("=" * 100)
print("CONSOLIDATED DUPLICATE ANALYSIS - ALL GOLD TABLES")
print("(Excluding Primary Keys from Comparison)")
print("=" * 100)

# Collect all results; the analysis returns None for tables it skipped
# (empty, or nothing but a key column), so those are dropped here.
all_duplicate_results = [
    r for r in [
        region_dups,
        language_dups,
        name_dups,
        title_dups,
        genre_dups,
        profession_dups,
        crew_dups,
        principals_dups,
        ratings_dups,
        episodes_dups,
        bridge_genre_dups,
        bridge_profession_dups,
        bridge_crew_dups,
        bridge_akas_dups,
    ]
    if r is not None
]

if not all_duplicate_results:
    # spark.createDataFrame cannot infer a schema from an empty list, and the
    # overall rate below would divide by zero, so stop with a clear message.
    print("\nNo tables were analyzed (every table was empty or skipped) - nothing to summarize.")
else:
    # Create summary DataFrame (used for the tabular display only)
    duplicate_summary_df = spark.createDataFrame(all_duplicate_results)
    tables_with_duplicates = duplicate_summary_df.filter(col("duplicate_rows") > 0)

    # The summary holds one small record per table and is already in driver
    # memory, so the totals are computed in Python rather than with Spark jobs.
    total_tables = len(all_duplicate_results)
    tables_with_dups = sum(1 for r in all_duplicate_results if r["duplicate_rows"] > 0)
    total_rows = sum(r["total_rows"] for r in all_duplicate_results)
    total_dup_rows = sum(r["duplicate_rows"] for r in all_duplicate_results)
    total_unique_rows = sum(r["unique_rows"] for r in all_duplicate_results)

    # Show tables with duplicates
    if tables_with_dups > 0:
        print("\n‚ö†Ô∏è  TABLES WITH DUPLICATES (Excluding Primary Keys):")
        tables_with_duplicates.orderBy(desc("duplicate_percentage")).show(50, truncate=False)

        print("\nüí° INTERPRETATION:")
        print("   These tables have rows where ALL columns are identical EXCEPT the primary key.")
        print("   This indicates potential data quality issues that should be investigated.")
    else:
        print("\n‚úÖ NO DUPLICATES FOUND IN ANY TABLE!")

    # Show overall summary
    print("\nüìä OVERALL SUMMARY:")
    duplicate_summary_df.select(
        "table_name",
        "primary_key",
        "total_rows",
        "unique_rows",
        "duplicate_sets",
        "duplicate_rows",
        "duplicate_percentage"
    ).orderBy("table_name").show(50, truncate=False)

    # Statistics
    # Every surviving result has total_rows > 0 (empty tables return None above),
    # so the division is safe inside this branch.
    print(f"\nüìà STATISTICS:")
    print(f"   Total tables analyzed: {total_tables}")
    print(f"   Tables with duplicates: {tables_with_dups}")
    print(f"   Tables without duplicates: {total_tables - tables_with_dups}")
    print(f"   Total rows across all tables: {total_rows:,}")
    print(f"   Total unique rows: {total_unique_rows:,}")
    print(f"   Total duplicate rows: {total_dup_rows:,}")
    print(f"   Overall duplicate rate: {(total_dup_rows / total_rows * 100):.2f}%")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Deduplication Function (If Needed)

# COMMAND ----------

print("=" * 100)
print("DUPLICATE ANALYSIS COMPLETE!")
print("=" * 100)

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Pre-Deduplication Safety Check - Referential Integrity Analysis

# COMMAND ----------

from pyspark.sql.functions import col, count, countDistinct

print("=" * 100)
print("REFERENTIAL INTEGRITY SAFETY CHECK")
print("=" * 100)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 1. Check Impact of Deduplicating gold_BRIDGE_Akas

# COMMAND ----------

print("\n1. ANALYZING: gold_BRIDGE_Akas")
print("-" * 80)

bridge_akas = spark.read.table("imdb_final_project.gold_BRIDGE_Akas")

# Count once and reuse: every .count() is a separate Spark job over the table.
bridge_akas_row_count = bridge_akas.count()

# NOTE(review): hard-coded figure, presumably copied from an earlier run of the
# duplicate analysis - it goes stale whenever the table changes; confirm or derive it.
BRIDGE_AKAS_DUPLICATES_TO_REMOVE = 10260

print(f"Current total rows: {bridge_akas_row_count:,}")
print(f"Duplicates to remove: ~{BRIDGE_AKAS_DUPLICATES_TO_REMOVE:,}")
print(f"Expected after dedup: ~{bridge_akas_row_count - BRIDGE_AKAS_DUPLICATES_TO_REMOVE:,}")

# No lookup for tables referencing TitleAkasKey is performed here; the
# conclusion below is printed unconditionally.
print("\n‚ö†Ô∏è  BRIDGE TABLES typically don't have other tables referencing them")
print("   They only reference dimension tables via their FKs")
print("   ‚úÖ SAFE TO DEDUPLICATE")

# COMMAND ----------

# MAGIC %md
# MAGIC ### 2. Check Impact of Deduplicating gold_BRIDGE_PROFESSION

# COMMAND ----------

print("\n2. ANALYZING: gold_BRIDGE_PROFESSION")
print("-" * 80)

bridge_profession = spark.read.table("imdb_final_project.gold_BRIDGE_PROFESSION")

# Count once and reuse: every .count() is a separate Spark job over the table.
bridge_profession_row_count = bridge_profession.count()

# NOTE(review): hard-coded figure, presumably copied from an earlier run of the
# duplicate analysis - it goes stale whenever the table changes; confirm or derive it.
BRIDGE_PROFESSION_DUPLICATES_TO_REMOVE = 20

print(f"Current total rows: {bridge_profession_row_count:,}")
print(f"Duplicates to remove: ~{BRIDGE_PROFESSION_DUPLICATES_TO_REMOVE:,}")
print(f"Expected after dedup: ~{bridge_profession_row_count - BRIDGE_PROFESSION_DUPLICATES_TO_REMOVE:,}")

# No reference lookup is performed here; the conclusion is printed unconditionally.
print("\n‚ö†Ô∏è  BRIDGE TABLES typically don't have other tables referencing them")
print("   ‚úÖ SAFE TO DEDUPLICATE")

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3. Check Impact of Deduplicating gold_DIM_NAME (CRITICAL!)

# COMMAND ----------

from pyspark.sql.functions import md5, concat_ws, coalesce, lit

print("\n3. ANALYZING: gold_DIM_NAME - ‚ö†Ô∏è  DIMENSION TABLE (HIGH RISK)")
print("-" * 80)

# The person dimension plus the three tables whose NameKey column is checked below.
dim_name = spark.read.table("imdb_final_project.gold_DIM_NAME")
dim_principals = spark.read.table("imdb_final_project.gold_DIM_Principals")
bridge_profession = spark.read.table("imdb_final_project.gold_BRIDGE_PROFESSION")
bridge_crew = spark.read.table("imdb_final_project.gold_Bridge_Title_Crew")

print(f"Current DIM_NAME rows: {dim_name.count():,}")
print(f"Duplicates to remove: ~4")

# Columns that define "the same person row"; ModifiedDate joins in only when the table has it.
dedup_columns = ["NCONST", "PrimaryName", "BirthYear", "DeathYear"]
if "ModifiedDate" in dim_name.columns:
    dedup_columns.append("ModifiedDate")

# Hash the comparison columns (nulls become the text "NULL") so rows can be grouped on one value.
name_hash_inputs = [coalesce(col(c).cast("string"), lit("NULL")) for c in dedup_columns]
dim_name_with_hash = dim_name.withColumn("row_hash", md5(concat_ws("||", *name_hash_inputs)))

# Hash values that occur more than once, joined back to fetch the rows behind them.
# This returns EVERY row of each duplicate set, not only the surplus ones.
repeated_name_hashes = (
    dim_name_with_hash
    .groupBy("row_hash")
    .agg(count("*").alias("dup_count"))
    .filter(col("dup_count") > 1)
)
duplicate_name_keys = (
    repeated_name_hashes
    .join(dim_name_with_hash, "row_hash")
    .select("NameKey", "NCONST", "PrimaryName")
)

print("\nüîç Duplicate NameKeys that will be removed:")
duplicate_name_keys.show(20, truncate=False)

# Pull the distinct keys to the driver so they can be used in isin() filters.
duplicate_keys_list = [
    key_row["NameKey"]
    for key_row in duplicate_name_keys.select("NameKey").distinct().collect()
]

print(f"\nüìä Checking references to duplicate NameKeys:")
print(f"   - Total duplicate NameKeys: {len(duplicate_keys_list)}")

if not duplicate_keys_list:
    print("‚úÖ SAFE TO DEDUPLICATE")
else:
    references_duplicate_name = col("NameKey").isin(duplicate_keys_list)

    # Check DIM_Principals
    principals_refs = dim_principals.filter(references_duplicate_name).count()
    print(f"   - Referenced in DIM_Principals: {principals_refs}")

    # Check BRIDGE_PROFESSION
    profession_refs = bridge_profession.filter(references_duplicate_name).count()
    print(f"   - Referenced in BRIDGE_PROFESSION: {profession_refs}")

    # Check Bridge_Title_Crew
    crew_refs = bridge_crew.filter(references_duplicate_name).count()
    print(f"   - Referenced in Bridge_Title_Crew: {crew_refs}")

    total_refs = sum((principals_refs, profession_refs, crew_refs))

    if total_refs == 0:
        print("\n‚úÖ No other tables reference these duplicate NameKeys")
        print("   ‚úÖ SAFE TO DEDUPLICATE")
    else:
        print(f"\n‚ö†Ô∏è  WARNING: {total_refs} rows in other tables reference duplicate NameKeys!")
        print("   ‚ùå NOT SAFE TO DEDUPLICATE without updating references!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4. Check Impact of Deduplicating gold_DIM_Region (CRITICAL!)

# COMMAND ----------

print("\n4. ANALYZING: gold_DIM_Region - ‚ö†Ô∏è  DIMENSION TABLE (HIGH RISK)")
print("-" * 80)

# The region dimension and the one table whose RegionKey column is checked below.
dim_region = spark.read.table("imdb_final_project.gold_DIM_Region")
bridge_akas = spark.read.table("imdb_final_project.gold_BRIDGE_Akas")

print(f"Current DIM_Region rows: {dim_region.count():,}")
print(f"Duplicates to remove: ~1")

# Columns that define "the same region row".
dedup_columns = ["RegionCode", "RegionDescription"]

# Hash the comparison columns (nulls become the text "NULL") so rows can be grouped on one value.
region_hash_inputs = [coalesce(col(c).cast("string"), lit("NULL")) for c in dedup_columns]
dim_region_with_hash = dim_region.withColumn("row_hash", md5(concat_ws("||", *region_hash_inputs)))

# Hash values that occur more than once, joined back to fetch the rows behind them.
# This returns EVERY row of each duplicate set, not only the surplus ones.
repeated_region_hashes = (
    dim_region_with_hash
    .groupBy("row_hash")
    .agg(count("*").alias("dup_count"))
    .filter(col("dup_count") > 1)
)
duplicate_region_keys = (
    repeated_region_hashes
    .join(dim_region_with_hash, "row_hash")
    .select("RegionKey", "RegionCode", "RegionDescription")
)

print("\nüîç Duplicate RegionKeys that will be removed:")
duplicate_region_keys.show(20, truncate=False)

# Pull the distinct keys to the driver so they can be used in an isin() filter.
duplicate_region_keys_list = [
    key_row["RegionKey"]
    for key_row in duplicate_region_keys.select("RegionKey").distinct().collect()
]

print(f"\nüìä Checking references to duplicate RegionKeys:")
print(f"   - Total duplicate RegionKeys: {len(duplicate_region_keys_list)}")

if not duplicate_region_keys_list:
    print("‚úÖ SAFE TO DEDUPLICATE")
else:
    # Check BRIDGE_Akas (excluding -9999)
    references_duplicate_region = col("RegionKey").isin(duplicate_region_keys_list)
    is_not_placeholder_region = col("RegionKey") != -9999
    akas_refs = bridge_akas.filter(references_duplicate_region & is_not_placeholder_region).count()
    print(f"   - Referenced in BRIDGE_Akas: {akas_refs}")

    if akas_refs == 0:
        print("\n‚úÖ No other tables reference these duplicate RegionKeys")
        print("   ‚úÖ SAFE TO DEDUPLICATE")
    else:
        print(f"\n‚ö†Ô∏è  WARNING: {akas_refs} rows in BRIDGE_Akas reference duplicate RegionKeys!")
        print("   ‚ùå NOT SAFE TO DEDUPLICATE without updating references!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5. Check Impact of Deduplicating gold_DIM_Title (CRITICAL!)

# COMMAND ----------

print("\n5. ANALYZING: gold_DIM_Title - ‚ö†Ô∏è  DIMENSION TABLE (HIGHEST RISK)")
print("-" * 80)

# The title dimension and the six tables whose TitleKey column is checked below.
dim_title = spark.read.table("imdb_final_project.gold_DIM_Title")
dim_principals = spark.read.table("imdb_final_project.gold_DIM_Principals")
fact_ratings = spark.read.table("imdb_final_project.gold_FACT_Title_Ratings")
fact_episodes = spark.read.table("imdb_final_project.gold_FACT_Episodes")
bridge_genre = spark.read.table("imdb_final_project.gold_BRIDGE_TITLE_GENRE")
bridge_crew = spark.read.table("imdb_final_project.gold_Bridge_Title_Crew")
bridge_akas = spark.read.table("imdb_final_project.gold_BRIDGE_Akas")

print(f"Current DIM_Title rows: {dim_title.count():,}")
print(f"Duplicates to remove: ~1")

# Columns that define "the same title row".
dedup_columns = [
    "Tconst",
    "TitleType",
    "PrimaryTitle",
    "OriginalTitle",
    "IsAdult",
    "ReleaseYear",
    "RuntimeMinutes",
]

# Add SCD columns if they exist
scd_columns = ["EffectiveDate", "EndDate", "IsCurrent", "CreatedDate", "ModifiedDate"]
for scd_col in scd_columns:
    if scd_col in dim_title.columns:
        dedup_columns.append(scd_col)

# Hash the comparison columns (nulls become the text "NULL") so rows can be grouped on one value.
title_hash_inputs = [coalesce(col(c).cast("string"), lit("NULL")) for c in dedup_columns]
dim_title_with_hash = dim_title.withColumn("row_hash", md5(concat_ws("||", *title_hash_inputs)))

# Hash values that occur more than once, joined back to fetch the rows behind them.
# This returns EVERY row of each duplicate set, not only the surplus ones.
repeated_title_hashes = (
    dim_title_with_hash
    .groupBy("row_hash")
    .agg(count("*").alias("dup_count"))
    .filter(col("dup_count") > 1)
)
duplicate_title_keys = (
    repeated_title_hashes
    .join(dim_title_with_hash, "row_hash")
    .select("TitleKey", "Tconst", "PrimaryTitle")
)

print("\nüîç Duplicate TitleKeys that will be removed:")
duplicate_title_keys.show(20, truncate=False)

# Pull the distinct keys to the driver so they can be used in isin() filters.
duplicate_title_keys_list = [
    key_row["TitleKey"]
    for key_row in duplicate_title_keys.select("TitleKey").distinct().collect()
]

print(f"\nüìä Checking references to duplicate TitleKeys:")
print(f"   - Total duplicate TitleKeys: {len(duplicate_title_keys_list)}")

if not duplicate_title_keys_list:
    print("‚úÖ SAFE TO DEDUPLICATE")
else:
    # Check all tables that reference TitleKey
    references_duplicate_title = col("TitleKey").isin(duplicate_title_keys_list)
    principals_refs = dim_principals.filter(references_duplicate_title).count()
    ratings_refs = fact_ratings.filter(references_duplicate_title).count()
    episodes_refs = fact_episodes.filter(references_duplicate_title).count()
    genre_refs = bridge_genre.filter(references_duplicate_title).count()
    crew_refs = bridge_crew.filter(references_duplicate_title).count()
    akas_refs = bridge_akas.filter(references_duplicate_title).count()

    print(f"   - Referenced in DIM_Principals: {principals_refs}")
    print(f"   - Referenced in FACT_Title_Ratings: {ratings_refs}")
    print(f"   - Referenced in FACT_Episodes: {episodes_refs}")
    print(f"   - Referenced in BRIDGE_TITLE_GENRE: {genre_refs}")
    print(f"   - Referenced in Bridge_Title_Crew: {crew_refs}")
    print(f"   - Referenced in BRIDGE_Akas: {akas_refs}")

    total_refs = sum((principals_refs, ratings_refs, episodes_refs, genre_refs, crew_refs, akas_refs))

    if total_refs == 0:
        print("\n‚úÖ No other tables reference these duplicate TitleKeys")
        print("   ‚úÖ SAFE TO DEDUPLICATE")
    else:
        print(f"\n‚ö†Ô∏è  WARNING: {total_refs} rows across multiple tables reference duplicate TitleKeys!")
        print("   ‚ùå NOT SAFE TO DEDUPLICATE without updating references!")
        print("\n   üîß SOLUTION: Must update all referencing tables to point to the kept TitleKey")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Safety Check Summary

# COMMAND ----------

# Closing banner. The text below is a fixed literal; it is not derived from the
# results of the checks that ran above.
print(f"\n{'=' * 100}")
print("SAFETY CHECK SUMMARY")
print("=" * 100)

safety_summary_text = """
‚úÖ SAFE TO DEDUPLICATE (No References):
   - gold_BRIDGE_Akas (bridge table - not referenced by others)
   - gold_BRIDGE_PROFESSION (bridge table - not referenced by others)

‚ö†Ô∏è  REQUIRES CAREFUL HANDLING (Dimension Tables):
   - gold_DIM_NAME (may be referenced by Principals, Profession bridge, Crew bridge)
   - gold_DIM_Region (may be referenced by BRIDGE_Akas)
   - gold_DIM_Title (referenced by MANY tables - highest risk!)

üìã RECOMMENDATION:
   1. Run the checks above to see if duplicate keys are actually referenced
   2. If NO references found ‚Üí Safe to deduplicate
   3. If references found ‚Üí Need to update FK references before deduplication
"""
print(safety_summary_text)

print("=" * 100)