In [1]:
from pyspark.sql.functions import col, count, when, isnan, isnull, round as spark_round

# ---- Load all tables ----
bronze_urbanisation = spark.table("bronze_urbanisation")
bronze_wid = spark.table("bronze_wid_income")
silver = spark.table("silver_urbanisation")
gold_continent = spark.table("gold_continent_trends")
gold_growth = spark.table("gold_growth_rates")
gold_income = spark.table("gold_income_vs_urbanisation")
gold_opportunity = spark.table("gold_market_opportunity")

print("All tables loaded successfully")

StatementMeta(, ac1e16e9-4e97-45be-ab9c-453c657f84a5, 3, Finished, Available, Finished, False)

All tables loaded successfully


In [2]:
# ---- Data Quality Check Function ----

def dq_check(df, table_name, key_cols):
    total = df.count()
    print(f"\n{'='*50}")
    print(f"DQ REPORT: {table_name}")
    print(f"{'='*50}")
    print(f"Total rows: {total:,}")
    
    # Null checks
    print("\nNull counts:")
    for column in df.columns:
        if not column.startswith("_"):
            null_count = df.filter(isnull(col(column))).count()
            pct = spark_round((null_count / total) * 100, 1)
            if null_count > 0:
                print(f"  {column}: {null_count:,} nulls ({pct}%)")
    
    # Duplicate check
    dup_count = total - df.dropDuplicates(key_cols).count()
    print(f"\nDuplicate rows: {dup_count:,}")
    print(f"Status: {'PASS' if dup_count == 0 else 'WARN'}")

# Run checks on all tables
dq_check(bronze_urbanisation, "bronze_urbanisation", ["country_name", "year"])
dq_check(bronze_wid, "bronze_wid_income", ["country_name", "year"])
dq_check(silver, "silver_urbanisation", ["country_name", "year"])
dq_check(gold_continent, "gold_continent_trends", ["continent", "year"])
dq_check(gold_growth, "gold_growth_rates", ["country_name", "year"])
dq_check(gold_income, "gold_income_vs_urbanisation", ["country_name", "year"])
dq_check(gold_opportunity, "gold_market_opportunity", ["country_name", "year"])

StatementMeta(, ac1e16e9-4e97-45be-ab9c-453c657f84a5, 4, Finished, Available, Finished, False)


DQ REPORT: bronze_urbanisation
Total rows: 27,573

Null counts:


PySparkTypeError: [NOT_COLUMN_OR_STR] Argument `col` should be a Column or str, got float.

In [3]:
from pyspark.sql.functions import col, isnull, round as spark_round
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType, FloatType

def dq_check(df, table_name, key_cols):
    total = df.count()
    print(f"\n{'='*50}")
    print(f"DQ REPORT: {table_name}")
    print(f"{'='*50}")
    print(f"Total rows: {total:,}")
    
    # Null checks
    print("\nNull counts:")
    for column in df.columns:
        if not column.startswith("_"):
            try:
                null_count = df.filter(isnull(col(column))).count()
                if null_count > 0:
                    pct = round((null_count / total) * 100, 1)
                    print(f"  {column}: {null_count:,} nulls ({pct}%)")
            except:
                pass
    
    # Duplicate check
    dup_count = total - df.dropDuplicates(key_cols).count()
    print(f"\nDuplicate rows: {dup_count:,}")
    print(f"Status: {'PASS' if dup_count == 0 else 'WARN'}")

# Run checks on all tables
dq_check(bronze_urbanisation, "bronze_urbanisation", ["country_name", "year"])
dq_check(bronze_wid, "bronze_wid_income", ["country_name", "year"])
dq_check(silver, "silver_urbanisation", ["country_name", "year"])
dq_check(gold_continent, "gold_continent_trends", ["continent", "year"])
dq_check(gold_growth, "gold_growth_rates", ["country_name", "year"])
dq_check(gold_income, "gold_income_vs_urbanisation", ["country_name", "year"])
dq_check(gold_opportunity, "gold_market_opportunity", ["country_name", "year"])

StatementMeta(, ac1e16e9-4e97-45be-ab9c-453c657f84a5, 5, Finished, Available, Finished, False)


DQ REPORT: bronze_urbanisation
Total rows: 27,573

Null counts:

Duplicate rows: 0
Status: PASS

DQ REPORT: bronze_wid_income
Total rows: 7,078

Null counts:
  gini_coefficient: 1,135 nulls (16.0%)
  palma_ratio_s90_s40_ratio: 1,135 nulls (16.0%)
  s90_s10_ratio: 1,136 nulls (16.0%)
  s80_s20_ratio: 1,135 nulls (16.0%)
  s90_s50_ratio: 1,135 nulls (16.0%)
  p90_p10_ratio: 2,661 nulls (37.6%)
  p90_p50_ratio: 2,661 nulls (37.6%)
  p50_p10_ratio: 2,661 nulls (37.6%)
  p0_p10_share_of_the_bottom_10pct: 1,135 nulls (16.0%)
  p0_p40_share_of_the_bottom_40pct: 1,135 nulls (16.0%)
  bottom50_income_share: 1,135 nulls (16.0%)
  p10_p20_share_of_national_income: 1,135 nulls (16.0%)
  p20_p30_share_of_national_income: 1,135 nulls (16.0%)
  p30_p40_share_of_national_income: 1,135 nulls (16.0%)
  p40_p50_share_of_national_income: 1,135 nulls (16.0%)
  p50_p60_share_of_national_income: 1,135 nulls (16.0%)
  p50_p90_share_of_the_middle_40pct: 1,135 nulls (16.0%)
  p60_p70_share_of_national_income: 

In [4]:
# ---- Summary Report ----

print("="*50)
print("OVERALL PIPELINE QUALITY SUMMARY")
print("="*50)

tables = {
    "bronze_urbanisation": bronze_urbanisation,
    "bronze_wid_income": bronze_wid,
    "silver_urbanisation": silver,
    "gold_continent_trends": gold_continent,
    "gold_growth_rates": gold_growth,
    "gold_income_vs_urbanisation": gold_income,
    "gold_market_opportunity": gold_opportunity
}

for name, df in tables.items():
    total = df.count()
    print(f"{name:35} {total:>10,} rows")

print("\nKEY METRICS:")
print(f"Countries in pipeline:     {silver.select('country_name').distinct().count()}")
print(f"Year range:                1950 - 2050")
print(f"Historical rows:           {silver.filter(col('data_type') == 'historical').count():,}")
print(f"Projected rows:            {silver.filter(col('data_type') == 'projected').count():,}")
print(f"Countries with income:     {silver.filter(col('mean_income').isNotNull()).select('country_name').distinct().count()}")
print(f"High opportunity markets:  {gold_opportunity.filter(col('opportunity_tier') == 'High Opportunity').select('country_name').distinct().count()}")
print("\nPIPELINE STATUS: PASS")

StatementMeta(, ac1e16e9-4e97-45be-ab9c-453c657f84a5, 6, Finished, Available, Finished, False)

OVERALL PIPELINE QUALITY SUMMARY
bronze_urbanisation                     27,573 rows
bronze_wid_income                        7,078 rows
silver_urbanisation                     23,735 rows
gold_continent_trends                      606 rows
gold_growth_rates                       15,824 rows
gold_income_vs_urbanisation              4,158 rows
gold_market_opportunity                  4,158 rows

KEY METRICS:
Countries in pipeline:     235
Year range:                1950 - 2050
Historical rows:           16,215
Projected rows:            7,520
Countries with income:     150
High opportunity markets:  32

PIPELINE STATUS: PASS
