In [None]:
from pyspark.sql.functions import trim, col, when, lit

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 10, Finished, Available, Finished, False)

In [None]:
# Regional and income-group aggregate rows that share the "Entity" column with
# real countries. Any name listed here is excluded when the country list is
# built, so only individual countries reach the dimension table.
regions_to_remove = [
    "World", "South-Central Asia", "Eastern Europe", "Europe",
    "High-income countries", "Africa", "Southern Europe",
    "Least developed countries", "South-Eastern Asia",
    "Sub-Saharan Africa", "Central Asia", "Central America",
    "Less developed regions", "Northern America", "Northern Europe",
    "Caribbean", "Eastern Africa", "Western Africa",
    "Low-income countries", "South America", "Polynesia",
    "Southern Asia", "Upper-middle-income countries",
    "Northern Africa", "Melanesia", "Southern Africa",
    "Western Europe", "Western Asia", "Oceania",
    "Eastern Asia", "More developed regions",
    "Lower-middle-income countries", "Middle-income countries",
    "Middle Africa", "Asia", "Latin America and the Caribbean",
    "Australia/New Zealand", "Micronesia (subregion)",
    # These two aggregates were missing from the list and showed up as the
    # only rows without a continent after the joins.
    "Less developed regions, excluding least developed countries",
    "Less developed regions, excluding China",
]

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 11, Finished, Available, Finished, False)

In [None]:
# Build the distinct list of country names from the urbanisation data:
# trim "Entity", rewrite three names to the spelling used in the dimension,
# de-duplicate, then drop every name listed in regions_to_remove.
urbanisation_countries_fixed = (
    urbanisation
    .select(trim(col("Entity")).alias("country_name"))
    .withColumn(
        "country_name",
        when(col("country_name") == "Swaziland", "Eswatini")
        .when(col("country_name") == "Macedonia", "North Macedonia")
        .when(col("country_name") == "Czech Republic", "Czechia")
        .otherwise(col("country_name")),
    )
    .distinct()
    .where(~col("country_name").isin(regions_to_remove))
)

print("Countries after filtering:", urbanisation_countries_fixed.count())

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 12, Finished, Available, Finished, False)

Countries after filtering: 235


In [None]:
# Country -> continent lookup: trim "Entity", expose "Countries Continents"
# as `continent`, apply the same three name rewrites used for the country
# list so the join keys line up, and keep distinct rows only.
continents_fixed = (
    continents
    .select(
        trim(col("Entity")).alias("country_name"),
        col("Countries Continents").alias("continent"),
    )
    .withColumn(
        "country_name",
        when(col("country_name") == "Swaziland", "Eswatini")
        .when(col("country_name") == "Macedonia", "North Macedonia")
        .when(col("country_name") == "Czech Republic", "Czechia")
        .otherwise(col("country_name")),
    )
    .distinct()
)

print("Continent mappings:", continents_fixed.count())

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 13, Finished, Available, Finished, False)

Continent mappings: 285


In [None]:
# Country -> WHO region lookup: trim "Entity", expose "WHO region" as
# `who_region`, apply the same three name rewrites used for the country list
# so the join keys line up, and keep distinct rows only.
who_fixed = (
    who_regions
    .select(
        trim(col("Entity")).alias("country_name"),
        col("WHO region").alias("who_region"),
    )
    .withColumn(
        "country_name",
        when(col("country_name") == "Swaziland", "Eswatini")
        .when(col("country_name") == "Macedonia", "North Macedonia")
        .when(col("country_name") == "Czech Republic", "Czechia")
        .otherwise(col("country_name")),
    )
    .distinct()
)

print("WHO region mappings:", who_fixed.count())

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 14, Finished, Available, Finished, False)

WHO region mappings: 194


In [None]:
# Assemble the country dimension: left-join the continent and WHO-region
# lookups onto the country list (so unmatched countries are kept with nulls),
# then hard-code both attributes for two specific countries.
country_dim_fixed = (
    urbanisation_countries_fixed
    .join(continents_fixed, on="country_name", how="left")
    .join(who_fixed, on="country_name", how="left")
    # Manual continent assignments; every other row keeps its joined value.
    .withColumn(
        "continent",
        when(col("country_name") == "Caribbean Netherlands", "North America")
        .when(col("country_name") == "Micronesia", "Oceania")
        .otherwise(col("continent")),
    )
    # Manual WHO-region assignments for the same two countries.
    .withColumn(
        "who_region",
        when(col("country_name") == "Caribbean Netherlands", "Americas")
        .when(col("country_name") == "Micronesia", "Western Pacific")
        .otherwise(col("who_region")),
    )
)

# Persist as a Delta table, replacing any existing "country_dimension".
(
    country_dim_fixed.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("country_dimension")
)

print("Saved successfully")

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 15, Finished, Available, Finished, False)

Saved successfully


In [None]:
# Coverage check on the joined dimension: how many rows there are, how many
# have a continent / WHO region, and how many still have no continent.
total = country_dim_fixed.count()
has_continent = country_dim_fixed.filter(col("continent").isNotNull()).count()
has_who = country_dim_fixed.filter(col("who_region").isNotNull()).count()
missing = country_dim_fixed.filter(col("continent").isNull()).count()

print("Total countries:", total)
print("With continent:", has_continent)
print("With WHO region:", has_who)
print("Still missing:", missing)

# When anything is unmatched, list up to 50 of the offending names in full.
if missing > 0:
    country_dim_fixed.filter(col("continent").isNull()) \
        .select("country_name").show(50, truncate=False)

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 16, Finished, Available, Finished, False)

SyntaxError: incomplete input (3332100629.py, line 13)

In [None]:
# Coverage report for the joined dimension: row count, rows with a continent,
# rows with a WHO region, and rows whose continent is still null.
total = country_dim_fixed.count()
has_continent = country_dim_fixed.where(col("continent").isNotNull()).count()
has_who = country_dim_fixed.where(col("who_region").isNotNull()).count()
missing = country_dim_fixed.where(col("continent").isNull()).count()

print("Total countries:", total)
print("With continent:", has_continent)
print("With WHO region:", has_who)
print("Still missing:", missing)

# Show (untruncated, up to 50 rows) the names that have no continent.
if missing > 0:
    (
        country_dim_fixed
        .where(col("continent").isNull())
        .select("country_name")
        .show(50, truncate=False)
    )

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 17, Finished, Available, Finished, False)

Total countries: 235
With continent: 233
With WHO region: 195
Still missing: 2


+-----------------------------------------------------------+
|country_name                                               |
+-----------------------------------------------------------+
|Less developed regions, excluding least developed countries|
|Less developed regions, excluding China                    |
+-----------------------------------------------------------+



In [None]:
# Drop the two regional aggregate rows that are still present in the
# dimension.
country_dim_final = country_dim_fixed.where(
    ~col("country_name").isin(
        "Less developed regions, excluding least developed countries",
        "Less developed regions, excluding China",
    )
)

# Persist the cleaned dimension, replacing the existing "country_dimension"
# Delta table.
(
    country_dim_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("country_dimension")
)

# Final verification: row count and continent coverage of the saved frame.
total = country_dim_final.count()
has_continent = country_dim_final.where(col("continent").isNotNull()).count()
missing = country_dim_final.where(col("continent").isNull()).count()

print("Total countries:", total)
print("With continent:", has_continent)
print("Missing continent:", missing)
print("Country dimension complete and ready to use")

StatementMeta(, b5f40202-c66b-4fa8-bf93-e133bd0d6c99, 18, Finished, Available, Finished, False)

Total countries: 233
With continent: 233
Missing continent: 0
Country dimension complete and ready to use
