In [1]:
from pyspark.sql.functions import (
    col, round as spark_round, sum as spark_sum,
    avg, lag, when, lit, ntile, rank
)
from pyspark.sql.window import Window

# ---- Load Silver table ----
silver = spark.table("silver_urbanisation")

print("Silver table loaded successfully")
print("Rows:", silver.count())

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 3, Finished, Available, Finished, False)

Silver table loaded successfully
Rows: 23735


In [2]:
# ---- Gold Table 1: Urbanisation trends by continent ----

gold_continent_trends = silver \
    .filter(col("continent").isNotNull()) \
    .groupBy("continent", "year", "data_type") \
    .agg(
        spark_sum("urban_population").alias("total_urban_population"),
        spark_sum("rural_population").alias("total_rural_population"),
        spark_sum("total_population").alias("total_population"),
        avg("urban_pct").alias("avg_urban_pct"),
        avg("mean_income").alias("avg_mean_income"),
        avg("gini_coefficient").alias("avg_gini")
    ) \
    .withColumn("avg_urban_pct", spark_round(col("avg_urban_pct"), 2)) \
    .withColumn("avg_mean_income", spark_round(col("avg_mean_income"), 2)) \
    .withColumn("avg_gini", spark_round(col("avg_gini"), 4)) \
    .orderBy("continent", "year")

gold_continent_trends.write.format("delta").mode("overwrite") \
    .saveAsTable("gold_continent_trends")

print("Gold continent trends saved")
print("Rows:", gold_continent_trends.count())
gold_continent_trends.show(5, truncate=False)

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 4, Finished, Available, Finished, False)

Gold continent trends saved
Rows: 606
+---------+----+----------+----------------------+----------------------+----------------+-------------+---------------+--------+
|continent|year|data_type |total_urban_population|total_rural_population|total_population|avg_urban_pct|avg_mean_income|avg_gini|
+---------+----+----------+----------------------+----------------------+----------------+-------------+---------------+--------+
|Africa   |1950|historical|32653000              |195743000             |228396000       |13.69        |NULL           |NULL    |
|Africa   |1951|historical|34204000              |198793000             |232997000       |14.02        |NULL           |NULL    |
|Africa   |1952|historical|35853000              |201979000             |237832000       |14.4         |NULL           |NULL    |
|Africa   |1953|historical|37598000              |205290000             |242888000       |14.78        |NULL           |NULL    |
|Africa   |1954|historical|39448000              |20

In [3]:
# ---- Gold Table 2: Country year on year growth rates ----

window_spec = Window.partitionBy("country_name").orderBy("year")

gold_growth_rates = silver \
    .filter(col("data_type") == "historical") \
    .withColumn("prev_urban_pop", lag("urban_population", 1).over(window_spec)) \
    .withColumn("prev_total_pop", lag("total_population", 1).over(window_spec)) \
    .withColumn("prev_mean_income", lag("mean_income", 1).over(window_spec)) \
    .withColumn("urban_growth_rate",
        spark_round(
            ((col("urban_population") - col("prev_urban_pop")) / col("prev_urban_pop")) * 100, 2)
    ) \
    .withColumn("population_growth_rate",
        spark_round(
            ((col("total_population") - col("prev_total_pop")) / col("prev_total_pop")) * 100, 2)
    ) \
    .withColumn("income_growth_rate",
        spark_round(
            ((col("mean_income") - col("prev_mean_income")) / col("prev_mean_income")) * 100, 2)
    ) \
    .filter(col("urban_growth_rate").isNotNull()) \
    .select(
        "country_name",
        "continent",
        "who_region",
        "year",
        "urban_population",
        "total_population",
        "urban_pct",
        "mean_income",
        "urban_growth_rate",
        "population_growth_r

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 5, Finished, Available, Finished, False)

SyntaxError: unterminated string literal (detected at line 33) (3662014316.py, line 33)

In [4]:
# Define window and calculate growth rates
window_spec = Window.partitionBy("country_name").orderBy("year")

gold_growth_rates = silver \
    .filter(col("data_type") == "historical") \
    .withColumn("prev_urban_pop", lag("urban_population", 1).over(window_spec)) \
    .withColumn("prev_total_pop", lag("total_population", 1).over(window_spec)) \
    .withColumn("prev_mean_income", lag("mean_income", 1).over(window_spec)) \
    .withColumn("urban_growth_rate",
        spark_round(
            ((col("urban_population") - col("prev_urban_pop")) / col("prev_urban_pop")) * 100, 2)
    ) \
    .withColumn("population_growth_rate",
        spark_round(
            ((col("total_population") - col("prev_total_pop")) / col("prev_total_pop")) * 100, 2)
    ) \
    .withColumn("income_growth_rate",
        spark_round(
            ((col("mean_income") - col("prev_mean_income")) / col("prev_mean_income")) * 100, 2)
    ) \
    .filter(col("urban_growth_rate").isNotNull())

print("Growth rates calculated:", gold_growth_rates.count())

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 6, Finished, Available, Finished, False)

Growth rates calculated: 15824


In [5]:
# Select final columns and save
gold_growth_rates = gold_growth_rates.select(
    "country_name",
    "continent",
    "who_region",
    "year",
    "urban_population",
    "total_population",
    "urban_pct",
    "mean_income",
    "urban_growth_rate",
    "population_growth_rate",
    "income_growth_rate"
)

gold_growth_rates.write.format("delta").mode("overwrite") \
    .saveAsTable("gold_growth_rates")

print("Gold growth rates saved")
print("Rows:", gold_growth_rates.count())
gold_growth_rates.show(5, truncate=False)

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 7, Finished, Available, Finished, False)

Gold growth rates saved
Rows: 15824
+------------+---------+---------------------+----+----------------+----------------+---------+-----------+-----------------+----------------------+------------------+
|country_name|continent|who_region           |year|urban_population|total_population|urban_pct|mean_income|urban_growth_rate|population_growth_rate|income_growth_rate|
+------------+---------+---------------------+----+----------------+----------------+---------+-----------+-----------------+----------------------+------------------+
|Afghanistan |Asia     |Eastern Mediterranean|1951|487000          |7840000         |6.21     |NULL       |4.73             |1.14                  |NULL              |
|Afghanistan |Asia     |Eastern Mediterranean|1952|510000          |7935000         |6.43     |NULL       |4.72             |1.21                  |NULL              |
|Afghanistan |Asia     |Eastern Mediterranean|1953|534000          |8039000         |6.64     |NULL       |4.71             

In [6]:
    # ---- Gold Table 3: Income vs Urbanisation ----

gold_income_vs_urban = silver \
    .filter(col("mean_income").isNotNull()) \
    .filter(col("urban_pct").isNotNull()) \
    .select(
        "country_name",
        "continent",
        "who_region",
        "year",
        "data_type",
        "urban_pct",
        "rural_pct",
        "total_population",
        "mean_income",
        "median_income",
        "gini_coefficient",
        "top1_income_share",
        "top10_income_share",
        "bottom50_income_share"
    )

gold_income_vs_urban.write.format("delta").mode("overwrite") \
    .saveAsTable("gold_income_vs_urbanisation")

print("Gold income vs urbanisation saved")
print("Rows:", gold_income_vs_urban.count())
gold_income_vs_urban.show(5, truncate=False)

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 8, Finished, Available, Finished, False)

Gold income vs urbanisation saved
Rows: 4158
+------------+---------+---------------------+----+----------+---------+---------+----------------+-----------------+-----------------+-----------------+------------------+------------------+---------------------+
|country_name|continent|who_region           |year|data_type |urban_pct|rural_pct|total_population|mean_income      |median_income    |gini_coefficient |top1_income_share |top10_income_share|bottom50_income_share|
+------------+---------+---------------------+----+----------+---------+---------+----------------+-----------------+-----------------+-----------------+------------------+------------------+---------------------+
|Afghanistan |Asia     |Eastern Mediterranean|2008|historical|23.32    |76.68    |27294000        |3553.318948336138|2189.86421366986 |0.524523877237829|16.220000000000002|41.94             |17.16                |
|Afghanistan |Asia     |Eastern Mediterranean|2012|historical|24.16    |75.84    |30697000        |

In [7]:
# ---- Gold Table 4: Market Opportunity Score ----

window_income = Window.partitionBy("year").orderBy("mean_income")
window_urban = Window.partitionBy("year").orderBy("urban_pct")
window_pop = Window.partitionBy("year").orderBy("total_population")

gold_opportunity = silver \
    .filter(col("mean_income").isNotNull()) \
    .filter(col("urban_pct").isNotNull()) \
    .filter(col("continent").isNotNull()) \
    .withColumn("income_rank", ntile(100).over(window_income)) \
    .withColumn("urban_rank", ntile(100).over(window_urban)) \
    .withColumn("population_rank", ntile(100).over(window_pop)) \
    .withColumn("opportunity_score",
        spark_round(
            (col("income_rank") * 0.4) +
            (col("urban_rank") * 0.4) +
            (col("population_rank") * 0.2), 2
        )
    ) \
    .withColumn("opportunity_tier",
        when(col("opportunity_score") >= 75, "High Opportunity")
        .when(col("opportunity_score") >= 50, "Medium Opportunity")

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 9, Finished, Available, Finished, False)

SyntaxError: incomplete input (1442020316.py, line 23)

In [8]:
# ---- Gold Table 4: Market Opportunity Score ----

window_income = Window.partitionBy("year").orderBy("mean_income")
window_urban = Window.partitionBy("year").orderBy("urban_pct")
window_pop = Window.partitionBy("year").orderBy("total_population")

gold_opportunity = silver \
    .filter(col("mean_income").isNotNull()) \
    .filter(col("urban_pct").isNotNull()) \
    .filter(col("continent").isNotNull()) \
    .withColumn("income_rank", ntile(100).over(window_income)) \
    .withColumn("urban_rank", ntile(100).over(window_urban)) \
    .withColumn("population_rank", ntile(100).over(window_pop)) \
    .withColumn("opportunity_score",
        spark_round(
            (col("income_rank") * 0.4) +
            (col("urban_rank") * 0.4) +
            (col("population_rank") * 0.2), 2
        )
    ) \
    .withColumn("opportunity_tier",
        when(col("opportunity_score") >= 75, "High Opportunity")
        .when(col("opportunity_score") >= 50, "Medium Opportunity")
        .when(col("opportunity_score") >= 25, "Low Opportunity")
        .otherwise("Emerging")
    ) \
    .select(
        "country_name",
        "continent",
        "who_region",
        "year",
        "urban_pct",
        "total_population",
        "mean_income",
        "gini_coefficient",
        "income_rank",
        "urban_rank",
        "population_rank",
        "opportunity_score",
        "opportunity_tier"
    )

gold_opportunity.write.format("delta").mode("overwrite") \
    .saveAsTable("gold_market_opportunity")

print("Gold market opportunity saved")
print("Rows:", gold_opportunity.count())
gold_opportunity.show(5, truncate=False)

StatementMeta(, 2d9e7e24-4924-4656-8098-636c276fb064, 10, Finished, Available, Finished, False)

Gold market opportunity saved
Rows: 4158
+-------------+-------------+---------------+----+---------+----------------+-----------------+-----------------+-----------+----------+---------------+-----------------+----------------+
|country_name |continent    |who_region     |year|urban_pct|total_population|mean_income      |gini_coefficient |income_rank|urban_rank|population_rank|opportunity_score|opportunity_tier|
+-------------+-------------+---------------+----+---------+----------------+-----------------+-----------------+-----------+----------+---------------+-----------------+----------------+
|New Zealand  |Oceania      |Western Pacific|1950|72.54    |1908000         |25316.57830961436|0.565495051714954|5          |4         |1              |3.8              |Emerging        |
|Australia    |Oceania      |Western Pacific|1950|77.0     |8178000         |20523.41589346674|0.492502534300147|3          |5         |2              |3.6              |Emerging        |
|Canada       |Nort