In [1]:
from pyspark.sql.functions import current_timestamp, lit, trim, col

base_path = "abfss://5b3f2182-6f1d-4a71-bc81-4aa09677ce5d@onelake.dfs.fabric.microsoft.com/e90e14bb-e898-4e8f-b07e-38e648bf3ab0"

# ---- Read all four raw files ----
urbanisation = spark.read.option("header", True) \
    .csv(f"{base_path}/Files/Urban and rural population 1950-2050 - UN World Urbanization Prospects 2018.csv")

wid = spark.read.option("header", True) \
    .csv(f"{base_path}/Files/World Inequality Database (WID) - Pretax income.csv")

continents = spark.read.option("header", True) \
    .csv(f"{base_path}/Files/Countries Continents.csv")

who_regions = spark.read.option("header", True) \
    .csv(f"{base_path}/Files/OWID country to WHO regions.csv")

print("All files loaded successfully")

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 3, Finished, Available, Finished, False)

All files loaded successfully


In [2]:
# ---- Add metadata and save Bronze tables ----

def save_bronze(df, table_name, source_name):
    df.withColumn("_ingestion_timestamp", current_timestamp()) \
      .withColumn("_source", lit(source_name)) \
      .write.format("delta").mode("overwrite") \
      .saveAsTable(f"bronze_{table_name}")
    print(f"Saved bronze_{table_name} — {df.count()} rows")

save_bronze(urbanisation, "urbanisation", "UN_World_Urbanization_Prospects_2018")
save_bronze(wid, "wid_income", "World_Inequality_Database")
save_bronze(continents, "continents", "Countries_Continents")
save_bronze(who_regions, "who_regions", "OWID_WHO_Regions")

print("All Bronze tables saved successfully")

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 4, Finished, Available, Finished, False)

AnalysisException: [DELTA_INVALID_CHARACTERS_IN_COLUMN_NAMES] Found invalid character(s) among ' ,;{}()\n\t=' in the column names of your schema.
Invalid column names: Urban population 1950-2050 (UN World Urbanization Prospects 2018).
Please use other characters and try again.
Alternatively, enable Column Mapping to keep using these characters.

In [3]:
# Rename columns with special characters

urbanisation_clean = urbanisation \
    .withColumnRenamed(
        "Urban population 1950-2050 (UN World Urbanization Prospects 2018)",
        "urban_population"
    ) \
    .withColumnRenamed(
        "Rural population 1950-2050 (UN World Urbanization Prospects 2018)",
        "rural_population"
    ) \
    .withColumnRenamed("Entity", "country_name") \
    .withColumnRenamed("Year", "year")

wid_clean = wid \
    .withColumnRenamed("Entity", "country_name") \
    .withColumnRenamed("Year", "year") \
    .withColumnRenamed("Mean income", "mean_income") \
    .withColumnRenamed("Gini coefficient", "gini_coefficient") \
    .withColumnRenamed("P90-P100 - share of the top 10%", "top10_income_share") \
    .withColumnRenamed("P99-P100 - share of the top 1%", "top1_income_share") \
    .withColumnRenamed("P0-P50 - share of the bottom 50%", "bottom50_income_share") \
    .withColumnRenamed("P50 - income threshold (median)", "median_income")

continents_clean = continents \
    .withColumnRenamed("Entity", "country_name") \
    .withColumnRenamed("Year", "year") \
    .withColumnRenamed("Countries Continents", "continent")

who_clean = who_regions \
    .withColumnRenamed("Entity", "country_name") \
    .withColumnRenamed("Year", "year") \
    .withColumnRenamed("WHO region", "who_region")

print("Columns renamed successfully")
print("Urbanisation columns:", urbanisation_clean.columns)
print("WID columns:", wid_clean.columns)


StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 5, Finished, Available, Finished, False)

Columns renamed successfully
Urbanisation columns: ['country_name', 'year', 'urban_population', 'rural_population']
WID columns: ['country_name', 'year', 'gini_coefficient', 'Palma ratio (S90/S40 ratio)', 'S90/S10 ratio', 'S80/S20 ratio', 'S90/S50 ratio', 'P90/P10 ratio', 'P90/P50 ratio', 'P50/P10 ratio', 'P0-P10 - share of the bottom 10%', 'P0-P40 - share of the bottom 40%', 'bottom50_income_share', 'P10-P20 - share of national income', 'P20-P30 - share of national income', 'P30-P40 - share of national income', 'P40-P50 - share of national income', 'P50-P60 - share of national income', 'P50-P90 - share of the middle 40%', 'P60-P70 - share of national income', 'P70-P80 - share of national income', 'P80-P90 - share of national income', 'top10_income_share', 'P99.999-P100 - share of the top 0.001%', 'P99.99-P100 - share of the top 0.01%', 'P99.9-P100 - share of the top 0.1%', 'top1_income_share', 'mean_income', 'P0-P10 - mean income of the bottom 10%', 'P10-P20 - mean income', 'P20-P30 -

In [4]:
# Save all four as Bronze Delta tables
save_bronze(urbanisation_clean, "urbanisation", "UN_World_Urbanization_Prospects_2018")
save_bronze(wid_clean, "wid_income", "World_Inequality_Database")
save_bronze(continents_clean, "continents", "Countries_Continents")
save_bronze(who_clean, "who_regions", "OWID_WHO_Regions")

print("All Bronze tables saved successfully")
```

You should see four lines like this:
```
Saved bronze_urbanisation — 27573 rows
Saved bronze_wid_income — 7078 rows
Saved bronze_continents — 285 rows
Saved bronze_who_regions — 194 rows
All Bronze tables saved successfully

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 6, Finished, Available, Finished, False)

SyntaxError: invalid character '—' (U+2014) (3652667379.py, line 12)

In [5]:
# Save all four as Bronze Delta tables
save_bronze(urbanisation_clean, "urbanisation", "UN_World_Urbanization_Prospects_2018")
save_bronze(wid_clean, "wid_income", "World_Inequality_Database")
save_bronze(continents_clean, "continents", "Countries_Continents")
save_bronze(who_clean, "who_regions", "OWID_WHO_Regions")

print("All Bronze tables saved successfully")
```

You should see four lines like this:
```
Saved bronze_urbanisation — 27573 rows
Saved bronze_wid_income — 7078 rows
Saved bronze_continents — 285 rows
Saved bronze_who_regions — 194 rows
All Bronze tables saved successfully

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 7, Finished, Available, Finished, False)

SyntaxError: invalid character '—' (U+2014) (3652667379.py, line 12)

In [6]:
save_bronze(urbanisation_clean, "urbanisation", "UN_World_Urbanization_Prospects_2018")
save_bronze(wid_clean, "wid_income", "World_Inequality_Database")
save_bronze(continents_clean, "continents", "Countries_Continents")
save_bronze(who_clean, "who_regions", "OWID_WHO_Regions")

print("All Bronze tables saved successfully")

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 8, Finished, Available, Finished, False)

Saved bronze_urbanisation — 27573 rows


AnalysisException: [DELTA_INVALID_CHARACTERS_IN_COLUMN_NAMES] Found invalid character(s) among ' ,;{}()\n\t=' in the column names of your schema.
Invalid column names: Palma ratio (S90/S40 ratio).
Please use other characters and try again.
Alternatively, enable Column Mapping to keep using these characters.

In [7]:
import re

def clean_column_names(df):
    for col_name in df.columns:
        new_name = col_name \
            .replace(" ", "_") \
            .replace("(", "") \
            .replace(")", "") \
            .replace("/", "_") \
            .replace("-", "_") \
            .replace("%", "pct") \
            .replace(",", "") \
            .replace(".", "")
        # Remove double underscores
        new_name = re.sub(r'_+', '_', new_name)
        new_name = new_name.strip("_").lower()
        df = df.withColumnRenamed(col_name, new_name)
    return df

# Apply to all four datasets
urbanisation_clean = clean_column_names(urbanisation_clean)
wid_clean = clean_column_names(wid_clean)
continents_clean = clean_column_names(continents_clean)
who_clean = clean_column_names(who_clean)

print("All column names cleaned")
print("\nWID columns:")
for c in wid_clean.columns:
    print(" ", c)

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 9, Finished, Available, Finished, False)

All column names cleaned

WID columns:
  country_name
  year
  gini_coefficient
  palma_ratio_s90_s40_ratio
  s90_s10_ratio
  s80_s20_ratio
  s90_s50_ratio
  p90_p10_ratio
  p90_p50_ratio
  p50_p10_ratio
  p0_p10_share_of_the_bottom_10pct
  p0_p40_share_of_the_bottom_40pct
  bottom50_income_share
  p10_p20_share_of_national_income
  p20_p30_share_of_national_income
  p30_p40_share_of_national_income
  p40_p50_share_of_national_income
  p50_p60_share_of_national_income
  p50_p90_share_of_the_middle_40pct
  p60_p70_share_of_national_income
  p70_p80_share_of_national_income
  p80_p90_share_of_national_income
  top10_income_share
  p99999_p100_share_of_the_top_0001pct
  p9999_p100_share_of_the_top_001pct
  p999_p100_share_of_the_top_01pct
  top1_income_share
  mean_income
  p0_p10_mean_income_of_the_bottom_10pct
  p10_p20_mean_income
  p20_p30_mean_income
  p30_p40_mean_income
  p40_p50_mean_income
  p50_p60_mean_income
  p60_p70_mean_income
  p70_p80_mean_income
  p80_p90_mean_income
  p

In [8]:
save_bronze(urbanisation_clean, "urbanisation", "UN_World_Urbanization_Prospects_2018")
save_bronze(wid_clean, "wid_income", "World_Inequality_Database")
save_bronze(continents_clean, "continents", "Countries_Continents")
save_bronze(who_clean, "who_regions", "OWID_WHO_Regions")

print("All Bronze tables saved successfully")

StatementMeta(, 60a9325c-0938-43aa-87b7-e8d843b235ae, 10, Finished, Available, Finished, False)

Saved bronze_urbanisation — 27573 rows
Saved bronze_wid_income — 7078 rows
Saved bronze_continents — 285 rows
Saved bronze_who_regions — 194 rows
All Bronze tables saved successfully
