In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from functools import reduce
import seaborn as sns; sns.set(color_codes=True)
import statsmodels.api as sm 
from stargazer.stargazer import Stargazer
from IPython.display import HTML, display
import re
import os 
import pycountry
import unicodedata
import zipfile
import io
import tabula
import jpype
from tabula.io import read_pdf
from docx import Document
from pdf2docx import Converter
from io import BytesIO
from io import StringIO
import requests
import tempfile
from docx import Document
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.iolib.summary2 import summary_col
import warnings
from statsmodels.tools.sm_exceptions import IterationLimitWarning
from linearmodels.panel import PanelOLS
from statsmodels.sandbox.regression.gmm import IV2SLS
import patsy
from scipy import stats
from statsmodels.robust.norms import HuberT
from sklearn.decomposition import PCA
from matplotlib.lines import Line2D

## Get Vaccine data ##

Source: https://www.unicef.org/supply/covid-19-market-dashboard + 
https://unicef-my.sharepoint.com/:x:/g/personal/tadino_unicef_org/EV6ICM0-Ve9EkJ-0uYvUOG0BKi15jhor_8C_m1WsFLsPTA?rtime=U7TolWu53Ug

In [2]:
vaccine_donations_raw = pd.read_csv(r'Vaccine Donations(Vaccine Donations 13052024).csv', encoding='ISO-8859-1')

In [3]:
vaccine_donations_raw

Unnamed: 0,Donor,Recipient,Mechanism,Vaccine name,Vaccine developer,Manufacturer,Donated doses,Facilitated doses,Doses delivered
0,United States of America,Bangladesh,Through COVAX,Comirnaty,Pfizer/BioNTech,Pfizer/BioNTech,60315710.0,48075300.0,108391010.0
1,United States of America,Pakistan,Through COVAX,Comirnaty,Pfizer/BioNTech,Pfizer/BioNTech,52915840.0,20586150.0,73501990.0
2,United States of America,Viet Nam,Through COVAX,Comirnaty,Pfizer/BioNTech,Pfizer/BioNTech,18682560.0,16168230.0,34850790.0
3,United States of America,Indonesia,Through COVAX,Comirnaty,Pfizer/BioNTech,Pfizer/BioNTech,19282770.0,13499460.0,32782230.0
4,United States of America,Egypt,Through COVAX,Comirnaty,Pfizer/BioNTech,Pfizer/BioNTech,18491850.0,9158760.0,27650610.0
...,...,...,...,...,...,...,...,...,...
1544,Sweden,Ukraine,Bilateral,Comirnaty,Pfizer BioNTech,Pfizer/BioNTech,500000.0,,
1545,Turkey,Gabon,Bilateral,Coronavac,Sinovac,Sinovac,100000.0,,
1546,United States of America,Bosnia and Herzegovina,Bilateral,Comirnaty,Pfizer BioNTech,Pfizer/BioNTech,500000.0,,
1547,United States of America,Namibia,Bilateral,Ad26.COV 2.S,Janssen Pharmaceuticals,Janssen,168000.0,,


In [4]:
def aggregate_vaccine_doses(file_path):
    # Read the CSV into a DataFrame
    df = pd.read_csv(file_path, encoding='ISO-8859-1')
    
    # Create a unified column: use 'Doses delivered' unless it is NaN, then fall back to 'Donated doses'
    df['Doses_used'] = df['Doses delivered'].fillna(df['Donated doses'])
    
    # Pivot: index by country, columns by Mechanism, summing the doses
    pivot = (
        df
        .pivot_table(
            index='Donor',
            columns='Mechanism',
            values='Doses_used',
            aggfunc='sum',
            fill_value=0
        )
    )
    
    # Add a 'Total Doses' column summing across all mechanisms for each country
    pivot['Total Doses'] = pivot.sum(axis=1)
    
    # (Optional) Reorder columns so 'Total Doses' is first
    cols = ['Total Doses'] + [c for c in pivot.columns if c != 'Total Doses']
    pivot = pivot[cols]
    
    return pivot

# Replace this path with the location of your CSV file
csv_path = r'Vaccine Donations(Vaccine Donations 13052024).csv'

aggregate_vaccine_donations = aggregate_vaccine_doses(csv_path)
# aggregate_vaccine_donations.to_csv('aggregated_vaccines.csv', index=True)

In [5]:
def clean_and_aggregate(df, country_col='Donor'):
    """
    Standardizes entries in `country_col` to ISO 3166 common names (handling subdivisions and synonyms),
    then consolidates duplicate rows by summing all numeric columns.
    
    Automatically resets index into a column if `country_col` is the index name.
    Raises a clear error listing available columns if `country_col` is not found.
    """
    df_copy = df.copy()

    # If the identifier is the index rather than a column, move it into a column
    if country_col not in df_copy.columns and df_copy.index.name == country_col:
        df_copy = df_copy.reset_index()

    # If still missing, show what's available
    if country_col not in df_copy.columns:
        available = df_copy.columns.tolist()
        idx_name = df_copy.index.name
        raise KeyError(f"Column '{country_col}' not found. Available columns: {available}. Index name: {idx_name!r}")

    # Mapping function
    def map_to_country(name):
        try:
            c = pycountry.countries.lookup(name)
            return c.name
        except LookupError:
            # subdivison match
            for sub in pycountry.subdivisions:
                if name.lower() in sub.name.lower():
                    parent = pycountry.countries.get(alpha_2=sub.country_code)
                    return parent.name
            # manual synonyms
            synonyms = {
                'Republic of Korea': 'Korea, Republic of',
                'Iran (Islamic Republic of)': 'Iran, Islamic Republic of',
                'Turkey': 'Türkiye',
                'China, Hong Kong SAR': 'China',
                'Taiwan': 'Taiwan, Province of China',
                'Bonaire Sint Eustatius and Saba': 'Bonaire, Sint Eustatius and Saba',
                'Cape Verde': 'Cabo Verde',
                "Cote d'Ivoire": "Côte d'Ivoire",
                'Curacao': 'Curaçao',
                'Democratic Republic of Congo': 'Congo, The Democratic Republic of the',
                'East Timor': 'Timor-Leste',
                'Falkland Islands': 'Falkland Islands (Malvinas)',
                'Micronesia (country)': 'Micronesia, Federated States of',
                'Palestine': 'State of Palestine',
                'Reunion': 'Réunion',
                'Russia': 'Russian Federation',
                'Saint Barthelemy': 'Saint Barthélemy',
                'United States Virgin Islands': 'Virgin Islands, U.S.',
                'Vatican': 'Holy See'
            }
            return synonyms.get(name, name)

    # Apply mapping
    df_copy[country_col] = df_copy[country_col].astype(str).apply(map_to_country)

    # Separate numeric vs other columns
    numeric = df_copy.select_dtypes(include='number').columns.tolist()
    others = [c for c in df_copy.columns if c not in numeric + [country_col]]

    # Build aggregation dict
    agg_dict = {c: 'sum' for c in numeric}
    agg_dict.update({c: 'first' for c in others})

    # Group and aggregate
    return df_copy.groupby(country_col, as_index=False).agg(agg_dict)

standardize_aggregate_vaccine_donations = clean_and_aggregate(aggregate_vaccine_donations, country_col='Donor')


In [6]:
standardize_aggregate_vaccine_donations

Mechanism,Donor,Total Doses,Bilateral,Multilateral,Private,Through African Union,Through COVAX
0,A Dose of Hope,16520000.0,0.0,0.0,16520000.0,0.0,0.0
1,Algeria,1700000.0,1700000.0,0.0,0.0,0.0,0.0
2,Alrosa Group,75000.0,0.0,0.0,75000.0,0.0,0.0
3,Argentina,4272000.0,4272000.0,0.0,0.0,0.0,0.0
4,AstraZeneca,31400.0,31400.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
104,United States,696916480.0,79988820.0,0.0,0.0,0.0,616927660.0
105,Unknown,5000.0,5000.0,0.0,0.0,0.0,0.0
106,Uruguay,12000.0,12000.0,0.0,0.0,0.0,0.0
107,Uzbekistan,600000.0,600000.0,0.0,0.0,0.0,0.0


In [7]:
private_donors = (standardize_aggregate_vaccine_donations
                  .loc[standardize_aggregate_vaccine_donations['Private'] > 0, 'Donor'])
print(private_donors.tolist())

['A Dose of Hope', 'Alrosa Group', 'AstraZeneca Viet Nam', 'Bharat Biotech', 'Makhzoumi Foundation', 'Norgold', 'Pfizer/BioNTech', 'Rostec', 'Sesiu Sa Let\x9aoele Le Beta Poho', 'Sinopharm', 'Sinovac', 'TSMC, Hon Hai Yonglin, and Tzu Chi']


In [8]:
def filter_states_with_removed(df, country_col='Donor'):
    """
    Filters DataFrame to include only sovereign states based on ISO 3166,
    and prints the list of removed non-state entries.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with country/entity names.
        country_col (str): Column in df containing the names to filter.
    
    Returns:
        pd.DataFrame: DataFrame containing only valid ISO 3166 countries.
    """
    # Compile valid country names (common + official)
    valid_common = {country.name for country in pycountry.countries}
    valid_official = {
        getattr(country, 'official_name', country.name) 
        for country in pycountry.countries
    }
    valid_countries = valid_common.union(valid_official)

    # Identify rows to keep and to remove
    mask = df[country_col].isin(valid_countries)
    df_states = df[mask].copy()
    df_removed = df[~mask].copy()

    # Print removed entries
    removed_list = df_removed[country_col].unique().tolist()
    print("Removed non-state entries:")
    for entry in removed_list:
        print("  -", entry)

    return df_states

donations_df = filter_states_with_removed(standardize_aggregate_vaccine_donations)

Removed non-state entries:
  - A Dose of Hope
  - Alrosa Group
  - AstraZeneca
  - AstraZeneca Viet Nam
  - Bharat Biotech
  - Conmebol
  - European Union
  - International Committee of the Red Cross
  - International Red Crescent Society
  - MTN Group
  - Makhzoumi Foundation
  - Moderna
  - Norgold
  - Pfizer/BioNTech
  - QUAD
  - Rostec
  - Sesiu Sa Letoele Le Beta Poho
  - Sinopharm
  - Sinovac
  - TSMC, Hon Hai Yonglin, and Tzu Chi
  - Unknown


In [9]:
donations_df

Mechanism,Donor,Total Doses,Bilateral,Multilateral,Private,Through African Union,Through COVAX
1,Algeria,1700000.0,1700000.0,0.0,0.0,0.0,0.0
3,Argentina,4272000.0,4272000.0,0.0,0.0,0.0,0.0
6,Australia,40526290.0,40511890.0,0.0,0.0,0.0,14400.0
7,Austria,9701520.0,4181520.0,0.0,0.0,0.0,5520000.0
8,Azerbaijan,270000.0,270000.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
103,United Kingdom,52722250.0,5780570.0,0.0,0.0,0.0,46941680.0
104,United States,696916480.0,79988820.0,0.0,0.0,0.0,616927660.0
106,Uruguay,12000.0,12000.0,0.0,0.0,0.0,0.0
107,Uzbekistan,600000.0,600000.0,0.0,0.0,0.0,0.0


## Get COVID-19 data ##

The WHO ended the PHEIC for COVID-19 on 5 May 2023: https://www.reuters.com/business/healthcare-pharmaceuticals/covid-is-no-longer-global-health-emergency-who-2023-05-05/

Therefore will use KFF data of H1 2022: 
https://github.com/KFFData/COVID-19-Data/blob/fccdb0ad4598c08f00e838da818577b8f81c9444/Country%20Trend%20Data/global_covid_metrics2023_H1.csv

In [10]:
def load_and_pivot_covid_data():
    """
    Fetches OWID COVID-19 cases/deaths data and pivots it so that:
      - Rows are unique countries
      - Columns are total_cases and total_deaths at half-year intervals
    Returns:
      pd.DataFrame: pivoted DataFrame
    """
    # 1. Load the data from GitHub raw URL
    url = (
        "https://raw.githubusercontent.com/owid/covid-19-data/"
        "c6b482425695ed67d3fff85ce614fc4189cf2c17/"
        "public/data/cases_deaths/full_data.csv"
    )
    df = pd.read_csv(url, parse_dates=['date'])

    # 2. Define half-year cutoffs (6-month frequency)
    half_year_dates = pd.date_range(
        start=df['date'].min(), end=df['date'].max(), freq='6ME'
    )

    # 3. For each country and each cutoff, get the most recent data on or before the cutoff
    records = []
    for country, group in df.groupby('location'):
        grp = group.sort_values('date')
        for cutoff in half_year_dates:
            sub = grp[grp['date'] <= cutoff]
            if not sub.empty:
                last = sub.iloc[-1]
                records.append({
                    'location': country,
                    'cutoff': cutoff,
                    'total_cases': last['total_cases'],
                    'total_deaths': last['total_deaths']
                })

    # 4. Build a DataFrame and pivot
    temp = pd.DataFrame(records)
    pivot = temp.pivot_table(
        index='location',
        columns='cutoff',
        values=['total_cases', 'total_deaths']
    )

    # 5. Flatten column MultiIndex for readability
    pivot.columns = [
        f"{metric}_{date.strftime('%Y-%m')}" 
        for metric, date in pivot.columns
    ]
    pivot = pivot.reset_index()

    return pivot

raw_covid_data = load_and_pivot_covid_data()
print(raw_covid_data .head())


         location  total_cases_2020-01  total_cases_2020-07  \
0     Afghanistan                  0.0              36036.0   
1          Africa                  0.0             836826.0   
2         Albania                  0.0               4570.0   
3         Algeria                  0.0              26764.0   
4  American Samoa                  0.0                  0.0   

   total_cases_2021-01  total_cases_2021-07  total_cases_2022-01  \
0              55023.0             143871.0             161666.0   
1            3577719.0            6511749.0           11068160.0   
2              76350.0             132828.0             255741.0   
3             107122.0             160868.0             249310.0   
4                  0.0                  0.0                 18.0   

   total_cases_2022-07  total_cases_2023-01  total_cases_2023-07  \
0             185580.0             208420.0             224224.0   
1           12739160.0           13035459.0           13105823.0   
2       

In [11]:
def fix_and_consolidate_countries(df, country_col='Donor'):
    """
    Standardize and consolidate country names in `country_col` to ISO 3166 common names.
    Detects subdivisions, maps them to their parent country, and merges duplicate rows
    by summing all numeric columns.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing country or subdivision names.
        country_col (str): The column name in df with country/subdivision names.

    Returns:
        pd.DataFrame: A new DataFrame with:
            - `country_col` standardized to ISO 3166 common country names
            - Duplicate country rows merged (numeric columns summed, first value for others)
    """
    df_copy = df.copy()

    # If the identifier is in the index, reset it
    if country_col not in df_copy.columns and df_copy.index.name == country_col:
        df_copy = df_copy.reset_index()

    if country_col not in df_copy.columns:
        raise KeyError(f"Column '{country_col}' not found in DataFrame.")

    # Normalize function to replace curly apostrophes and standardize accents
    def normalize(name):
        name = name.replace("’", "'").replace("‘", "'")
        return unicodedata.normalize("NFC", name)

    # Comprehensive synonyms mapping
    synonyms = {
        'Republic of Korea': 'Korea, Republic of',
        'Iran (Islamic Republic of)': 'Iran, Islamic Republic of',
        'Turkey': 'Türkiye',
        'China, Hong Kong SAR': 'China',
        'Taiwan': 'Taiwan, Province of China',
        'Bonaire Sint Eustatius and Saba': 'Bonaire, Sint Eustatius and Saba',
        'Cape Verde': 'Cabo Verde',
        "Cote d'Ivoire": "Côte d'Ivoire",
        'Curacao': 'Curaçao',
        'Democratic Republic of Congo': 'Congo, The Democratic Republic of the',
        'East Timor': 'Timor-Leste',
        'Falkland Islands': 'Falkland Islands (Malvinas)',
        'Micronesia (country)': 'Micronesia, Federated States of',
        'Palestine': 'State of Palestine',
        'Reunion': 'Réunion',
        'Russia': 'Russian Federation',
        'Saint Barthelemy': 'Saint Barthélemy',
        'United States Virgin Islands': 'Virgin Islands, U.S.',
        'Vatican': 'Holy See',
        'Brunei': 'Brunei Darussalam'
    }

    # Build subdivision lookup: map lowercase name -> parent alpha_2
    sub_map = {sub.name.lower(): sub.country_code for sub in pycountry.subdivisions}

    def map_to_iso(name):
        orig = name
        name = normalize(str(name))
        # Direct country lookup
        try:
            country = pycountry.countries.lookup(name)
            return country.name
        except LookupError:
            # Subdivision match
            lower = name.lower()
            for sub_name, country_code in sub_map.items():
                if sub_name in lower:
                    parent = pycountry.countries.get(alpha_2=country_code)
                    return parent.name
            # Synonym fallback
            if orig in synonyms:
                return synonyms[orig]
            return orig  # leave unchanged

    # Apply mapping
    df_copy[country_col] = df_copy[country_col].apply(map_to_iso)

    # Identify numeric vs non-numeric columns
    numeric_cols = df_copy.select_dtypes(include='number').columns.tolist()
    other_cols = [c for c in df_copy.columns if c not in numeric_cols + [country_col]]

    # Aggregation rules
    agg_dict = {col: 'sum' for col in numeric_cols}
    agg_dict.update({col: 'first' for col in other_cols})

    # Group by standardized country names and aggregate
    consolidated = df_copy.groupby(country_col, as_index=False).agg(agg_dict)
    return consolidated


cleaned_raw_covid_data = fix_and_consolidate_countries(raw_covid_data, country_col='location')



In [12]:
cleaned_raw_covid_data

Unnamed: 0,location,total_cases_2020-01,total_cases_2020-07,total_cases_2021-01,total_cases_2021-07,total_cases_2022-01,total_cases_2022-07,total_cases_2023-01,total_cases_2023-07,total_cases_2024-01,...,total_deaths_2020-01,total_deaths_2020-07,total_deaths_2021-01,total_deaths_2021-07,total_deaths_2022-01,total_deaths_2022-07,total_deaths_2023-01,total_deaths_2023-07,total_deaths_2024-01,total_deaths_2024-07
0,Afghanistan,0.0,36036.0,55023.0,143871.0,161666.0,185580.0,208420.0,224224.0,231310.0,...,0.0,1246.0,2400.0,6425.0,7407.0,7747.0,7866.0,7935.0,7981.0,7998.0
1,Albania,0.0,4570.0,76350.0,132828.0,255741.0,309278.0,333219.0,334090.0,334863.0,...,0.0,128.0,1358.0,2456.0,3334.0,3538.0,3596.0,3604.0,3605.0,3605.0
2,Algeria,0.0,26764.0,107122.0,160868.0,249310.0,267374.0,271369.0,271852.0,272010.0,...,0.0,1134.0,2888.0,4042.0,6555.0,6876.0,6881.0,6881.0,6881.0,6881.0
3,American Samoa,0.0,0.0,0.0,0.0,18.0,7766.0,8320.0,8341.0,8359.0,...,0.0,0.0,0.0,0.0,0.0,33.0,34.0,34.0,34.0,34.0
4,Andorra,0.0,897.0,9885.0,14498.0,35556.0,45508.0,47839.0,48015.0,48015.0,...,0.0,52.0,101.0,127.0,145.0,153.0,159.0,159.0,159.0,159.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,Wallis and Futuna,0.0,0.0,5.0,453.0,453.0,761.0,3427.0,3550.0,3550.0,...,0.0,0.0,0.0,7.0,7.0,7.0,7.0,8.0,8.0,9.0
228,World,2032.0,15902862.0,102386678.0,193839779.0,375307809.0,575000013.0,752304891.0,768351226.0,774443767.0,...,62.0,708332.0,2381520.0,4204805.0,5715124.0,6430064.0,6781642.0,6956622.0,7029872.0,7056317.0
229,Yemen,0.0,1678.0,2124.0,7008.0,10998.0,11877.0,11945.0,11945.0,11945.0,...,0.0,475.0,616.0,1373.0,2007.0,2151.0,2159.0,2159.0,2159.0,2159.0
230,Zambia,0.0,4328.0,53352.0,191527.0,304656.0,329483.0,339743.0,349287.0,349304.0,...,0.0,140.0,745.0,3250.0,3914.0,4015.0,4041.0,4069.0,4069.0,4077.0


In [13]:
covid_data_df = filter_states_with_removed(cleaned_raw_covid_data, country_col='location')

Removed non-state entries:
  - Asia
  - Europe
  - European Union (27)
  - Kosovo
  - Low-income countries
  - Lower-middle-income countries
  - Oceania
  - Upper-middle-income countries
  - World


## Get V-DEM data ##

https://www.v-dem.net/media/datasets/V-Dem-CY-Core-v15_csv.zip

In [14]:
def get_vdem_scores_by_years(zip_url, years=range(2019, 2026)):
    """
    Downloads the V-Dem core dataset ZIP, extracts the CSV, and returns a DataFrame
    with one row per country, and for each year in `years` two columns:
      - Regimes of the World classification {year}
      - Liberal democracy index        {year}
    """
    # 1. Download and open ZIP
    resp = requests.get(zip_url)
    resp.raise_for_status()
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    # 2. Read the single CSV inside
    csv_name = next(name for name in zf.namelist() if name.endswith('.csv'))
    with zf.open(csv_name) as f:
        df = pd.read_csv(f, low_memory=False)
    
    # 3. Filter to the years we care about
    df = df[df['year'].isin(years)]
    
    # 4. Pivot into wide form
    pivot = (
        df
        .pivot_table(
            index='country_name',
            columns='year',
            values=['v2x_regime', 'v2x_libdem']
        )
    )
    
    # 5. Flatten the MultiIndex columns and rename
    pivot.columns = [
        f"{ 'Regimes of the World classification' if metric=='v2x_regime' else 'Liberal democracy index' }_{year}"
        for metric, year in pivot.columns
    ]
    
    # 6. Re‐index to make country_name a column (optional)
    pivot = pivot.reset_index().rename(columns={'country_name':'Country'})
    
    return pivot

# Usage:
url = "https://www.v-dem.net/media/datasets/V-Dem-CY-Core-v15_csv.zip"
df_vdem = get_vdem_scores_by_years(url)

In [15]:
def fix_iso_countries(df, country_col='Country'):
    """
    Standardizes and consolidates country names in `country_col` to ISO 3166 common names.
    Detects subdivisions, applies manual synonyms (including Burma/Myanmar, Kosovo, The Gambia),
    and merges duplicate rows by summing numeric columns.
    """
    df_copy = df.copy()
    
    # If country names are in the index, reset into a column
    if country_col not in df_copy.columns and df_copy.index.name == country_col:
        df_copy = df_copy.reset_index()
    
    if country_col not in df_copy.columns:
        raise KeyError(f"Column '{country_col}' not found. Available columns: {df_copy.columns.tolist()}")
    
    # Normalize text: straighten apostrophes, NFC accents
    def normalize(name):
        name = name.replace("’", "'").replace("‘", "'")
        return unicodedata.normalize("NFC", name)
    
    # Extended synonyms mapping
    synonyms = {
        'Burma/Myanmar': 'Myanmar',
        'Burma': 'Myanmar',
        'Kosovo': 'Kosovo',  # Kosovo uses unofficial ISO code XKX; keep as-is or exclude if desired
        'The Gambia': 'Gambia',
        # existing mappings...
        'Republic of Korea': 'Korea, Republic of',
        'Iran (Islamic Republic of)': 'Iran, Islamic Republic of',
        'Turkey': 'Türkiye',
        'China, Hong Kong SAR': 'China',
        'Taiwan': 'Taiwan, Province of China',
        'Bonaire, Sint Eustatius and Saba': 'Bonaire, Sint Eustatius and Saba',
        'Cape Verde': 'Cabo Verde',
        "Côte d'Ivoire": "Côte d'Ivoire",
        'Curacao': 'Curaçao',
        'Democratic Republic of Congo': 'Congo, The Democratic Republic of the',
        'East Timor': 'Timor-Leste',
        'Falkland Islands': 'Falkland Islands (Malvinas)',
        'Micronesia (country)': 'Micronesia, Federated States of',
        'Palestine': 'State of Palestine',
        'Reunion': 'Réunion',
        'Russia': 'Russian Federation',
        'Saint Barthelemy': 'Saint Barthélemy',
        'United States Virgin Islands': 'Virgin Islands, U.S.',
        'Vatican': 'Holy See'
    }
    
    # Subdivision lookup (lowercase -> parent country_code)
    sub_map = {sub.name.lower(): sub.country_code for sub in pycountry.subdivisions}
    
    def map_to_iso(name):
        orig = name
        name = normalize(str(name))
        # 1. Direct country lookup
        try:
            country = pycountry.countries.lookup(name)
            return country.name
        except LookupError:
            # 2. Subdivision match
            lower = name.lower()
            for sub_name, code in sub_map.items():
                if sub_name in lower:
                    parent = pycountry.countries.get(alpha_2=code)
                    return parent.name
            # 3. Synonym fallback
            return synonyms.get(orig, orig)
    
    # Apply mapping
    df_copy[country_col] = df_copy[country_col].apply(map_to_iso)
    
    # Consolidate: sum numeric, first for others
    nums = df_copy.select_dtypes(include='number').columns.tolist()
    others = [c for c in df_copy.columns if c not in nums + [country_col]]
    agg = {c: 'sum' for c in nums}
    agg.update({c: 'first' for c in others})
    
    return df_copy.groupby(country_col, as_index=False).agg(agg)


df_vdem_clean = fix_iso_countries(df_vdem, country_col='Country')
print(df_vdem_clean.head())


       Country  Liberal democracy index_2019  Liberal democracy index_2020  \
0  Afghanistan                         0.165                         0.168   
1      Albania                         0.424                         0.440   
2      Algeria                         0.145                         0.146   
3       Angola                         0.162                         0.160   
4    Argentina                         0.626                         0.644   

   Liberal democracy index_2021  Liberal democracy index_2022  \
0                         0.041                         0.012   
1                         0.438                         0.433   
2                         0.132                         0.115   
3                         0.153                         0.154   
4                         0.640                         0.661   

   Liberal democracy index_2023  Liberal democracy index_2024  \
0                         0.016                         0.016   
1         

In [16]:
df_vdem_only_contries = filter_states_with_removed(df_vdem_clean, country_col='Country')

Removed non-state entries:
  - Kosovo


## Get world Bank data ##

https://data360.worldbank.org/en/api

In [17]:
def fetch_data360_indicator(indicator, year, limit=10000):
    """
    Fetch a single indicator-year dataset from Data360 API.
    Returns a DataFrame with columns ['iso3', 'Country', f'{prefix}_{year}'].
    """
    url = "https://data360api.worldbank.org/data360/data"
    params = {
        "DATABASE_ID": "WB_WDI",
        "INDICATOR": indicator,
        "TIME_PERIOD": year,
        "skip": 0,
        "limit": limit
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    payload = resp.json()

    # Extract list of records
    if isinstance(payload, list):
        records = payload
    else:
        records = next((v for v in payload.values() if isinstance(v, list)), [])
    if not records:
        print(f"No data returned for {indicator} in {year}")
        return pd.DataFrame(columns=['iso3', 'Country', f'value_{year}'])

    df = pd.DataFrame(records)

    # Detect ISO3 column
    iso3_col = next(
        (c for c in df.columns if 'ref_area' in c.lower() or ('country' in c.lower() and 'code' in c.lower())),
        None
    )
    if iso3_col is None:
        raise KeyError(f"No country code column in {df.columns.tolist()}")

    # Detect value column
    val_col = next(
        (c for c in df.columns if 'obs_value' in c.lower() or c.lower()=='value' or ('obs' in c.lower() and 'value' in c.lower())),
        None
    )
    if val_col is None:
        raise KeyError(f"No value column in {df.columns.tolist()}")

    # Detect name/label column
    name_col = next((c for c in df.columns if 'ref_area' in c.lower() and 'name' in c.lower()), None)
    if name_col is None:
        name_col = next((c for c in df.columns if 'country' in c.lower() and 'name' in c.lower()), None)

    prefixes = {
        "WB_WDI_NY_GDP_PCAP_CD": "gdp_per_capita",
        "WB_WDI_SP_POP_TOTL":    "population",
        "WB_WDI_AG_LND_TOTL_K2": "land_area_sq_km"
    }
    prefix = prefixes.get(indicator, indicator)
    metric_col = f"{prefix}_{year}"

    if name_col:
        df2 = df[[iso3_col, name_col, val_col]].rename(
            columns={iso3_col:'iso3', name_col:'Country', val_col:metric_col}
        )
    else:
        df2 = df[[iso3_col, val_col]].rename(
            columns={iso3_col:'iso3', val_col:metric_col}
        )

    return df2

def load_combined_data(gdp_years=range(2019, 2025), pop_year=2020, land_year=2020):
    """
    Retrieves and merges GDP per capita (2019-2024), population (2020), and land area (2020).
    Ensures a single Country column with human-readable names.
    """
    # 1. First GDP year (pulls in Country if available)
    first_year = next(iter(gdp_years))
    df = fetch_data360_indicator("WB_WDI_NY_GDP_PCAP_CD", first_year)

    # 2. Merge remaining GDP years
    for yr in list(gdp_years)[1:]:
        tmp = fetch_data360_indicator("WB_WDI_NY_GDP_PCAP_CD", yr)
        tmp = tmp.drop(columns=['Country'], errors='ignore')   # <-- use errors='ignore'
        df = df.merge(tmp, on='iso3', how='outer')

    # 3. Population
    pop_df = fetch_data360_indicator("WB_WDI_SP_POP_TOTL", pop_year)
    pop_df = pop_df.drop(columns=['Country'], errors='ignore')
    df = df.merge(pop_df, on='iso3', how='outer')

    # 4. Land area
    land_df = fetch_data360_indicator("WB_WDI_AG_LND_TOTL_K2", land_year)
    land_df = land_df.drop(columns=['Country'], errors='ignore')
    df = df.merge(land_df, on='iso3', how='outer')

    # 5. Fill any missing Country via pycountry
    df['iso3_clean'] = df['iso3'].astype(str).str.strip().str.upper()
    df['Country'] = df.apply(
        lambda row: row['Country']
                    if pd.notna(row.get('Country'))
                    else (pycountry.countries.get(alpha_3=row['iso3_clean']).name
                          if pycountry.countries.get(alpha_3=row['iso3_clean'])
                          else row['iso3_clean']),
        axis=1
    )
    df.drop(columns=['iso3_clean'], inplace=True)

    # 6. (Optional) remove aggregates/non‑standard codes
    valid_codes = {c.alpha_3 for c in pycountry.countries}
    df = df[df['iso3'].isin(valid_codes)]

    # 7. Reorder columns
    cols = ['iso3', 'Country'] + [c for c in df.columns if c not in ('iso3','Country')]
    return df[cols]

world_bank_data = load_combined_data()
print(world_bank_data.head())


  iso3      Country gdp_per_capita_2019 gdp_per_capita_2020  \
0  ABW        Aruba             31096.2             22855.9   
2  AFG  Afghanistan             496.603             510.787   
4  AGO       Angola             2189.86             1449.92   
5  ALB      Albania             5460.43             5370.78   
6  AND      Andorra             41257.8             37361.1   

  gdp_per_capita_2021 gdp_per_capita_2022 gdp_per_capita_2023  \
0             27200.1             30559.5             33984.8   
2             356.496             357.261             413.758   
4             1925.87             2929.69             2309.53   
5             6413.28             6846.43             8575.17   
6             42425.7             42414.1             46812.4   

  gdp_per_capita_2024 population_2020 land_area_sq_km_2020  
0                 NaN          108587                  180  
2                 NaN     3.9069e+007               652230  
4             2122.08    3.34511e+007          

## Get Elcano Global Presence Index  data ##

https://www.globalpresence.realinstitutoelcano.org/en/swagger

In [18]:
def fetch_elcano_year_csv(year):
    """
    Fetches the Elcano Global Presence CSV for a given year and returns a DataFrame
    with columns suffixed by the year (except COUNTRY and REGION).
    """
    url = (
        "https://elcano-backend-route-elcano."
        "ocp-elcano-428ad9c1ea2be222aeb3a70cce56e8e1-0000.eu-de."
        "containers.appdomain.cloud/api/generalAPI"
    )
    headers = {
        "Accept": "*/*",
        "Content-Type": "application/json",
        "Origin": "https://www.globalpresence.realinstitutoelcano.org",
        "Referer": "https://www.globalpresence.realinstitutoelcano.org/"
    }
    payload = {"codesISO": [], "years": [year]}

    resp = requests.post(url, headers=headers, json=payload)
    resp.raise_for_status()
    
    # Read response text into DataFrame
    df = pd.read_csv(StringIO(resp.text))
    
    # Drop the YEAR column (redundant)
    if 'YEAR' in df.columns:
        df = df.drop(columns=['YEAR'])
    
    # Suffix metric columns with the year
    suffix = f"_{year}"
    df = df.set_index(['COUNTRY', 'REGION'])
    df = df.rename(columns={col: f"{col}{suffix}" for col in df.columns})
    df = df.reset_index()
    
    return df

def load_elcano_2019_2024():
    """
    Loads and merges Elcano Global Presence Index data from 2019 through 2024
    into a single DataFrame with one row per country.
    """
    years = range(2019, 2025)
    merged_df = None

    for year in years:
        df_year = fetch_elcano_year_csv(year)
        merged_df = df_year if merged_df is None else merged_df.merge(
            df_year, on=['COUNTRY', 'REGION'], how='outer'
        )
    
    return merged_df


df_all = load_elcano_2019_2024()
print(df_all.head())


       COUNTRY REGION  ENERGY_2019  PRIMARY_GOODS_2019  MANUFACTURES_2019  \
0  Afghanistan    NaN     0.015945            0.099791           0.008567   
1      Albania    NaN     0.047098            0.068180           0.314348   
2      Algeria    NaN     8.442406            4.704664           0.512515   
3       Angola    NaN     6.820430            4.855170           0.053621   
4    Argentina    NaN     0.553576            4.629735           2.182548   

   SERVICES_2019  INVESTMENTS_2019  TROOPS_2019  MILITARY_EQUIPMENT_2019  \
0       0.286874          0.009539     0.000000                 0.969775   
1       1.433636          0.075642     0.448568                 0.000000   
2       1.247113          0.366926     0.006705                28.127822   
3       0.230657          0.815540     0.000000                 2.424436   
4       5.959289          5.673580     0.559388                15.903342   

   MIGRATIONS_2019  ...  DEV_COOPERATION_CONT_2024  CLIMATE_CONT_2024  \
0      

In [19]:
def clean_elcano_countries(df, country_col='COUNTRY', region_col='REGION'):
    """
    Maps known synonyms, filters to ISO countries, drops the region column,
    and reports any dropped entries.
    """
    df_copy = df.copy()
    
    # Manual synonyms mapping
    synonyms = {
        'Bolivia': 'Bolivia, Plurinational State of',
        'Brunei': 'Brunei Darussalam',
        'Congo, DR': 'Congo, The Democratic Republic of the',
        'Iran': 'Iran, Islamic Republic of',
        'Laos': "Lao People's Democratic Republic",
        'Moldova': 'Moldova, Republic of',
        'Russia': 'Russian Federation',
        'South Korea': 'Korea, Republic of',
        'Syria': 'Syrian Arab Republic',
        'Tanzania': 'Tanzania, United Republic of',
        'Turkey': 'Türkiye',
        'Venezuela': 'Venezuela, Bolivarian Republic of',
        'Vietnam': 'Viet Nam'
    }
    # Apply synonyms
    df_copy[country_col] = df_copy[country_col].replace(synonyms)
    
    # Build valid ISO names set
    valid_common = {c.name for c in pycountry.countries}
    valid_official = {getattr(c, 'official_name', c.name) for c in pycountry.countries}
    valid_names = valid_common.union(valid_official)
    
    # Filter mask
    mask = df_copy[country_col].isin(valid_names)
    
    # Report dropped entries
    dropped = df_copy.loc[~mask, country_col].dropna().unique().tolist()
    if dropped:
        print("Dropped non-country entries:", dropped)
    else:
        print("All entries are valid countries after mapping.")
    
    # Filter and drop REGION column
    result = df_copy.loc[mask].drop(columns=[region_col]).reset_index(drop=True)
    return result

df_clean = clean_elcano_countries(df_all)
print(df_clean.head())


Dropped non-country entries: ['European Union']
       COUNTRY  ENERGY_2019  PRIMARY_GOODS_2019  MANUFACTURES_2019  \
0  Afghanistan     0.015945            0.099791           0.008567   
1      Albania     0.047098            0.068180           0.314348   
2      Algeria     8.442406            4.704664           0.512515   
3       Angola     6.820430            4.855170           0.053621   
4    Argentina     0.553576            4.629735           2.182548   

   SERVICES_2019  INVESTMENTS_2019  TROOPS_2019  MILITARY_EQUIPMENT_2019  \
0       0.286874          0.009539     0.000000                 0.969775   
1       1.433636          0.075642     0.448568                 0.000000   
2       1.247113          0.366926     0.006705                28.127822   
3       0.230657          0.815540     0.000000                 2.424436   
4       5.959289          5.673580     0.559388                15.903342   

   MIGRATIONS_2019  TOURISM_2019  ...  DEV_COOPERATION_CONT_2024  \
0     

In [20]:
elcano_df = df_clean
elcano_df

Unnamed: 0,COUNTRY,ENERGY_2019,PRIMARY_GOODS_2019,MANUFACTURES_2019,SERVICES_2019,INVESTMENTS_2019,TROOPS_2019,MILITARY_EQUIPMENT_2019,MIGRATIONS_2019,TOURISM_2019,...,DEV_COOPERATION_CONT_2024,CLIMATE_CONT_2024,ECONOMIC_CONT_2024,MILITARY_CONT_2024,SOFT_CONT_2024,ECONOMIC_SHARE_2024,MILITARY_SHARE_2024,SOFT_SHARE_2024,GLOBAL_SHARE_2024,GOBAL_PERCENTILE_2024
0,Afghanistan,0.015945,0.099791,0.008567,0.286874,0.009539,0.000000,0.969775,0.193565,0.000000,...,0.000000,0.134592,0.273698,0.000000,0.726301,0.000046,0.000000,0.000267,0.000092,0.026667
1,Albania,0.047098,0.068180,0.314348,1.433636,0.075642,0.448568,0.000000,0.063783,0.946100,...,0.000000,0.040548,0.479370,0.053277,0.467351,0.000330,0.000099,0.000700,0.000376,0.313333
2,Algeria,8.442406,4.704664,0.512515,1.247113,0.366926,0.006705,28.127822,0.314995,0.438859,...,0.000000,0.034594,0.363747,0.532686,0.103565,0.001720,0.006799,0.001066,0.002585,0.673333
3,Angola,6.820430,4.855170,0.053621,0.230657,0.815540,0.000000,2.424436,0.854808,0.036007,...,0.000000,0.068891,0.669548,0.173371,0.157079,0.001026,0.000717,0.000524,0.000837,0.473333
4,Argentina,0.553576,4.629735,2.182548,5.959289,5.673580,0.559388,15.903342,2.984973,1.146452,...,0.006304,0.068310,0.343527,0.169565,0.486907,0.001855,0.002471,0.005721,0.002950,0.693333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,"Venezuela, Bolivarian Republic of",6.142827,3.481291,0.985472,0.186545,3.349223,0.000000,10.913437,1.746935,0.066399,...,0.000000,0.147482,0.303723,0.334703,0.361573,0.000587,0.001746,0.001521,0.001056,0.540000
146,Viet Nam,0.858603,4.498912,39.236671,5.789978,1.433308,0.017894,4.416165,0.066829,2.559655,...,0.000000,0.079811,0.778797,0.007027,0.214174,0.006451,0.000157,0.003861,0.004526,0.773333
147,Yemen,0.061052,0.061836,0.009995,0.102181,0.089884,0.000000,0.000000,0.499105,0.071740,...,0.000000,0.051053,0.069330,0.000000,0.930669,0.000023,0.000000,0.000672,0.000181,0.140000
148,Zambia,0.021834,0.951853,0.212296,0.371746,0.234988,2.249612,0.000000,0.237497,0.177063,...,0.000000,0.138822,0.263897,0.430860,0.305241,0.000157,0.000690,0.000394,0.000324,0.266667


## Get The Global Soft Power Index (GSPI) / Brand Finance’s Global Soft Power Index data ##

https://brandirectory.com/softpower, Dashboard: https://brandirectory.com/softpower/ranking?region=1&metric=1&fromRegion=1 , API: https://admin.brandirectory.com/api/gsp/waves

https://static.brandirectory.com/reports/brand-finance-global-soft-power-index-2020.pdf <br>
https://static.brandirectory.com/reports/brand-finance-global-soft-power-index-2021.pdf <br>
https://static.brandirectory.com/reports/brand-finance-soft-power-index-2022.pdf <br>
https://static.brandirectory.com/reports/brand-finance-soft-power-index-2023-digital.pdf <br>
https://static.brandirectory.com/reports/brand-finance-soft-power-index-2024-digital.pdf <br>
https://static.brandirectory.com/reports/brand-finance-soft-power-index-2025-digital.pdf

### 2020 ###

In [21]:
dfs2020 = tabula.read_pdf("https://static.brandirectory.com/reports/brand-finance-global-soft-power-index-2020.pdf", pages='55-56', stream=True, multiple_tables=True)


In [22]:
def clean_soft_power_table(df: pd.DataFrame,
                           nation_col_index: int = 2,
                           new_cols: list[str] | None = None) -> pd.DataFrame:
    """
    Clean a soft power table DataFrame extracted via tabula.

    Steps:
    1. Reinserts the current header row as the first data row.
    2. Drops rows where the nation column is null.
    3. Truncates to len(new_cols) columns and renames them if new_cols is provided.
    4. Strips whitespace from Nation.
    5. Moves original Influence into Reputation.
    6. Cleans Familiarity of any letters, then splits it into Familiarity + Influence.
    7. Cleans Medals - Specialist Audiences to keep only digits and hyphens.
    """
    # 1) Reinstate header as a row
    header = pd.DataFrame([df.columns.tolist()], columns=df.columns)
    df = pd.concat([header, df], ignore_index=True)

    # 2) Drop rows missing a nation
    df = df[df.iloc[:, nation_col_index].notna()].reset_index(drop=True)

    # 3) Truncate and rename
    if new_cols:
        df = df.iloc[:, :len(new_cols)].copy()
        df.columns = new_cols

    # 4) Strip any extra space around Nation
    df['Nation'] = df['Nation'].astype(str).str.strip()

    # 5) Move the existing Influence values into Reputation
    df['Reputation'] = df['Influence']

    # 6a) Remove all letters from Familiarity
    df['Familiarity'] = (
        df['Familiarity']
        .astype(str)
        .str.replace('[A-Za-z]', '', regex=True)
        .str.strip()
    )

    # 6b) Split that cleaned Familiarity into two parts: first→Familiarity, second→Influence
    fam_split = df['Familiarity'].str.split(n=1, expand=True)
    df['Familiarity'] = fam_split[0]
    df['Influence']   = fam_split[1]

    # 7) Clean Medals - Specialist Audiences to keep only digits and dashes
    df['Medals - Specialist Audiences'] = (
        df['Medals - Specialist Audiences']
        .astype(str)
        .str.replace('[^0-9-]', '', regex=True)
    )

    return df

# Define the desired column names
target_cols = [
    'Rank', 'Junk1', 'Nation', 'Region', 'Index Score', 'Familiarity', 'Influence',
    'Reputation', 'Junk2','Junk3','Junk4','Business & Trade', 'Governance', 'International Relations',
    'Culture & Heritage', 'Media & Communication', 'Education & Science',
    'People & Values', 'Medals - General Public', 'Medals - Specialist Audiences'
]


cleaned2020 = [
    clean_soft_power_table(dfs2020 [i], nation_col_index=2, new_cols=target_cols)
    for i in (0, 1)
]

# 1. Combine into a single DataFrame
combined2020 = pd.concat(cleaned2020, ignore_index=True)

# 2. Drop any column whose name contains "junk" (case-insensitive)
junk_cols = [col for col in combined2020.columns if 'junk' in col.lower()]
combined2020 = combined2020.drop(columns=junk_cols)

# 3. Convert Rank column to integer
combined2020['Rank'] = pd.to_numeric(combined2020['Rank'], errors='raise').astype(int)

# 4. fix dash
combined2020['Medals - Specialist Audiences'] = (
combined2020['Medals - Specialist Audiences']
.replace('-1', '-')
)
    

In [23]:
combined2020

Unnamed: 0,Rank,Nation,Region,Index Score,Familiarity,Influence,Reputation,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Medals - General Public,Medals - Specialist Audiences
0,1,United States,North America,67.1,8.6,7.7,7.1,6.6,4.3,6.2,5.5,5.2,6.6.1,3.9,10,2
1,2,Germany,Europe,61.9,7.9,6.0,7.4,6.8,5.5,6.4,5.4,5.0,5.9,4.2,11,1
2,3,United Kingdom,Europe,61.8,8.2,6.3,7.3,5.8,4.7,6.3,6.0,5.1,5.5,4.5,5,3
3,4,Japan,Asia,60.2,7.6,5.8,7.5,6.9,5.4,5.1,5.7,4.1,6.5,4.7,8,1
4,5,China,Asia,58.7,7.6,7.1,6.5,6.6,3.4,5.4,5.2,2.9,5.1,2.8,2,1
5,6,France,Europe,58.5,8.1,5.8,7.1,5.8,4.4,5.8,6.4,4.7,4.2,4.3,4,2
6,7,Canada,North America,54.5,7.4,5.0,7.5,5.7,5.2,4.9,4.7,4.6,4.3,5.1,13,-
7,8,Switzerland,Europe,54.5,6.9,4.7,7.6,6.7,5.6,5.0,4.8,4.5,4.8,4.8,10,2
8,9,Sweden,Europe,51.9,6.4,4.2,7.4,6.2,5.3,4.6,5.1,4.3,5.0,5.0,11,1
9,10,Russia,Europe,51.0,7.2,5.7,6.3,4.0,3.1,5.6,4.9,3.0,4.1,3.5,2,-


### 2021 ###

In [24]:
dfs2021 = tabula.read_pdf("https://static.brandirectory.com/reports/brand-finance-global-soft-power-index-2021.pdf", pages='57', stream=True, multiple_tables=True)

In [25]:
def pdf_url_to_docx_in_memory(pdf_url: str,
                              start_page: int = 0,
                              end_page: int = None) -> Document:
    """
    Fetch a PDF from a URL and convert pages [start_page…end_page] into
    a python-docx Document, without permanently saving anything.
    """
    # 1. Download PDF bytes
    r = requests.get(pdf_url)
    r.raise_for_status()
    pdf_bytes = r.content

    # 2. Write to a temp .pdf file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f_pdf:
        f_pdf.write(pdf_bytes)
        pdf_path = f_pdf.name

    # 3. Prepare a temp .docx path
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f_docx:
        docx_path = f_docx.name

    # 4. Convert PDF → DOCX
    cv = Converter(pdf_path)
    cv.convert(docx_path, start=start_page, end=end_page)
    cv.close()

    # 5. Load into python-docx
    doc = Document(docx_path)

    # 6. Clean up temp files
    os.remove(pdf_path)
    os.remove(docx_path)

    return doc

# Usage:
url = "https://static.brandirectory.com/reports/brand-finance-global-soft-power-index-2021.pdf"
# zero-based pages 56–60 → PDF pages 57–61
doc = pdf_url_to_docx_in_memory(url, start_page=56, end_page=60)

# Now you have a `doc` object you can inspect:
for i, table in enumerate(doc.tables, 1):
    print(f"Table {i}: {len(table.rows)} rows")


[INFO] Start to convert C:\Users\igrom\AppData\Local\Temp\tmp5_rg1hzc.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m
[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/4) Page 57
[INFO] (2/4) Page 58
[INFO] (3/4) Page 59
[INFO] (4/4) Page 60
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/4) Page 57
[INFO] (2/4) Page 58
[INFO] (3/4) Page 59
[INFO] (4/4) Page 60
[INFO] Terminated in 222.49s.


Table 1: 3 rows
Table 2: 50 rows
Table 3: 50 rows
Table 4: 5 rows
Table 5: 34 rows
Table 6: 33 rows
Table 7: 18 rows


In [26]:

all_tables = doc.tables

start, end = 1, 3

dfs = []
for tbl in all_tables[start:end]:
    data = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
    df = pd.DataFrame(data)
    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True)

dfs1 = []
start1, end1 = 3, 4

for tbl in all_tables[start1:end1]:
    data = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
    df1 = pd.DataFrame(data)
    dfs1.append(df1)

combined1 = pd.concat(dfs1, ignore_index=True)

dfs2 = []
start2, end2 = 4, 7

for tbl in all_tables[start2:end2]:
    data = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
    df2 = pd.DataFrame(data)
    dfs2.append(df2)

combined2 = pd.concat(dfs2, ignore_index=True)
# combined.to_csv("combined.csv")



In [27]:
df_side_by_side = pd.concat([dfs[0].reset_index(drop=True),
                             dfs[1].reset_index(drop=True)],
                            axis=1)

In [28]:
def tidy_soft_power_2021_0(df: pd.DataFrame) -> pd.DataFrame:
    """
    1. Rename all columns to the user’s master list (including Junk1, Junk2, etc.)
    2. Drop duplicate Nation rows (keeping the first occurrence)
    3. Drop any column whose name contains 'junk' (case‐insensitive)
    """
    df = df.copy()

    # 1. apply your full header roster
    df.columns = [
        'Rank 2021',    'Rank 2020',    'Junk1',   'Junk2',
        'Nation',       'Region',       'Index Score 2021',
        'Index Score Change', 'Index Score 2020',
        'Junk3',        'Familiarity',  'Reputation',
        'Influence',    'Business & Trade', 'Governance',
        'International Relations', 'Culture & Heritage',
        'Media & Communication', 'Education & Science',
        'People & Values', 'COVID-19 Response', 'Medals'
    ]

    # 2. drop duplicate Nation rows
    df = df.drop_duplicates(subset=['Nation'], keep='first').reset_index(drop=True)

    # 3. drop any column with 'junk' in its name
    good_cols = [c for c in df.columns if 'junk' not in c.lower()]
    df = df[good_cols]

    return df

cleaned_2021_0 = tidy_soft_power_2021_0(df_side_by_side)

In [29]:
def tidy_soft_power_2021_1(df: pd.DataFrame) -> pd.DataFrame:
    """
    1. Rename all columns to the user’s master list (including Junk1, Junk2, etc.)
    2. Drop duplicate Nation rows (keeping the first occurrence)
    3. Drop any column whose name contains 'junk' (case‐insensitive)
    """
    df = df.copy()

    # 1. apply your full header roster
    df.columns = [
        'Junk0', 'Rank 2021',  'Rank 2020',  'Junk1',   'Junk2',  
        'Nation',       'Region',       'Index Score 2021',
        'Index Score Change', 'Index Score 2020',
        'Junk3', 'Junk4',       'Familiarity',  'Reputation',
        'Influence',    'Business & Trade', 'Governance',
        'International Relations', 'Culture & Heritage',
        'Media & Communication', 'Education & Science',
        'People & Values', 'COVID-19 Response', 'Medals',   'Junk5'
    ]

    # 2. drop duplicate Nation rows
    df = df.drop_duplicates(subset=['Nation'], keep='first').reset_index(drop=True)

    # 3. drop any column with 'junk' in its name
    good_cols = [c for c in df.columns if 'junk' not in c.lower()]
    df = df[good_cols]

    return df

cleaned_2021_1 = tidy_soft_power_2021_1(combined1)

In [30]:
def tidy_soft_power_2021_2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rename columns (including Junk placeholders), drop all 'Junk' columns,
    and remove the last row.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw DataFrame with 25 columns including Junk placeholders.

    Returns
    -------
    pandas.DataFrame
        The cleaned DataFrame with only the meaningful columns and
        without the footer row.
    """
    # 1. Rename all columns (including the “Junk” placeholders)
    df = df.copy()
    df.columns = [
        'Rank 2021', 'Rank 2020', 'Junk1', 'Junk2', 'Junk3',
        'Nation', 'Region', 'Index Score 2021', 'Index Score Change',
        'Index Score 2020', 'Junk4', 'Junk5', 'Familiarity', 'Reputation',
        'Influence', 'Business & Trade', 'Governance',
        'International Relations', 'Culture & Heritage',
        'Media & Communication', 'Education & Science',
        'People & Values', 'COVID-19 Response', 'Medals', 'Junk6'
    ]

    # 2. Drop any column whose name contains “junk” (case-insensitive)
    df = df.loc[:, ~df.columns.str.lower().str.contains('junk')]

    # 3. Remove the very last row and the first two
    df = df.iloc[2:]
    df = df.iloc[:-1]

    # 4. Reset the index
    df = df.reset_index(drop=True)

    return df

cleaned_2021_2 = tidy_soft_power_2021_2(combined2)


In [31]:
combined_df = pd.concat([cleaned_2021_0, cleaned_2021_1, cleaned_2021_2], ignore_index=True)

In [32]:
combined_df = combined_df[pd.to_numeric(combined_df['Rank 2021'], errors='coerce').notna()].reset_index(drop=True)


In [33]:
combined2021 = combined_df[~combined_df['Nation'].str.contains(r'\d', regex=True, na=False)].reset_index(drop=True)


In [34]:
# Strip any extra space around Nation
combined2021['Nation'] = combined2021['Nation'].astype(str).str.strip()

In [35]:
combined2021

Unnamed: 0,Rank 2021,Rank 2020,Nation,Region,Index Score 2021,Index Score Change,Index Score 2020,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
0,1,2,Germany,Europe,62.2,+0.3,61.9,7.8,7.5,6.0,6.8,5.8,6.7,5.2,4.9,6.6,4.2,5.7,16
1,2,4,Japan,Asia,60.6,+0.4,60.2,7.5,7.3,5.6,7.0,5.3,5.7,5.2,4.1,7.5,4.5,5.8,12
2,3,3,United Kingdom,Europe,57.9,-3.9,61.8,8.0,7.3,6.1,6.0,5.3,6.4,5.7,5.4,5.7,4.3,3.7,15
3,4,7,Canada,North America,57.2,+2.7,54.5,7.4,7.5,5.1,6.2,5.8,5.7,4.7,4.6,5.0,5.2,5.6,16
4,5,8,Switzerland,Europe,56.3,+1.8,54.5,6.8,7.6,4.8,6.2,5.9,5.7,4.6,4.5,5.1,4.7,5.8,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,101,-,Guatemala,Latin America & Caribbean,26.1,-,-,3.1,4.9,2.7,1.9,1.6,1.6,1.8,1.7,1.7,2.3,2.3,-
101,102,-,Honduras,Latin America & Caribbean,26.0,-,-,2.9,4.9,2.7,1.9,1.5,1.5,1.8,1.8,1.6,2.3,2.5,-
102,103,-,Trinidad and Tobago,Latin America & Caribbean,25.6,-,-,2.5,5.0,2.6,1.6,1.3,1.4,1.8,1.2,1.3,2.2,2.9,-
103,104,-,Mozambique,Sub-Saharan Africa,25.4,-,-,2.7,4.8,2.6,2.1,1.9,1.7,1.9,1.5,1.6,2.3,2.2,-


### 2022 ###

In [36]:
dfs2022 = tabula.read_pdf("https://static.brandirectory.com/reports/brand-finance-soft-power-index-2022.pdf", pages='60-63', stream=True, multiple_tables=True)

In [37]:
# define the PDF, page and area coordinates
pdf_path = "https://static.brandirectory.com/reports/brand-finance-soft-power-index-2022.pdf"
page_number = "60-63"


dfs2022 = []
area = [100,  25, 900, 1500]  
for page in range(60, 64):      # pages 60...63
    tables = tabula.read_pdf(
        pdf_path,
        pages=page,
        area=area,
        guess=False,
        stream=True,
        multiple_tables=False
    )
    # always one table in list
    dfs2022.append(tables[0])

In [38]:
dfs2022[0].iloc[2]

Unnamed: 0               NaN
Unnamed: 1               1 6
Unnamed: 2                 2
Unnamed: 3     United States
Unnamed: 4     North America
Unnamed: 5              70.7
Unnamed: 6              14.8
Unnamed: 7               NaN
Unnamed: 8              55.9
Unnamed: 9                 >
Rank 2021      9.2  7.1  7.7
Unnamed: 11              7.5
Unnamed: 12              4.8
Unnamed: 13              7.1
Unnamed: 14              5.9
Unnamed: 15              6.8
Unnamed: 16              8.6
Unnamed: 17              3.9
Unnamed: 18              4.8
Unnamed: 19               14
Unnamed: 20              NaN
Unnamed: 21              NaN
Name: 2, dtype: object

In [39]:
def clean_df_2022_page_0(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and restructures the DataFrame by performing the following steps:

    1. Drops the columns 'Unnamed: 0', 'Unnamed: 2', 'Unnamed: 7', 'Unnamed: 9', 'Unnamed: 20', 'Unnamed: 21'.
    2. Removes any rows where 'Unnamed: 4' is NaN.
    3. Removes '2022' from 'Unnamed: 1', strips out all but digits, dots, and whitespace,
       then splits on the first whitespace into two parts. Each part is reduced to its first
       numeric token and inserted in place of 'Unnamed: 1' as 'U1_part1' and 'U1_part2'.
    4. Removes '2022' from 'Rank 2021', strips out all but digits, dots, and whitespace,
       then splits into three parts and inserts them as 'R21_part1', 'R21_part2', 'R21_part3'
       in place of 'Rank 2021'.
    5. Renames all columns to the supplied header list.
    6. Strips leading/trailing spaces from 'Nation Brand' and removes any special characters.
    7. Returns the transformed DataFrame.
    """
    df = df.copy()

    # 1. Drop unwanted columns
    to_drop = [
        'Unnamed: 0', 'Unnamed: 2', 'Unnamed: 7',
        'Unnamed: 9', 'Unnamed: 20', 'Unnamed: 21'
    ]
    df.drop(columns=to_drop, errors='ignore', inplace=True)

    # 2. Remove rows where Unnamed: 4 is NaN
    df = df[df['Unnamed: 4'].notna()]

    # 3. Clean & split Unnamed: 1
    u1_idx = df.columns.get_loc('Unnamed: 1')
    u1_series = (
        df['Unnamed: 1']
        .astype(str)
        .str.replace('2022', '', regex=False)
        .str.replace(r'[^0-9\.\s]', '', regex=True)
    )
    raw_u1 = u1_series.str.strip().str.split(r'\s+', n=1, expand=True)
    first_num = r'(\d+(?:\.\d+)?)'
    part1 = raw_u1[0].str.extract(first_num)[0]
    part2 = raw_u1[1].str.extract(first_num)[0]
    df.drop(columns=['Unnamed: 1'], inplace=True)
    df.insert(u1_idx, 'U1_part2', part2)
    df.insert(u1_idx, 'U1_part1', part1)

    # 4. Clean & split Rank 2021
    r_idx = df.columns.get_loc('Rank 2021')
    r_series = (
        df['Rank 2021']
        .astype(str)
        .str.replace('2022', '', regex=False)
        .str.replace(r'[^0-9\.\s]', '', regex=True)
    )
    raw_r = r_series.str.strip().str.split(r'\s+', n=2, expand=True)
    r_part1 = raw_r[0].str.extract(first_num)[0]
    r_part2 = raw_r[1].str.extract(first_num)[0]
    r_part3 = raw_r[2].str.extract(first_num)[0]
    df.drop(columns=['Rank 2021'], inplace=True)
    df.insert(r_idx, 'R21_part3', r_part3)
    df.insert(r_idx, 'R21_part2', r_part2)
    df.insert(r_idx, 'R21_part1', r_part1)

    # 5. Rename all columns to the correct headers
    correct_headers = [
        'Rank 2022', 'Rank 2021', 'Nation Brand', 'Region',
        'Index Score 2022', 'Index Score Change', 'Index Score 2021',
        'Familiarity', 'Reputation', 'Influence', 'Business & Trade',
        'Governance', 'International Relations', 'Culture & Heritage',
        'Media & Communication', 'Education & Science',
        'People & Values', 'COVID-19 Response', 'Medals'
    ]
    if len(df.columns) != len(correct_headers):
        raise ValueError(f"Column count mismatch: expected {len(correct_headers)}, got {len(df.columns)}")
    df.columns = correct_headers

    # 6. Clean Nation Brand: strip whitespace and remove special characters
    df['Nation Brand'] = (
        df['Nation Brand']
        .astype(str)
        .str.strip()
        .str.replace(r'[^A-Za-z0-9\s]', '', regex=True)
    )

    return df

In [40]:
df_2022_page_0 = clean_df_2022_page_0(dfs2022[0])
df_2022_page_0

Unnamed: 0,Rank 2022,Rank 2021,Nation Brand,Region,Index Score 2022,Index Score Change,Index Score 2021,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
2,1,6,United States,North America,70.7,14.8,55.9,9.2,7.1,7.7,7.5,4.8,7.1,5.9,6.8,8.6,3.9,4.8,14
3,2,3,United Kingdom,Europe,64.9,7.0,57.9,8.7,7.3,6.5,6.4,5.3,6.5,5.5,6.2,6.7,4.0,5.5,10
4,3,1,Germany,Europe,64.6,2.4,62.2,8.2,7.4,6.0,7.1,5.7,6.8,5.1,5.4,7.5,4.2,6.5,12
5,4,8,China,Asia,64.2,9.9,54.3,8.4,6.4,7.3,7.7,3.5,5.6,4.6,4.4,8.0,3.1,5.4,7
6,5,2,Japan,Asia,63.5,2.9,60.6,8.2,7.3,5.9,7.5,5.1,5.8,5.2,4.7,8.6,4.4,6.2,8
7,6,7,France,Europe,60.6,5.3,55.4,8.5,7.1,5.9,6.3,4.4,6.1,5.9,5.4,5.5,4.1,5.5,6
8,7,4,Canada,North America,59.5,2.3,57.2,8.0,7.4,5.3,6.2,5.4,6.0,4.4,5.2,5.7,4.8,6.4,17
9,8,5,Switzerland,Europe,56.6,0.3,56.3,7.3,7.4,4.7,6.4,5.5,5.7,4.3,4.8,5.5,4.5,6.5,14
10,9,13,Russia,Europe,56.1,5.5,50.5,7.8,6.4,5.9,5.0,3.8,5.6,4.2,4.3,6.1,3.2,5.3,1
11,10,19,Italy,Europe,54.7,6.4,48.3,8.3,6.9,5.2,5.8,3.6,4.8,5.9,4.6,4.4,4.4,4.6,5


In [41]:
def clean_df_2022_page_1(df: pd.DataFrame) -> pd.DataFrame:
    """
    1. Drops columns 'Unnamed: 0', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 9',
       'Unnamed: 14', 'Unnamed: 17', 'Unnamed: 20', 'Unnamed: 21'.
    2. Filters out rows where 'Unnamed: 5' is NaN.
    3. For each of ['Unnamed: 1', 'Rank 2021', 'Unnamed: 12', 'Unnamed: 15', 'Unnamed: 18']:
       a. Removes '2022' and any non-digit/dot/space characters.
       b. Splits on first whitespace into two parts.
       c. Extracts the first numeric token from each part (or preserves a single dash),
          discarding extras and printing what was dropped.
       d. Reports missing splits or failed extractions.
       e. Replaces the original column with '<col>_1' and '<col>_2' in-place.
    4. After splitting:
       – Ensures any missing second part of 'Unnamed: 1' becomes '-'.
       – Hard‑codes 'Rank 2021_1'/'Rank 2021_2' to '5.0' and '5.8' only for Hungary.
       – Hard‑codes 'Unnamed: 1_1'/'Unnamed: 1_2' to '108' and '90' only for Myanmar.
       – In 'Unnamed: 8', replaces "New" with '-'.
    5. Renames all columns to the final header list.
    """
    df = df.copy()

    # 1. Drop unwanted columns
    df.drop(columns=[
        'Unnamed: 0','Unnamed: 2','Unnamed: 3','Unnamed: 9',
        'Unnamed: 14','Unnamed: 17','Unnamed: 20','Unnamed: 21'
    ], errors='ignore', inplace=True)

    # 2. Remove rows where Unnamed: 5 is NaN
    df = df[df['Unnamed: 5'].notna()]

    # 3. Clean & split specified columns
    cols = ['Unnamed: 1','Rank 2021','Unnamed: 12','Unnamed: 15','Unnamed: 18']
    num_re = re.compile(r'(\d+(?:\.\d+)?)')

    for col in cols:
        pos = df.columns.get_loc(col)
        s = (df[col].astype(str)
               .str.replace('2022','',regex=False)
               .str.replace(r'[^0-9\.\s\-]','',regex=True)
            )
        raw = s.fillna('').str.strip().str.split(r'\s+', n=1, expand=True)

        # report missing second part
        missing = raw[1].eq('')
        if missing.any():
            print(f"{col} missing second split at rows {list(raw[missing].index)}")

        # extract numeric or preserve dash
        p1 = raw[0].str.extract(num_re)[0].mask(raw[0].eq('-'), '-')
        p2 = raw[1].str.extract(num_re)[0].mask(raw[1].eq('-'), '-')

        # report extras
        extras = raw.astype(str).apply(
            lambda col: col.str.split().apply(lambda toks: toks[1:] if len(toks)>1 else [])
        )
        for sub in extras:
            for i, ex in extras[sub][extras[sub].astype(bool)].items():
                print(f"{col}, row {i}: dropped extra tokens {ex}")

        df.drop(columns=[col], inplace=True)
        df.insert(pos, f"{col}_2", p2)
        df.insert(pos, f"{col}_1", p1)

    # 4a. Fill missing 'Unnamed: 1_2' with dash
    if 'Unnamed: 1_2' in df:
        df['Unnamed: 1_2'] = df['Unnamed: 1_2'].fillna('-')

    # 4b. Hard-code Rank 2021 for Hungary
    if {'Rank 2021_1','Rank 2021_2','Unnamed: 4'}.issubset(df.columns):
        hun_idx = df.index[df['Unnamed: 4']=='Hungary']
        for i in hun_idx:
            df.loc[i, 'Rank 2021_1'] = '5.0'
            df.loc[i, 'Rank 2021_2'] = '5.8'

    # 4c. Hard-code Unnamed:1 for Myanmar
    if {'Unnamed: 1_1','Unnamed: 1_2','Unnamed: 4'}.issubset(df.columns):
        my_idx = df.index[df['Unnamed: 4']=='Myanmar']
        for i in my_idx:
            df.loc[i, 'Unnamed: 1_1'] = '108'
            df.loc[i, 'Unnamed: 1_2'] = '90'

    # 4d. Replace "New" with dash in Unnamed: 8
    if 'Unnamed: 8' in df.columns:
        df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.replace('New','-',regex=False)

    # 5. Rename to final headers
    final_headers = [
        'Rank 2022', 'Rank 2021', 'Nation Brand', 'Region',
        'Index Score 2022', 'Index Score Change', 'Index Score 2021',
        'Familiarity', 'Reputation', 'Influence', 'Business & Trade',
        'Governance', 'International Relations', 'Culture & Heritage',
        'Media & Communication', 'Education & Science',
        'People & Values', 'COVID-19 Response', 'Medals'
    ]
    if len(df.columns) != len(final_headers):
        raise ValueError(f"Expected {len(final_headers)} columns but got {len(df.columns)}")
    df.columns = final_headers

    return df

In [42]:
df_2022_page_1 = clean_df_2022_page_1(dfs2022[1])
df_2022_page_1

Unnamed: 1, row 22: dropped extra tokens ['2']


Unnamed: 0,Rank 2022,Rank 2021,Nation Brand,Region,Index Score 2022,Index Score Change,Index Score 2021,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
2,31,34,Egypt,Middle East & North Africa,41.6,+3.3,38.3,7.1,5.8,4.0,3.5,2.2,3.3,3.9,3.2,3.0,3.1,3.9,1
3,32,28,Portugal,Europe,41.0,+0.2,40.8,6.5,6.3,3.9,3.1,2.8,3.0,3.3,3.0,2.6,3.1,4.5,-
4,33,31,Greece,Europe,40.4,+0.6,39.8,6.7,6.2,3.8,3.2,2.6,2.9,3.8,2.7,2.7,3.3,4.1,-
5,34,37,South Africa,Sub-Saharan Africa,40.2,+3.1,37.2,6.4,5.7,3.9,4.0,2.5,3.3,3.2,3.3,3.0,3.0,3.6,-
6,35,33,Thailand,Asia,40.2,+1.5,38.7,6.2,5.9,3.7,4.0,2.5,2.8,3.7,2.9,3.0,3.6,4.3,-
7,36,42,Kuwait,Middle East & North Africa,39.1,+3.3,35.8,4.7,5.9,3.9,3.8,3.1,3.6,2.4,2.6,2.5,2.7,5.0,-
8,37,30,Iceland,Europe,38.6,-1.3,39.9,5.2,6.4,3.2,3.8,3.6,3.3,2.7,3.1,3.2,3.2,4.4,-
9,38,41,Argentina,Latin America & Caribbean,38.5,+2.5,36.1,6.6,5.9,3.7,3.1,2.1,2.7,3.3,2.9,2.7,2.9,3.7,-
10,39,39,Malaysia,Asia,38.5,+1.5,36.9,5.5,6.0,3.6,3.8,2.7,3.0,2.5,2.7,3.0,3.0,4.4,-
11,40,38,Poland,Europe,38.2,+1.2,37.0,5.7,6.1,3.6,3.4,2.6,2.8,2.4,2.8,2.8,2.8,4.4,-


In [43]:
df_2022_page_2 = clean_df_2022_page_1(dfs2022[2])
df_2022_page_2

Unnamed: 1, row 22: dropped extra tokens ['1']


Unnamed: 0,Rank 2022,Rank 2021,Nation Brand,Region,Index Score 2022,Index Score Change,Index Score 2021,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
2,61,73,Dominican Republic,Latin America & Caribbean,32.9,+2.0,30.9,4.0,5.4,2.9,3.2,2.3,2.8,2.8,2.8,2.5,3.2,3.8,-
3,62,66,Bulgaria,Europe,32.9,+1.3,31.6,4.5,5.6,3.0,2.8,2.1,2.3,1.9,2.5,2.6,2.5,4.3,-
4,63,62,Iran,Middle East & North Africa,32.7,+0.9,31.9,5.8,5.1,3.7,2.3,1.8,2.3,1.8,1.9,2.1,1.6,2.7,-
5,64,46,Slovenia,Europe,32.6,-1.5,34.2,3.8,5.7,2.9,2.9,2.3,2.7,2.4,2.8,2.5,2.6,3.9,-
6,65,-,Malta,Europe,32.5,-,-,3.4,5.6,2.8,2.6,2.8,2.7,3.3,2.3,2.1,3.0,4.4,-
7,66,56,Uruguay,Latin America & Caribbean,32.3,-0.8,33.1,4.5,5.5,3.0,2.9,2.2,2.6,2.4,2.4,2.2,2.8,3.4,-
8,67,68,Costa Rica,Latin America & Caribbean,32.1,+0.7,31.4,4.2,5.6,3.0,2.7,2.0,2.4,2.2,2.4,2.4,2.7,3.9,-
9,68,65,Bahrain,Middle East & North Africa,32.0,+0.3,31.7,3.7,5.7,3.2,2.8,2.1,2.3,2.0,2.1,2.2,2.3,3.9,-
10,69,82,Nigeria,Sub-Saharan Africa,32.0,+2.8,29.2,5.1,5.1,3.2,2.6,1.8,2.2,2.5,2.3,2.2,2.5,3.2,-
11,70,77,Jamaica,Latin America & Caribbean,32.0,+1.8,30.2,4.9,5.3,2.9,2.6,1.8,2.0,3.4,2.5,2.1,3.3,3.2,-


In [44]:
df_2022_page_3 = clean_df_2022_page_1(dfs2022[3])
df_2022_page_3

Unnamed: 1, row 22: dropped extra tokens ['90', '1']


Unnamed: 0,Rank 2022,Rank 2021,Nation Brand,Region,Index Score 2022,Index Score Change,Index Score 2021,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
2,91,54,Estonia,Europe,29.9,-3.5,33.4,3.2,5.6,2.9,2.7,2.1,2.3,1.8,2.3,2.3,2.2,3.4,-
3,92,85,Bolivia,Latin America & Caribbean,29.9,+0.9,29.0,4.0,5.2,2.8,2.5,1.7,2.3,1.9,2.4,2.0,2.3,3.7,-
4,93,-,Barbados,Latin America & Caribbean,29.7,-,-,2.7,5.5,2.6,2.7,2.6,2.3,3.3,1.9,2.0,3.1,3.7,-
5,94,-,Madagascar,Sub-Saharan Africa,29.6,-,-,3.9,5.5,2.6,2.3,1.9,2.3,2.7,2.0,1.3,3.1,3.8,-
6,95,96,Kenya,Sub-Saharan Africa,29.5,+1.8,27.7,4.7,5.2,3.0,2.4,1.8,2.1,2.0,2.0,1.9,2.2,2.5,-
7,96,86,Cote dIvoire,Sub-Saharan Africa,29.4,+0.5,28.9,3.5,5.2,2.8,2.6,2.1,2.2,2.4,2.1,2.0,2.4,3.3,-
8,97,-,Montenegro,Europe,29.3,-,-,3.0,5.3,2.8,2.4,1.8,2.2,2.7,2.4,1.8,2.3,3.7,-
9,98,91,Ecuador,Latin America & Caribbean,29.3,+1.0,28.3,4.1,5.3,2.9,2.2,1.6,2.1,1.8,2.1,2.1,2.4,3.1,-
10,99,80,Latvia,Europe,29.3,-0.5,29.8,3.3,5.4,2.8,2.5,1.9,2.2,1.6,2.3,2.1,2.0,3.7,-
11,100,89,Cambodia,Asia,29.3,+0.7,28.5,4.1,5.3,2.7,2.6,2.0,2.1,2.1,2.0,1.9,2.3,3.3,-


In [45]:
combined2022 = pd.concat([df_2022_page_0, df_2022_page_1, df_2022_page_2, df_2022_page_3], 
                     axis=0,            # stack rows
                     ignore_index=True) # reset the row index

In [46]:
combined2022

Unnamed: 0,Rank 2022,Rank 2021,Nation Brand,Region,Index Score 2022,Index Score Change,Index Score 2021,Familiarity,Reputation,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,COVID-19 Response,Medals
0,1,6,United States,North America,70.7,14.8,55.9,9.2,7.1,7.7,7.5,4.8,7.1,5.9,6.8,8.6,3.9,4.8,14
1,2,3,United Kingdom,Europe,64.9,7.0,57.9,8.7,7.3,6.5,6.4,5.3,6.5,5.5,6.2,6.7,4.0,5.5,10
2,3,1,Germany,Europe,64.6,2.4,62.2,8.2,7.4,6.0,7.1,5.7,6.8,5.1,5.4,7.5,4.2,6.5,12
3,4,8,China,Asia,64.2,9.9,54.3,8.4,6.4,7.3,7.7,3.5,5.6,4.6,4.4,8.0,3.1,5.4,7
4,5,2,Japan,Asia,63.5,2.9,60.6,8.2,7.3,5.9,7.5,5.1,5.8,5.2,4.7,8.6,4.4,6.2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,105,Dem. Rep. Congo,Sub-Saharan Africa,27.1,+1.9,25.2,3.7,5.1,2.8,2.1,1.4,1.8,1.7,1.9,1.9,1.9,2.7,-
116,117,104,Mozambique,Sub-Saharan Africa,26.5,+1.1,25.4,3.4,4.9,2.6,2.5,1.6,1.8,1.7,1.7,1.6,2.4,3.0,-
117,118,102,Honduras,Latin America & Caribbean,26.5,+0.5,26.0,3.3,5.0,2.7,2.0,1.4,1.5,1.5,1.9,1.7,1.9,3.2,-
118,119,-,Sudan,Sub-Saharan Africa,26.0,-,-,4.4,5.0,2.9,1.9,0.9,1.6,1.8,1.5,1.0,2.3,1.9,-


### 2023 ###

In [47]:
# define the PDF, page and area coordinates
pdf_path = "https://static.brandirectory.com/reports/brand-finance-soft-power-index-2023-digital.pdf"
page_number = "81-84"


dfs = []
area = [100,  25, 900, 1500]  
for page in range(81, 85):      # pages 81, 82, 83, 84
    tables = tabula.read_pdf(
        pdf_path,
        pages=page,
        area=area,
        guess=False,
        stream=True,
        multiple_tables=False
    )
    # always one table in list
    dfs.append(tables[0])

In [48]:
def clean_soft_power_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # 0. override row 27 of Unnamed:2
    df.at[27, 'Unnamed: 2'] = 'Turkey'

    # 0b. strip all letters from Unnamed:1 and keep only the first integer
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )

    # 1. drop rows missing Unnamed:2
    df = df[df['Unnamed: 2'].notna()]

    # 2. drop unwanted columns (including Unnamed:11)
    df.drop(columns=[
        'Unnamed: 3',
        'Unnamed: 11',
        'Unnamed: 13',
        'Unnamed: 15',
        'Unnamed: 18',
        'Unnamed: 20'
    ], inplace=True)

    # 3. strip $ and commas from Unnamed:8 & Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 4. strip % from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 5. split & insert in place
    splits = [
        ('Nation',      'Familiarity',        'Reputation'),
        ('Unnamed: 14', 'Influence',          'Business & Trade'),
        ('Unnamed: 16', 'Governance',         'International Relations'),
        ('Unnamed: 19', 'Culture & Heritage', 'Media & Communication'),
    ]
    for old, col1, col2 in splits:
        split_df = (
            df[old]
              .astype(str)
              .str.split(r'\s+', n=1, expand=True)
              .iloc[:, :2]
        )
        left_series  = pd.to_numeric(split_df.iloc[:, 0], errors='coerce')
        right_series = pd.to_numeric(split_df.iloc[:, 1], errors='coerce')

        idx = df.columns.get_loc(old)
        df.insert(idx,     col1, left_series)
        df.insert(idx + 1, col2, right_series)
        df.drop(columns=[old], inplace=True)

    # 6. rename all columns to final schema
    df.columns = [
        'Rank 2023',
        'Rank 2022',
        'Nation Brand',
        'Region',
        'Index Score 2023',
        'Index Score Change',
        'Index Score 2022',
        'Brand Value 2023 (USD bn)',
        'Brand Value Change',
        'Brand Value 2022 (USD bn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'Governance',
        'International Relations',
        'Culture & Heritage',
        'Media & Communication',
        'Education & Science',
        'People & Values',
        'Sustainability',
        'Medals'
    ]

    # 7. override row 27 in the renamed 'Rank 2022' column to 22
    df.at[27, 'Rank 2022'] = 22
    
    return df

In [49]:
df_2023_0 = clean_soft_power_df(dfs[0])

In [50]:
df_2023_0.iloc[0]

Rank 2023                              1.0
Rank 2022                              1.0
Nation Brand                 United States
Region                       North America
Index Score 2023                      74.8
Index Score Change                     4.1
Index Score 2022                      70.7
Brand Value 2023 (USD bn)            30309
Brand Value Change                     +14
Brand Value 2022 (USD bn)            26472
Familiarity                            9.3
Reputation                             7.1
Influence                              7.5
Business & Trade                       7.9
Governance                             5.9
International Relations                8.3
Culture & Heritage                     6.9
Media & Communication                  6.5
Education & Science                    8.4
People & Values                        4.6
Sustainability                         6.9
Medals                                  16
Name: 1, dtype: object

In [51]:
df_2023_0

Unnamed: 0,Rank 2023,Rank 2022,Nation Brand,Region,Index Score 2023,Index Score Change,Index Score 2022,Brand Value 2023 (USD bn),Brand Value Change,Brand Value 2022 (USD bn),...,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Sustainability,Medals
1,1.0,1.0,United States,North America,74.8,4.1,70.7,30309,14,26472,...,7.5,7.9,5.9,8.3,6.9,6.5,8.4,4.6,6.9,16
2,2.0,2.0,United Kingdom,Europe,67.3,2.4,64.9,4797,17,4115,...,6.5,6.9,6.3,7.7,6.5,6.0,5.5,4.8,6.8,11
3,3.0,3.0,Germany,Europe,65.8,1.2,64.6,5076,13,4504,...,6.0,7.4,7.7,,6.0,5.2,5.9,4.8,7.4,11
4,4.0,5.0,Japan,Asia,65.2,1.7,63.5,4449,4,4284,...,5.8,7.7,6.1,6.6,6.0,4.6,7.0,5.0,7.3,8
5,5.0,4.0,China,Asia,65.0,0.8,64.2,23085,7,21528,...,7.1,7.7,4.4,6.5,5.3,4.0,6.9,3.6,5.7,6
6,6.0,6.0,France,Europe,62.4,1.8,60.6,3670,16,3152,...,5.7,6.8,5.4,7.1,7.0,5.1,4.7,4.7,6.5,5
7,7.0,7.0,Canada,North America,60.7,1.2,59.5,2621,17,2238,...,5.3,6.8,6.6,,5.2,5.1,4.6,5.6,7.0,13
8,8.0,8.0,Switzerland,Europe,58.5,2.0,56.6,1032,3,1006,...,4.9,7.0,6.5,,5.3,4.7,4.4,5.2,7.2,13
10,9.0,10.0,Italy,Europe,56.6,1.8,54.7,2345,9,2155,...,5.1,6.3,4.5,5.8,6.9,4.5,3.5,5.1,5.6,6
11,10.0,15.0,United Arab Emirates,MENA,55.2,3.2,52.0,957,24,773,...,5.5,6.5,5.2,6.1,4.6,4.2,4.1,4.4,5.9,2


In [52]:
def process_page2(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # 1. fix a specific cell in Unnamed:2
    df.at[27, 'Unnamed: 2'] = 'Jordan'

    # 2. move row 17’s values from 'Nation' onward into row 15, then clear row 17
    start = df.columns.get_loc('Nation')
    cols_to_move = df.columns[start:]
    df.loc[15, cols_to_move] = df.loc[17, cols_to_move]
    df.loc[17, cols_to_move] = pd.NA

    # 3. keep only the first integer in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )

    # 4. drop Unnamed:3 and Unnamed:11
    df.drop(columns=['Unnamed: 3', 'Unnamed: 11', 'Unnamed: 23', 'Unnamed: 24'], inplace=True)

    # 5. strip % from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 6. strip $ from Unnamed:8 and Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'\$', '', regex=True)

    # 7. drop rows with NaN in Unnamed:2 and reset index
    df = df[df['Unnamed: 2'].notna()].reset_index(drop=True)

    # 8. split the numeric‐pair column "Nation" in-place,
    #    printing warnings for any failures alongside the country name.
    pair_col = 'Nation'
    orig_pairs = df[pair_col].astype(str).copy()
    countries  = df['Unnamed: 2'].copy()
    bad_rows   = []

    # prepare two empty Series
    n1 = pd.Series(index=df.index, dtype="Float64")
    n2 = pd.Series(index=df.index, dtype="Float64")

    # 8a. do the split
    for i, text in orig_pairs.items():
        cleaned = re.sub(r'[^0-9.\s]', '', text).strip()
        parts  = cleaned.split()
        if len(parts)==2 and all(re.fullmatch(r'\d+(\.\d+)?', p) for p in parts):
            n1.at[i] = float(parts[0])
            n2.at[i] = float(parts[1])
        else:
            bad_rows.append(i)

    # 8b. override the four known bad rows with correct values
    overrides = {
        11: (6.5, 6.1),
        13: (7.0, 5.7),
        15: (3.5, 6.0),
        17: (5.0, 5.9),
    }
    for idx, (v1, v2) in overrides.items():
        if idx in n1.index:
            n1.at[idx] = v1
            n2.at[idx] = v2
            if idx in bad_rows:
                bad_rows.remove(idx)

    # 8c. insert the two new columns in place of "Nation"
    idx = df.columns.get_loc(pair_col)
    df.drop(columns=[pair_col], inplace=True)
    df.insert(idx, f'{pair_col}_1', n1)
    df.insert(idx+1, f'{pair_col}_2', n2)

    # 8d. report any remaining split failures
    for i in bad_rows:
        print(
            f"Warning: row {i}, country={countries.at[i]!r}, "
            f"{pair_col}={orig_pairs.at[i]!r} did not match 'num num'"
        )

    # 9. rename all columns to final schema
    df.columns = [
        'Rank 2023',
        'Rank 2022',
        'Nation Brand',
        'Region',
        'Index Score 2023',
        'Index Score Change',
        'Index Score 2022',
        'Brand Value 2023 (USD bn)',
        'Brand Value Change',
        'Brand Value 2022 (USD bn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'Governance',
        'International Relations',
        'Culture & Heritage',
        'Media & Communication',
        'Education & Science',
        'People & Values',
        'Sustainability',
        'Medals'
    ]


    return df


In [53]:
df_2023_1 = process_page2(dfs[1])
df_2023_1

Unnamed: 0,Rank 2023,Rank 2022,Nation Brand,Region,Index Score 2023,Index Score Change,Index Score 2022,Brand Value 2023 (USD bn),Brand Value Change,Brand Value 2022 (USD bn),...,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Sustainability,Medals
0,31.0,28.0,Brazil,LATAM & Caribbean,46.2,2.9,43.4,825,10,753,...,4.4,4.6,2.9,3.9,5.4,3.5,2.6,4.4,4.3,2
1,32.0,41.0,Luxembourg,Europe,45.3,7.7,37.6,129,15,112,...,3.9,5.2,5.2,4.9,4.0,4.0,3.4,4.2,5.7,-
2,33.0,40.0,Poland,Europe,45.1,6.9,38.2,790,11,714,...,4.0,4.9,4.0,4.6,3.9,3.7,3.1,4.3,5.1,-
3,34.0,37.0,Iceland,Europe,44.7,6.2,38.6,30,14,27,...,3.4,4.9,5.2,4.4,4.1,4.0,3.3,4.7,6.4,1
4,35.0,36.0,Kuwait,MENA,44.3,5.2,39.1,149,3,144,...,4.5,5.0,4.0,4.7,3.5,3.5,3.0,4.0,4.7,-
5,36.0,33.0,Greece,Europe,44.3,3.9,40.4,119,10,108,...,3.7,4.4,3.6,4.0,5.5,3.5,2.7,4.6,4.7,1
6,37.0,51.0,Ukraine,Europe,44.3,10.1,34.2,83,-3,85,...,4.7,3.8,3.4,3.9,3.0,3.5,2.7,3.5,3.8,1
7,38.0,31.0,Egypt,MENA,43.0,1.5,41.6,225,5,214,...,4.1,4.0,2.9,3.9,4.6,3.3,2.6,3.6,4.0,1
8,39.0,39.0,Malaysia,Asia,42.6,4.1,38.5,512,3,499,...,4.1,4.9,3.6,3.7,3.6,3.5,3.0,3.9,4.6,-
9,40.0,34.0,South Africa,Sub-Saharan Africa,42.5,2.3,40.2,225,4,216,...,4.1,4.4,3.1,3.9,4.0,3.4,2.6,3.6,4.3,


In [54]:
dfs[2].iloc[1]

Unnamed: 0              61
Unnamed: 1              60
Unnamed: 2               1
Unnamed: 3     Philippines
Unnamed: 4            Asia
Unnamed: 5            38.7
Unnamed: 6             5.4
Unnamed: 7            33.2
Unnamed: 8            $526
Unnamed: 9             +9%
Unnamed: 10           $481
Unnamed: 11              *
Nation            5.7  5.7
Unnamed: 13            3.6
Unnamed: 14            4.0
Unnamed: 15            3.0
Unnamed: 16            3.3
Unnamed: 17            3.6
Unnamed: 18            3.1
Unnamed: 19            2.4
Unnamed: 20            4.0
Unnamed: 21            3.8
Unnamed: 22              -
Unnamed: 23            NaN
Unnamed: 24            NaN
Name: 1, dtype: object

In [55]:
dfs[2]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,,,,,,,,,,...,,,,,,,,,,
1,61,60,1,Philippines,Asia,38.7,5.4,33.2,$526,+9%,...,3.0,3.3,3.6,3.1,2.4,4.0,3.8,-,,
2,62,84,2,Slovakia,Europe,38.7,7.7,30.9,$133,+18%,...,3.6,3.7,3.2,3.4,2.7,3.7,4.7,-,,
3,63,66,2,Uruguay,LATAM & Caribbean,38.4,6.2,32.3,$58,+17%,...,3.3,3.6,3.7,3.4,2.7,4.0,4.4,-,,
4,64,50,1,Romania,Europe,38.4,4.0,34.4,$210,+10%,...,3.2,3.5,3.4,3.2,2.7,3.7,4.2,-,,
5,65,99,2,Latvia,Europe,38.4,9.1,29.3,$44,+9%,...,3.5,3.7,3.2,3.6,2.9,3.9,4.8,-,,
6,66,53,1,Cuba,LATAM & Caribbean,38.3,4.3,34.0,$43,-,...,3.0,3.3,3.6,3.0,2.7,3.9,3.5,-,,
7,67,71,2,Mauritius,Sub-Saharan Africa,38.0,6.1,31.9,$11,+1%,...,3.6,3.6,3.7,3.6,2.8,4.2,4.5,-,,
8,68,58,1,Cyprus,Europe,37.8,4.5,33.3,$27,+17%,...,3.4,3.5,3.7,3.4,2.6,4.1,4.3,-,,
9,,,,,,,,,,,...,,,,,,,,,,


In [56]:
def process_page3(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the attached DataFrame according to your six steps:

    0. Sets Unnamed:3 at row 25 to "Tunisia"    
    1. Drops Unnamed:11, Unnamed:23, Unnamed:24, Unnamed:2
    2. Strips '%' from Unnamed:9
    3. Strips '$' from Unnamed:8 and Unnamed:10
    4. Drops any row where Unnamed:3 is NaN
    5. Splits 'Nation' into two numeric columns in place, warning on failures
    6. Reduces Unnamed:1 to its first integer if it contains extra text
    7. Manual overrides for certain 'Nation' splits and 
    8. Numeric‐only in Unnamed:0.
    """
    df = df.copy()

    # 0. override row 25 of Unnamed:3
    df.at[25, 'Unnamed: 3'] = 'Tunisia'    
    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 11', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 2'],
            inplace=True)

    # 2. strip '%' from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 3. strip '$' from Unnamed:8 & Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'\$', '', regex=True)

    # 4. drop rows missing Unnamed:3
    df = df[df['Unnamed: 3'].notna()].reset_index(drop=True)

    # 5. split the 'Nation' column into two numeric parts, with overrides
    orig = df['Nation'].astype(str).copy()
    bad = []
    part1 = pd.Series(index=df.index, dtype="Float64")
    part2 = pd.Series(index=df.index, dtype="Float64")

    # initial parse
    for i, txt in orig.items():
        cleaned = re.sub(r'[^0-9.\s]', '', txt).strip()
        tokens = cleaned.split()
        if len(tokens) == 2 and all(re.fullmatch(r'\d+(\.\d+)?', t) for t in tokens):
            part1.at[i], part2.at[i] = float(tokens[0]), float(tokens[1])
        else:
            bad.append(i)

    # manual overrides for the four problematic rows
    overrides = {
        11: (4.0, 6.0),
        13: (2.5, 5.9),
        15: (4.7, 5.6),
        17: (2.9, 5.8),
    }
    for idx, (v1, v2) in overrides.items():
        if idx in part1.index:
            part1.at[idx] = v1
            part2.at[idx] = v2
            if idx in bad:
                bad.remove(idx)

    # insert the two new columns in place of 'Nation'
    idx = df.columns.get_loc('Nation')
    df.drop(columns=['Nation'], inplace=True)
    df.insert(idx,     'Nation_1', part1)
    df.insert(idx + 1, 'Nation_2', part2)

    # warn about any remaining failures
    for i in bad:
        print(f"Warning: row {i}, Nation={orig.at[i]!r} could not be parsed as two numbers")

    # 6. reduce Unnamed:1 to its first integer
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )

    # 7. reduce Unnamed:0 to its first integer
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )

    # 8. rename all columns to final schema
    df.columns = [
        'Rank 2023',
        'Rank 2022',
        'Nation Brand',
        'Region',
        'Index Score 2023',
        'Index Score Change',
        'Index Score 2022',
        'Brand Value 2023 (USD bn)',
        'Brand Value Change',
        'Brand Value 2022 (USD bn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'Governance',
        'International Relations',
        'Culture & Heritage',
        'Media & Communication',
        'Education & Science',
        'People & Values',
        'Sustainability',
        'Medals'
    ]

    return df


In [57]:
df_2023_2 = process_page3(dfs[2])
df_2023_2

Unnamed: 0,Rank 2023,Rank 2022,Nation Brand,Region,Index Score 2023,Index Score Change,Index Score 2022,Brand Value 2023 (USD bn),Brand Value Change,Brand Value 2022 (USD bn),...,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Sustainability,Medals
0,61.0,60.0,Philippines,Asia,38.7,5.4,33.2,526,+9,481,...,3.6,4.0,3.0,3.3,3.6,3.1,2.4,4.0,3.8,-
1,62.0,84.0,Slovakia,Europe,38.7,7.7,30.9,133,+18,114,...,3.6,4.0,3.6,3.7,3.2,3.4,2.7,3.7,4.7,-
2,63.0,66.0,Uruguay,LATAM & Caribbean,38.4,6.2,32.3,58,+17,49,...,3.4,3.9,3.3,3.6,3.7,3.4,2.7,4.0,4.4,-
3,64.0,50.0,Romania,Europe,38.4,4.0,34.4,210,+10,191,...,3.5,4.0,3.2,3.5,3.4,3.2,2.7,3.7,4.2,-
4,65.0,99.0,Latvia,Europe,38.4,9.1,29.3,44,+9,40,...,3.6,4.2,3.5,3.7,3.2,3.6,2.9,3.9,4.8,-
5,66.0,53.0,Cuba,LATAM & Caribbean,38.3,4.3,34.0,43,-,-,...,3.7,3.5,3.0,3.3,3.6,3.0,2.7,3.9,3.5,-
6,67.0,71.0,Mauritius,Sub-Saharan Africa,38.0,6.1,31.9,11,+1,11,...,3.5,4.2,3.6,3.6,3.7,3.6,2.8,4.2,4.5,-
7,68.0,58.0,Cyprus,Europe,37.8,4.5,33.3,27,+17,23,...,3.4,4.2,3.4,3.5,3.7,3.4,2.6,4.1,4.3,-
8,69.0,59.0,Vietnam,Asia,37.8,4.4,33.3,498,+16,431,...,3.5,4.2,2.8,3.2,3.3,2.9,2.5,3.5,4.0,-
9,70.0,54.0,Panama,LATAM & Caribbean,37.7,3.8,33.9,61,+0,60,...,3.6,4.3,3.0,3.6,3.2,3.2,2.5,3.5,4.2,


In [58]:
dfs[3].iloc[1]

Unnamed: 0              91.0
Unnamed: 1           102 2 H
Unnamed: 2             Nepal
Unnamed: 3               NaN
Unnamed: 4              Asia
Unnamed: 5              35.6
Unnamed: 6              +6.5
Unnamed: 7              29.1
Unnamed: 8               $29
Unnamed: 9              +17%
Unnamed: 10              $25
Unnamed: 11                g
Nation         4.2  5.7  3.2
Unnamed: 13              NaN
Unnamed: 14              3.5
Unnamed: 15              2.9
Unnamed: 16              3.0
Unnamed: 17              3.5
Unnamed: 18              3.0
Unnamed: 19              2.4
Unnamed: 20              4.0
Unnamed: 21              3.9
Unnamed: 22                -
Unnamed: 23              NaN
Name: 1, dtype: object

In [59]:
def process_page4(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame according to your steps, with:
    
    0. In row 28, set Unnamed:2 to "Mozambique"
    1. Drop Unnamed:3, Unnamed:11, Unnamed:23, Unnamed:13
    2. Strip '%' from Unnamed:9
    3. Strip '$' (and commas) from Unnamed:8 and Unnamed:10
    4. Drop rows where Unnamed:2 is NaN
    5. Reduce Unnamed:1 to its first integer
    5b. Replace any Unnamed:1 value below 20 with '-'
    6. Split 'Nation' into three numeric columns after stripping non‑digit/dot
    """
    df = df.copy()

    # 0. override row 28’s country
    df.at[28, 'Unnamed: 2'] = 'Mozambique'

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 3', 'Unnamed: 11', 'Unnamed: 23', 'Unnamed: 13'],
            inplace=True)

    # 2. strip '%' from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 3. strip '$' & commas from Unnamed:8 & Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 4. drop rows missing Unnamed:2
    df = df[df['Unnamed: 2'].notna()].reset_index(drop=True)

    # 5. keep only the first integer in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )
    
    # 5b. replace any value below 20 with '-'
    df['Unnamed: 1'] = df['Unnamed: 1'].where(df['Unnamed: 1'] >= 20, '-')
    # 6. split 'Nation' into three numeric cols, collecting failures
    orig   = df['Nation'].astype(str).copy()
    idx    = df.columns.get_loc('Nation')
    bad_rows = []

    n1 = pd.Series(index=df.index, dtype="Float64")
    n2 = pd.Series(index=df.index, dtype="Float64")
    n3 = pd.Series(index=df.index, dtype="Float64")

    for i, txt in orig.items():
        cleaned = re.sub(r'[^0-9.\s]', '', txt).strip()
        parts  = cleaned.split()
        if len(parts)==3 and all(re.fullmatch(r'\d+(\.\d+)?', p) for p in parts):
            n1.at[i], n2.at[i], n3.at[i] = map(float, parts)
        else:
            bad_rows.append(i)

    # 6b. manual overrides for the seven problematic rows
    overrides = {
        8:  (4.9, 5.6, 3.1),
        12: (3.8, 5.4, 3.3),
        14: (3.5, 5.0, 3.4),
        16: (4.0, 5.2, 3.2),
        18: (3.9, 5.3, 3.2),
        24: (4.3, 5.2, 3.3),
        30: (3.9, 5.0, 3.1),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in n1.index:
            n1.at[i], n2.at[i], n3.at[i] = v1, v2, v3
            if i in bad_rows:
                bad_rows.remove(i)

    # drop the old column and insert the three new ones in its place
    df.drop(columns=['Nation'], inplace=True)
    df.insert(idx,     'Nation_1', n1)
    df.insert(idx + 1, 'Nation_2', n2)
    df.insert(idx + 2, 'Nation_3', n3)

    # any remaining failures?
    for i in bad_rows:
        print(f"Warning: row {i}, raw={orig.at[i]!r} could not be split cleanly")

    # 7. rename all columns to final schema
    df.columns = [
        'Rank 2023',
        'Rank 2022',
        'Nation Brand',
        'Region',
        'Index Score 2023',
        'Index Score Change',
        'Index Score 2022',
        'Brand Value 2023 (USD bn)',
        'Brand Value Change',
        'Brand Value 2022 (USD bn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'Governance',
        'International Relations',
        'Culture & Heritage',
        'Media & Communication',
        'Education & Science',
        'People & Values',
        'Sustainability',
        'Medals'
    ]

    return df

In [60]:
df_2023_3 = process_page4(dfs[3])
df_2023_3

Unnamed: 0,Rank 2023,Rank 2022,Nation Brand,Region,Index Score 2023,Index Score Change,Index Score 2022,Brand Value 2023 (USD bn),Brand Value Change,Brand Value 2022 (USD bn),...,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Sustainability,Medals
0,91.0,102.0,Nepal,Asia,35.6,+6.5,29.1,29,+17,25,...,3.2,3.5,2.9,3.0,3.5,3.0,2.4,4.0,3.9,-
1,92.0,86.0,Ghana,Sub-Saharan Africa,35.5,+5.2,30.3,63,+10,57,...,3.3,3.9,2.9,3.3,3.5,3.0,2.3,3.7,3.9,-
2,93.0,69.0,Nigeria,Sub-Saharan Africa,35.4,+3.4,32.0,265,+10,241,...,3.5,3.6,2.4,3.1,3.4,2.9,2.1,3.5,3.3,-
3,94.0,72.0,Bosnia and Herzegovina,Europe,35.3,+3.5,31.8,14,-,-,...,3.4,3.7,3.0,3.3,3.1,3.0,2.5,3.7,3.8,-
4,95.0,119.0,Sudan,Sub-Saharan Africa,35.3,+9.3,26.0,18,-,-,...,3.5,3.7,2.7,3.2,3.0,2.9,2.4,3.7,3.5,-
5,96.0,109.0,Botswana,Sub-Saharan Africa,35.2,+6.7,28.5,17,-,-,...,3.4,3.8,3.2,3.6,3.1,3.2,2.5,3.7,4.2,-
6,97.0,105.0,Bangladesh,Asia,35.1,+6.1,29.0,508,+37,371,...,3.5,3.8,2.5,3.0,2.7,2.8,2.3,3.2,3.5,-
7,98.0,101.0,Tanzania,Sub-Saharan Africa,35.1,+6.0,29.1,49,+18,41,...,3.4,3.7,3.0,3.3,3.2,3.0,2.3,3.6,3.9,-
8,99.0,70.0,Jamaica,LATAM & Caribbean,35.1,+3.1,32.0,10,+16,9,...,3.1,3.4,2.4,2.9,4.2,2.9,2.0,4.2,3.3,-
9,100.0,95.0,Kenya,Sub-Saharan Africa,35.0,+5.5,29.5,83,+3,80,...,3.4,3.6,2.6,3.0,3.3,2.8,2.2,3.5,3.6,-


In [61]:
combined2023 = pd.concat([df_2023_0, df_2023_1, df_2023_2, df_2023_3], 
                     axis=0,            # stack rows
                     ignore_index=True) # reset the row index

In [62]:
combined2023

Unnamed: 0,Rank 2023,Rank 2022,Nation Brand,Region,Index Score 2023,Index Score Change,Index Score 2022,Brand Value 2023 (USD bn),Brand Value Change,Brand Value 2022 (USD bn),...,Influence,Business & Trade,Governance,International Relations,Culture & Heritage,Media & Communication,Education & Science,People & Values,Sustainability,Medals
0,1.0,1.0,United States,North America,74.8,4.1,70.7,30309,+14,26472,...,7.5,7.9,5.9,8.3,6.9,6.5,8.4,4.6,6.9,16
1,2.0,2.0,United Kingdom,Europe,67.3,2.4,64.9,4797,+17,4115,...,6.5,6.9,6.3,7.7,6.5,6.0,5.5,4.8,6.8,11
2,3.0,3.0,Germany,Europe,65.8,1.2,64.6,5076,+13,4504,...,6.0,7.4,7.7,,6.0,5.2,5.9,4.8,7.4,11
3,4.0,5.0,Japan,Asia,65.2,1.7,63.5,4449,+4,4284,...,5.8,7.7,6.1,6.6,6.0,4.6,7.0,5.0,7.3,8
4,5.0,4.0,China,Asia,65.0,0.8,64.2,23085,+7,21528,...,7.1,7.7,4.4,6.5,5.3,4.0,6.9,3.6,5.7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,117.0,106.0,Laos,Asia,33.6,+4.8,28.9,12,-,-,...,3.3,3.6,2.7,2.9,2.9,3.0,2.6,3.4,3.7,-
117,118.0,120.0,Trinidad and Tobago,LATAM & Caribbean,33.5,+8.2,25.3,15,-,-,...,3.2,3.5,2.8,3.1,3.1,3.2,2.6,3.7,3.7,-
118,119.0,115.0,Uganda,Sub-Saharan Africa,33.4,+6.0,27.3,32,+22,26,...,3.2,3.5,2.7,2.9,2.9,2.9,2.3,3.5,3.6,-
119,120.0,111.0,Guatemala,LATAM & Caribbean,33.3,+5.1,28.2,60,+12,53,...,3.2,3.3,2.6,3.0,3.0,3.1,2.4,3.3,3.8,-


### 2024 ###

In [63]:
dfs2024 = tabula.read_pdf("https://static.brandirectory.com/reports/brand-finance-soft-power-index-2024-digital.pdf", pages='32-39', stream=True, multiple_tables=True)

In [64]:
# define the PDF, page and area coordinates
pdf_path = "https://static.brandirectory.com/reports/brand-finance-soft-power-index-2024-digital.pdf"
page_number = "32-29"


dfs2024 = []
area = [100,  25, 900, 1500]  
for page in range(32, 40): 
    tables = tabula.read_pdf(
        pdf_path,
        pages=page,
        area=area,
        guess=False,
        stream=True,
        multiple_tables=False
    )
    # always one table in list
    dfs2024.append(tables[0])

In [65]:
dfs2024[0]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,1.0,1 0 É,,USA,North America,78.8,+4.0,74.8,"$32,271,140",+6.5%,...,,7.4 6.1,8.7,,6.6 4.4,,8.4,6.1,17,
1,2.0,2 0 Ê,,United Kingdom,Europe,71.8,+4.5,67.3,"$4,036,790",-15.8%,...,,6.9 6.6,8.3,,5.9 4.9,,6.2,6.2,12,
2,3.0,5 2 Ë,,China,Asia,71.2,+6.2,65.0,"$19,960,020",-13.5%,...,,5.7 4.8,7.1,,4.4 3.6,,7.6,5.5,8,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,4 0 Ì,,Japan,Asia,70.6,+5.4,65.2,"$4,406,090",-1.0%,...,,6.8 6.5,7.0,,4.7 5.5,,7.9,7.1,13,
5,5.0,3 1 Í,,Germany,Europe,69.8,+4.0,65.8,"$4,985,350",-1.8%,...,,5.8 6.6,7.9,,5.0 4.8,,6.6,6.8,12,
6,6.0,6 0 Î,,France,Europe,67.3,+4.9,62.4,"$3,522,360",-4.0%,...,,7.9 5.4,7.4,,5.0 4.6,,5.0,5.7,4,
7,7.0,7 0 Ï,,Canada,North America,64.4,+3.7,60.7,"$2,670,820",+1.9%,...,,5.0 6.4,6.9,,4.8 5.7,,5.1,6.3,17,
8,,,,,,,,,,,...,,,,,,,,,,
9,8.0,8 0 Ð,,Switzerland,Europe,62.9,+4.4,58.5,"$1,065,370",+3.3%,...,,5.1 6.6,6.6,,4.3 5.5,,5.0,6.5,18,


In [66]:
def clean_df_2024_page_0(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2024 soft‑power DataFrame:
    
    1. Drop rows where Unnamed:4 is NaN
    2. In Unnamed:1 keep only the first integer
    3. Remove $ (and commas) from Unnamed:8 & Unnamed:10
    4. Remove % from Unnamed:9
    5. Drop Unnamed:23, Unnamed:11, Unnamed:2, Unnamed:14, Unnamed:19, Unnamed:17
    6. After that override row 18:
         Unnamed:3 → "Denmark", Unnamed:2 → 18.0
    7. Clean 'Nation Brand' by stripping non‑digit/dot then dropping “2022”/“2023”
    8. Split each of these four columns into two numeric cols in place:
         'Nation Brand', 'Unnamed: 15', 'Unnamed: 13', 'Unnamed: 18'
       reporting any that fail to split into exactly two floats.
    9. Apply manual overrides for the three remaining bad 'Nation Brand' rows.
   10. Drop the final column.
    """
    df = df.copy()

    # 1. drop rows missing Unnamed:4
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 2. keep first integer in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+)', expand=False)
          .astype(float)
    )

    # 3. strip $ and commas from Unnamed:8 & Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 4. strip % from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 5. drop unwanted columns
    df.drop(columns=[
        'Unnamed: 23',
        'Unnamed: 11',
        'Unnamed: 2',
        'Unnamed: 14',
        'Unnamed: 19',
        'Unnamed: 17'
    ], inplace=True)

    # 6. override row 18 now that rows have been dropped
    df.at[18, 'Unnamed: 3'] = 'Denmark'
    df.at[18, 'Unnamed: 2'] = 18.0

    # 7. clean 'Nation Brand'
    def clean_nb(cell):
        s = re.sub(r'[^0-9.\s]', '', str(cell))
        tokens = [t for t in s.split() if t not in ('2022','2023')]
        return ' '.join(tokens)
    df['Nation Brand'] = df['Nation Brand'].apply(clean_nb)

    # 8. split four columns into two numeric parts in place
    split_cols = ['Nation Brand', 'Unnamed: 15', 'Unnamed: 13', 'Unnamed: 18']
    bad = []
    for col in split_cols:
        orig = df[col].astype(str).copy()
        parts = orig.str.split(r'\s+', n=1, expand=True)
        left  = pd.to_numeric(parts[0], errors='coerce')
        right = pd.to_numeric(parts[1], errors='coerce')

        # record failures
        mask_bad = orig.ne('') & (left.isna() | right.isna())
        for i in df.index[mask_bad]:
            bad.append((col, i, orig.at[i]))

        # insert in place
        idx = df.columns.get_loc(col)
        df.insert(idx,     f'{col}_1', left)
        df.insert(idx + 1, f'{col}_2', right)
        df.drop(columns=[col], inplace=True)

    # 9. manual overrides for the three problematic 'Nation Brand' rows
    overrides = {
        11: (7.0, 4.9),
        12: (7.8, 5.0),
        24: (7.3, 5.5),
    }
    for i, (v1, v2) in overrides.items():
        if i in df.index:
            df.at[i, 'Nation Brand_1'] = v1
            df.at[i, 'Nation Brand_2'] = v2
            # remove from bad list if present
            bad = [b for b in bad if not (b[0]=='Nation Brand' and b[1]==i)]

    # report any remaining split failures
    for col, i, val in bad:
        print(f"Warning: row {i}, column {col!r} could not split: {val!r}")

    # 10. drop the final column
    df.drop(columns=[df.columns[-1]], inplace=True)

    # 11. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [67]:
df_2024_page_0 = clean_df_2024_page_0(dfs2024[0])
df_2024_page_0

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,1.0,1.0,USA,North America,78.8,4.0,74.8,32271140,6.5,30309110,...,7.3,8.9,7.4,6.1,8.7,6.6,4.4,8.4,6.1,17
1,2.0,2.0,United Kingdom,Europe,71.8,4.5,67.3,4036790,-15.8,4796830,...,7.6,8.3,6.9,6.6,8.3,5.9,4.9,6.2,6.2,12
2,3.0,5.0,China,Asia,71.2,6.2,65.0,19960020,-13.5,23085110,...,6.8,8.8,5.7,4.8,7.1,4.4,3.6,7.6,5.5,8
3,4.0,4.0,Japan,Asia,70.6,5.4,65.2,4406090,-1.0,4448780,...,7.7,8.9,6.8,6.5,7.0,4.7,5.5,7.9,7.1,13
4,5.0,3.0,Germany,Europe,69.8,4.0,65.8,4985350,-1.8,5075970,...,7.5,8.5,5.8,6.6,7.9,5.0,4.8,6.6,6.8,12
5,6.0,6.0,France,Europe,67.3,4.9,62.4,3522360,-4.0,3669880,...,7.1,8.0,7.9,5.4,7.4,5.0,4.6,5.0,5.7,4
6,7.0,7.0,Canada,North America,64.4,3.7,60.7,2670820,1.9,2621270,...,7.7,7.8,5.0,6.4,6.9,4.8,5.7,5.1,6.3,17
7,8.0,8.0,Switzerland,Europe,62.9,4.4,58.5,1065370,3.3,1031600,...,7.8,8.2,5.1,6.6,6.6,4.3,5.5,5.0,6.5,18
8,9.0,9.0,Italy,Europe,62.0,5.4,56.6,2326270,-0.8,2345220,...,7.3,7.4,7.9,4.5,6.0,4.5,5.3,4.0,5.1,6
9,10.0,10.0,United Arab Emirates,MENA,59.7,4.5,55.2,1061770,10.9,957250,...,7.1,7.7,4.5,5.3,6.3,4.2,4.5,4.5,5.5,3


In [68]:
dfs2024[1].iloc[1]

Unnamed: 0                 27.0
Unnamed: 1      30 2 ü Portugal
Unnamed: 2                  NaN
Unnamed: 3               Europe
Unnamed: 4                 50.1
Unnamed: 5                 +3.5
Unnamed: 6                 46.6
Unnamed: 7             $264,410
Unnamed: 8                -2.0%
Unnamed: 9             $269,860
Unnamed: 10                 NaN
Nation Brand        7.1 4.6 6.9
Unnamed: 12                 5.4
Unnamed: 13                 5.2
Unnamed: 14                 4.1
Unnamed: 15                 4.5
Unnamed: 16                 3.7
Unnamed: 17                 4.7
Unnamed: 18                 3.2
Unnamed: 19                 4.3
Unnamed: 20                   -
Unnamed: 21                 NaN
Unnamed: 22                 NaN
Name: 1, dtype: object

In [69]:
dfs2024[1]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,26.0,26 0 û New Zealand,,Oceania,52.5,+2.7,49.8,"$250,210",-10.6%,"$279,940",...,4.5,5.2,4.8,3.9,5.2,3.9,5.6,-,,
1,27.0,30 2 ü Portugal,,Europe,50.1,+3.5,46.6,"$264,410",-2.0%,"$269,860",...,5.2,4.1,4.5,3.7,4.7,3.2,4.3,-,,
2,28.0,29 2 ý Ireland,,Europe,49.9,+3.2,46.7,"$854,400",+2.6%,"$832,800",...,4.7,4.6,4.6,3.6,4.9,3.6,5.0,-,,
3,,,,,,,,,,,...,,,,,,,,,,
4,29.0,28 1 India,,Asia,49.8,+2.8,47.0,"$2,944,480",+0.7%,"$2,923,950",...,5.7,2.9,4.3,3.2,3.4,4.2,3.0,-,,
5,30.0,32 2 Luxembourg,,Europe,49.0,+3.7,45.3,"$126,120",-2.0%,"$128,730",...,3.9,5.3,4.9,3.6,4.6,3.9,5.3,-,,
6,31.0,31 0 Brazil,,LATAM & Caribbean,48.8,+2.6,46.2,"$911,210",+10.5%,"$824,800",...,5.5,2.9,4.0,3.5,4.3,2.7,3.6,2,,
7,32.0,27 1 Israel,,MENA,48.7,+0.3,48.4,"$414,380",-0.5%,"$416,390",...,3.4,3.5,5.0,3.7,3.1,4.3,3.6,-,,
8,,,,,,,,,,,...,,,,,,,,,,
9,33.0,33 0 Poland,,Europe,48.6,+3.5,45.1,"$863,580",+9.3%,"$790,350",...,4.0,4.1,5.0,3.6,4.3,3.5,4.4,-,,


In [70]:
def clean_df_2024_page_1(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame according to your steps, with:
      1. Drop Unnamed:2,10,21,22
      2. Strip $ from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Split Unnamed:1 → Rank & Country (keep only first number; Country only A–Z letters)
      5. Drop rows with NaN in Unnamed:3
      6. After that override row 18: Country='Ukraine', Rank=37.0
      7. Split Nation Brand into three numeric columns (clean before split), with manual fixes
    """
    df = df.copy()

    # 1. Drop columns
    df.drop(columns=['Unnamed: 2', 'Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'],
            inplace=True)

    # 2. Remove $ from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. Remove % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. Split Unnamed:1 into Rank and Country
    idx1 = df.columns.get_loc('Unnamed: 1')
    # first numeric token → Rank
    rank_series = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+\.?\d*)', expand=False)
          .astype(float)
    )
    # remainder → Country, then strip digits/dots and non‑A–Z characters
    country_series = (
        df['Unnamed: 1']
          .astype(str)
          .str.replace(r'^\d+\.?\d*\s*', '', regex=True)      # remove leading number
          .str.replace(r'[\d\.]', '', regex=True)              # remove digits/dots
          .str.replace(r'[^A-Za-z\s]', '', regex=True)         # keep only A–Z letters & spaces
          .str.strip()
    )
    df.drop(columns=['Unnamed: 1'], inplace=True)
    df.insert(idx1,     'Rank',    rank_series)
    df.insert(idx1 + 1, 'Country', country_series)

    # 5. Drop rows with NaN in Unnamed:3
    df = df[df['Unnamed: 3'].notna()].reset_index(drop=True)

    # 6. Override row 18
    df.at[18, 'Country'] = 'Ukraine'
    df.at[18, 'Rank']    = 37.0

    # 7. Split 'Nation Brand' into three numeric parts
    orig_nb = df['Nation Brand'].astype(str).copy()
    # strip non‑digit/dot/space, then split
    cleaned = orig_nb.apply(lambda x: re.sub(r'[^0-9.\s]', '', x).strip())
    parts   = cleaned.str.split(r'\s+', expand=True)

    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # find rows that didn't yield exactly three valid numbers
    bad = [
        (i, orig_nb.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3 or any(not re.fullmatch(r'\d+(\.\d+)?', t) for t in toks)
    ]

    # drop old and insert new
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # manual fixes for known bad rows
    overrides_nb = {
        9:  (5.6, 4.4, 6.4),
        12: (7.3, 4.3, 6.4),
        14: (6.8, 4.2, 6.3),
        24: (5.7, 3.9, 5.9),
    }
    for i, (v1, v2, v3) in overrides_nb.items():
        if i in df.index:
            df.at[i, 'Nation_1'] = v1
            df.at[i, 'Nation_2'] = v2
            df.at[i, 'Nation_3'] = v3
            bad = [b for b in bad if b[0] != i]

    # report any remaining split failures
    for i, raw in bad:
        print(f"Warning: row {i}, Nation Brand={raw!r} did not split into three numeric parts")

    # 8. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df


In [71]:
df_2024_page_1 = clean_df_2024_page_1(dfs2024[1])
df_2024_page_1

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,26.0,26.0,New Zealand,Oceania,52.5,+2.7,49.8,250210,-10.6,279940,...,7.2,6.7,4.5,5.2,4.8,3.9,5.2,3.9,5.6,-
1,27.0,30.0,Portugal,Europe,50.1,+3.5,46.6,264410,-2.0,269860,...,6.9,5.4,5.2,4.1,4.5,3.7,4.7,3.2,4.3,-
2,28.0,29.0,Ireland,Europe,49.9,+3.2,46.7,854400,+2.6,832800,...,6.9,6.2,4.7,4.6,4.6,3.6,4.9,3.6,5.0,-
3,29.0,28.0,India,Asia,49.8,+2.8,47.0,2944480,+0.7,2923950,...,6.0,4.6,5.7,2.9,4.3,3.2,3.4,4.2,3.0,-
4,30.0,32.0,Luxembourg,Europe,49.0,+3.7,45.3,126120,-2.0,128730,...,7.0,6.8,3.9,5.3,4.9,3.6,4.6,3.9,5.3,-
5,31.0,31.0,Brazil,LATAM & Caribbean,48.8,+2.6,46.2,911210,+10.5,824800,...,6.6,4.6,5.5,2.9,4.0,3.5,4.3,2.7,3.6,2
6,32.0,27.0,Israel,MENA,48.7,+0.3,48.4,414380,-0.5,416390,...,5.5,4.9,3.4,3.5,5.0,3.7,3.1,4.3,3.6,-
7,33.0,33.0,Poland,Europe,48.6,+3.5,45.1,863580,+9.3,790350,...,6.5,5.4,4.0,4.1,5.0,3.6,4.3,3.5,4.4,-
8,34.0,34.0,Iceland,Europe,45.8,+1.1,44.7,32600,+8.2,30140,...,6.8,5.6,3.8,4.8,4.0,3.3,4.8,3.6,5.5,-
9,35.0,39.0,Malaysia,Asia,45.7,+3.1,42.6,517690,+1.2,511740,...,6.4,5.4,3.9,3.7,4.0,3.3,4.2,3.3,4.1,-


In [72]:
dfs2024[2].iloc[1]

Unnamed: 0                    52.0
Unnamed: 1      61 2 1 Philippines
Unnamed: 2                     NaN
Unnamed: 3                    Asia
Unnamed: 4                    39.8
Unnamed: 5                     1.1
Unnamed: 6                    38.7
Unnamed: 7                $525,620
Unnamed: 8                   -0.1%
Unnamed: 9                $526,250
Unnamed: 10                      I
Nation Brand               6.0 3.8
Unnamed: 12                    5.9
Unnamed: 13                    3.8
Unnamed: 14                    3.8
Unnamed: 15                    2.7
Unnamed: 16                    3.3
Unnamed: 17                    3.0
Unnamed: 18                    4.0
Unnamed: 19                    2.6
Unnamed: 20                    3.2
Unnamed: 21                      -
Unnamed: 22                    NaN
Name: 1, dtype: object

In [73]:
dfs2024[2]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,51.0,50 1 0 Bahrain,,MENA,40.0,0.0,40.0,"$27,450",+2.5%,"$26,780",...,5.0,3.1,3.6,4.0,3.2,3.9,2.8,3.5,-,
1,52.0,61 2 1 Philippines,,Asia,39.8,1.1,38.7,"$525,620",-0.1%,"$526,250",...,3.8,3.8,2.7,3.3,3.0,4.0,2.6,3.2,-,
2,53.0,69 2 2 Vietnam,,Asia,39.6,1.8,37.8,"$507,060",+1.8%,"$498,130",...,4.0,3.5,2.8,3.2,2.8,3.5,2.7,3.2,-,
3,,,,,,,,,,,...,,,,,,,,,,
4,54.0,49 1 3 Georgia,,Europe,39.3,-0.7,40.0,"$23,660",+9.1%,"$21,680",...,4.2,3.9,3.4,3.8,3.4,4.2,3.0,3.8,-,
5,55.0,59 2 4 Maldives,,Asia,39.2,0.3,38.9,"$3,650",+9.9%,"$3,320",...,4.1,4.0,2.9,2.9,3.1,4.4,2.4,3.7,1,
6,56.0,52 1 5 Slovenia,,Europe,39.0,-0.6,39.6,"$91,220",+1.5%,"$89,870",...,4.1,3.4,3.5,3.6,3.2,3.9,2.9,4.1,-,
7,57.0,51 1 6 Estonia,,Europe,38.9,-1.1,40.0,"$50,550",+5.3%,"$48,010",...,4.5,3.3,3.7,3.8,3.2,3.9,3.2,4.2,-,
8,,,,,,,,,,,...,,,,,,,,,,
9,58.0,64 2 7 Romania,,Europe,38.8,0.4,38.4,"$215,950",+2.8%,"$210,150",...,3.7,3.5,3.0,3.3,3.0,3.7,2.7,3.5,-,


In [74]:
def clean_df_2024_page_2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame according to your steps, with final overrides:
    
    1. Drop Unnamed:2, Unnamed:10, Unnamed:22
    2. Strip $ from Unnamed:7 & Unnamed:9
    3. Strip % from Unnamed:8
    4. Split Unnamed:1 into 'Rank' (first number) and 'Country' (letters only)
    5. Drop rows where Unnamed:3 is NaN
    6. After row‐drop, override row 18: Rank=58.0, Country='Colombia'
    7. Split 'Nation Brand' into two numeric parts, with manual fixes for rows 9,12,24
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 2', 'Unnamed: 10', 'Unnamed: 22'],
            inplace=True)

    # 2. remove $ from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. remove % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. split Unnamed:1 → Rank & Country
    idx1 = df.columns.get_loc('Unnamed: 1')
    # extract first numeric token
    rank = (df['Unnamed: 1']
            .astype(str)
            .str.extract(r'(\d+\.?\d*)', expand=False)
            .astype(float))
    # country = remainder, then keep only letters and spaces
    country = (df['Unnamed: 1']
               .astype(str)
               .str.replace(r'^\d+\.?\d*\s*', '', regex=True)     # remove leading number
               .str.replace(r'[^A-Za-z\s]', '', regex=True)       # keep only A–Z letters & spaces
               .str.strip())
    df.drop(columns=['Unnamed: 1'], inplace=True)
    df.insert(idx1,     'Rank',    rank)
    df.insert(idx1 + 1, 'Country', country)
    
    # remove any leading “X ” (letter + space) so “A Malta” → “Malta”, “C Costa Rica” → “Costa Rica”, etc.
    df['Country'] = df['Country'].str.replace(r'^[A-Za-z]\s+', '', regex=True)

    # 5. drop rows with NaN in Unnamed:3
    df = df[df['Unnamed: 3'].notna()].reset_index(drop=True)

    # 6. override row 18
    if 18 in df.index:
        df.at[18, 'Rank']    = 58.0
        df.at[18, 'Country'] = 'Colombia'

    # 7. split 'Nation Brand' into two numeric parts
    orig_nb = df['Nation Brand'].astype(str).copy()
    # strip out all non-digit/dot/space
    cleaned = orig_nb.apply(lambda x: re.sub(r'[^0-9.\s]', '', x).strip())
    parts = cleaned.str.split(r'\s+', n=1, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')

    # record failures
    bad = [ (i, orig_nb.iat[i])
            for i in df.index
            if pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i]) ]

    # manual overrides for rows 9, 12, 24
    overrides = {
        9:  (4.2, 3.6),
        12: (3.6, 3.7),
        24: (4.7, 3.6),
    }
    for i, (v1, v2) in overrides.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            bad = [b for b in bad if b[0] != i]

    # replace the original column with the two new ones
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)

    # report any remaining split failures
    for i, raw in bad:
        print(f"Warning: row {i}, Nation Brand={raw!r} could not split into two numbers")
    
    # 8. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [75]:
df_2024_page_2 = clean_df_2024_page_2(dfs2024[2])
df_2024_page_2

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,51.0,50.0,Bahrain,MENA,40.0,0.0,40.0,27450,2.5,26780,...,5.8,5.0,3.1,3.6,4.0,3.2,3.9,2.8,3.5,-
1,52.0,61.0,Philippines,Asia,39.8,1.1,38.7,525620,-0.1,526250,...,5.9,3.8,3.8,2.7,3.3,3.0,4.0,2.6,3.2,-
2,53.0,69.0,Vietnam,Asia,39.6,1.8,37.8,507060,1.8,498130,...,5.7,4.0,3.5,2.8,3.2,2.8,3.5,2.7,3.2,-
3,54.0,49.0,Georgia,Europe,39.3,-0.7,40.0,23660,9.1,21680,...,5.9,4.2,3.9,3.4,3.8,3.4,4.2,3.0,3.8,-
4,55.0,59.0,Maldives,Asia,39.2,0.3,38.9,3650,9.9,3320,...,6.6,4.1,4.0,2.9,2.9,3.1,4.4,2.4,3.7,1
5,56.0,52.0,Slovenia,Europe,39.0,-0.6,39.6,91220,1.5,89870,...,6.0,4.1,3.4,3.5,3.6,3.2,3.9,2.9,4.1,-
6,57.0,51.0,Estonia,Europe,38.9,-1.1,40.0,50550,5.3,48010,...,5.8,4.5,3.3,3.7,3.8,3.2,3.9,3.2,4.2,-
7,58.0,64.0,Romania,Europe,38.8,0.4,38.4,215950,2.8,210150,...,5.9,3.7,3.5,3.0,3.3,3.0,3.7,2.7,3.5,-
8,59.0,54.0,Chile,LATAM & Caribbean,38.8,-0.7,39.5,272150,0.9,269750,...,5.8,3.8,3.6,2.7,3.3,3.0,3.8,2.6,3.3,-
9,60.0,62.0,Slovakia,Europe,38.8,0.1,38.7,128560,-3.6,133420,...,5.9,4.1,3.4,3.3,3.6,3.1,3.9,2.9,3.8,-


In [76]:
dfs2024[3].iloc[1]

Unnamed: 0         77.0
Unnamed: 1       83 2 a
Unnamed: 2          NaN
Unnamed: 3      Tunisia
Unnamed: 4          NaN
Unnamed: 5         MENA
Unnamed: 6         36.6
Unnamed: 7         +0.2
Unnamed: 8         36.4
Unnamed: 9      $23,350
Unnamed: 10      -16.9%
Unnamed: 11     $28,100
Unnamed: 12           z
Nation Brand    4.5 3.6
Unnamed: 14         5.7
Unnamed: 15         3.4
Unnamed: 16         3.7
Unnamed: 17         2.7
Unnamed: 18         3.2
Unnamed: 19         3.0
Unnamed: 20         3.8
Unnamed: 21         2.4
Unnamed: 22         3.0
Unnamed: 23           -
Unnamed: 24         NaN
Name: 1, dtype: object

In [77]:
dfs2024[3]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,76.0,- 3,̀,North Korea,,Asia,36.7,-,-,-,...,3.4,2.3,3.0,3.3,2.6,2.1,3.1,2.7,-,
1,77.0,83 2 a,,Tunisia,,MENA,36.6,+0.2,36.4,"$23,350",...,3.4,3.7,2.7,3.2,3.0,3.8,2.4,3.0,-,
2,78.0,75 1 b,,Paraguay,,LATAM & Caribbean,36.3,-0.5,36.8,"$36,280",...,3.5,3.3,2.8,3.3,3.0,3.8,2.6,3.3,-,
3,,,,,,,,,,,...,,,,,,,,,,
4,79.0,93 2 c,,Nigeria,,Sub-Saharan Africa,36.3,+0.9,35.4,"$209,560",...,3.1,3.3,2.1,2.9,2.9,3.0,2.2,2.7,-,
5,80.0,79 1 d,,Kazakhstan,,Asia,35.9,-0.6,36.5,"$231,570",...,3.5,3.0,2.9,3.3,3.0,3.6,2.7,3.3,-,
6,81.0,84 2 e,,Pakistan,,Asia,35.6,-0.8,36.4,"$219,440",...,2.8,2.6,2.2,3.1,2.7,2.8,2.2,2.5,-,
7,82.0,- 3 f,,Liechtenstein,,Europe,35.5,-,-,"$8,790",...,5.1,2.7,4.1,3.5,2.8,3.7,2.8,4.1,-,
8,,,,,,,,,,,...,,,,,,,,,,
9,83.0,81 1 g,,Dominican Republic,,LATAM & Caribbean,35.5,-0.9,36.4,"$94,410",...,3.4,3.5,2.7,3.2,3.0,4.0,2.4,3.0,-,


In [78]:
def clean_df_2024_page_3(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame from dfs2024[3] according to:
      1. Drop Unnamed:2, Unnamed:4, Unnamed:12, Unnamed:24
      2. Strip $ from Unnamed:9 & Unnamed:11
      3. Strip % from Unnamed:10
      4. Keep only the first numeric token in Unnamed:1
      5. Drop rows where Unnamed:5 is NaN
      6. Override row 18's Unnamed:1 to 103
      7. Clean and split 'Nation Brand' into two numeric cols with manual fixes
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 2', 'Unnamed: 4', 'Unnamed: 12', 'Unnamed: 24'], inplace=True)

    # 2. remove $ (and commas) from Unnamed:9 & Unnamed:11
    for c in ['Unnamed: 9', 'Unnamed: 11']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:10
    df['Unnamed: 10'] = df['Unnamed: 10'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+\.?\d*)', expand=False)
          .astype(float)
    )

    # 5. drop rows missing Unnamed:5
    df = df[df['Unnamed: 5'].notna()].reset_index(drop=True)

    # 6. override row 18's rank to 103
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 103.0

    # 7. split 'Nation Brand' into two numeric parts
    orig = df['Nation Brand'].astype(str).copy()
    cleaned = orig.apply(lambda x: re.sub(r'[^0-9.\s]', '', x).strip())
    parts = cleaned.str.split(r'\s+', n=1, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')

    # record failures
    bad = [i for i in df.index if pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i])]

    # manual fixes for the four problematic rows
    for i, (v1, v2) in {9:(4.4,3.5), 12:(3.7,3.0), 14:(5.2,3.2), 24:(3.9,3.5)}.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            if i in bad: bad.remove(i)

    # replace the original column with the two new ones
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_Brand_1', nb1)
    df.insert(idx_nb + 1, 'Nation_Brand_2', nb2)

    # report any remaining split failures
    for i in bad:
        print(f"Warning: row {i}, Nation Brand={orig.iat[i]!r} did not split properly")

    # 8. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [79]:
df_2024_page_3 = clean_df_2024_page_3(dfs2024[3])
df_2024_page_3

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,76.0,3.0,North Korea,Asia,36.7,-,-,-,-,-,...,4.1,3.4,2.3,3.0,3.3,2.6,2.1,3.1,2.7,-
1,77.0,83.0,Tunisia,MENA,36.6,+0.2,36.4,23350,-16.9,28100,...,5.7,3.4,3.7,2.7,3.2,3.0,3.8,2.4,3.0,-
2,78.0,75.0,Paraguay,LATAM & Caribbean,36.3,-0.5,36.8,36280,+10.1,32960,...,5.6,3.5,3.3,2.8,3.3,3.0,3.8,2.6,3.3,-
3,79.0,93.0,Nigeria,Sub-Saharan Africa,36.3,+0.9,35.4,209560,-20.9,264910,...,5.3,3.1,3.3,2.1,2.9,2.9,3.0,2.2,2.7,-
4,80.0,79.0,Kazakhstan,Asia,35.9,-0.6,36.5,231570,+6.2,218130,...,5.4,3.5,3.0,2.9,3.3,3.0,3.6,2.7,3.3,-
5,81.0,84.0,Pakistan,Asia,35.6,-0.8,36.4,219440,-5.3,231730,...,4.8,2.8,2.6,2.2,3.1,2.7,2.8,2.2,2.5,-
6,82.0,3.0,Liechtenstein,Europe,35.5,-,-,8790,-,-,...,5.5,5.1,2.7,4.1,3.5,2.8,3.7,2.8,4.1,-
7,83.0,81.0,Dominican Republic,LATAM & Caribbean,35.5,-0.9,36.4,94410,+9.4,86320,...,5.7,3.4,3.5,2.7,3.2,3.0,4.0,2.4,3.0,-
8,84.0,57.0,Azerbaijan,Europe,35.3,-3.8,39.1,39750,+7.8,36870,...,5.3,3.5,3.2,2.8,3.5,3.0,3.7,2.5,3.2,-
9,85.0,92.0,Ghana,Sub-Saharan Africa,35.1,-0.4,35.5,63320,+0.1,63230,...,5.4,3.2,3.4,2.7,3.1,2.8,3.5,2.4,2.9,-


In [80]:
dfs2024[4].iloc[1]

Unnamed: 0                102.0
Unnamed: 1      78 1 Montenegro
Unnamed: 2                  NaN
Unnamed: 3               Europe
Unnamed: 4                 34.0
Unnamed: 5                 -2.5
Unnamed: 6                 36.5
Unnamed: 7               $3,890
Unnamed: 8                -2.5%
Unnamed: 9               $3,990
Unnamed: 10                   ¬
Nation Brand            3.1 3.3
Unnamed: 12                 5.4
Unnamed: 13                 3.4
Unnamed: 14                 3.1
Unnamed: 15                 2.9
Unnamed: 16                 3.0
Unnamed: 17                 3.0
Unnamed: 18                 3.7
Unnamed: 19                 2.6
Unnamed: 20                 3.2
Unnamed: 21                   -
Unnamed: 22                 NaN
Name: 1, dtype: object

In [81]:
dfs2024[4]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,101.0,101 0 Uzbekistan,,Asia,34.1,-0.9,35.0,"$110,270",+11.1%,"$99,290",...,3.3,3.1,2.8,3.2,2.9,3.6,2.5,3.0,-,
1,102.0,78 1 Montenegro,,Europe,34.0,-2.5,36.5,"$3,890",-2.5%,"$3,990",...,3.4,3.1,2.9,3.0,3.0,3.7,2.6,3.2,-,
2,103.0,102 1 Madagascar,,Sub-Saharan Africa,33.9,-1.0,34.9,"$11,120",-4.6%,"$11,660",...,3.3,3.2,2.7,2.8,2.7,3.8,2.4,3.2,-,
3,,,,,,,,,,,...,,,,,,,,,,
4,104.0,85 1 Rwanda,,Sub-Saharan Africa,33.9,-2.3,36.2,"$11,390",+2.9%,"$11,070",...,3.4,2.9,3.2,3.1,2.9,3.4,2.5,3.4,-,
5,105.0,91 1 Nepal,,Asia,33.8,-1.8,35.6,"$27,870",-3.6%,"$28,900",...,3.0,3.3,2.6,2.9,2.7,3.9,2.3,2.9,-,
6,106.0,- 3 Armenia,,Europe,33.5,-,-,"$17,790",-,-,...,3.2,3.3,2.6,3.0,3.0,3.4,2.6,2.8,-,
7,107.0,- 3 Brunei Darussalam,,Asia,33.3,-,-,"$6,530",-,-,...,4.1,2.7,3.3,3.4,2.8,3.4,2.5,3.1,-,
8,,,,,,,,,,,...,,,,,,,,,,
9,108.0,87 1 Côte d'Ivoire,,Sub-Saharan Africa,33.2,-2.7,35.9,"$50,840",+19.8%,"$42,440",...,3.2,3.1,2.4,2.9,2.8,3.5,2.3,2.8,-,


In [82]:
def clean_df_2024_page_4(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame with these steps:
      1. Drop Unnamed:2, Unnamed:10, Unnamed:22
      2. Strip $ from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Drop rows where Unnamed:3 is NaN
      5. Split Unnamed:1 into Rank & Country (allow '-' as a rank), strip non‑letters
      6. Hard‑code Country fixes for rows 14,18,24
      7. Split 'Nation Brand' into two numeric parts, removing letters then “2023”,
         and apply manual overrides for rows 11,12,14,24
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 2', 'Unnamed: 10', 'Unnamed: 22'], inplace=True)

    # 2. remove $ from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. drop rows with NaN in Unnamed:3
    df = df[df['Unnamed: 3'].notna()].reset_index(drop=True)

    # 5. split Unnamed:1 → Rank & Country
    orig1 = df['Unnamed: 1'].astype(str).copy()
    parts1 = orig1.str.extract(r'^\s*(\d+|-)\s+\d*\s*(.*)$', expand=True)
    rank    = parts1[0]
    country = parts1[1].str.strip().str.replace(r'[^A-Za-z\s]', '', regex=True)
    bad1 = [(i, orig1.iat[i]) for i in df.index if pd.isna(rank.iat[i]) or country.iat[i]==""]
    idx1 = df.columns.get_loc('Unnamed: 1')
    df.drop(columns=['Unnamed: 1'], inplace=True)
    df.insert(idx1,     'Rank',    rank)
    df.insert(idx1 + 1, 'Country', country)
    for i, val in bad1:
        print(f"Warning: row {i}, Unnamed:1={val!r} could not split into Rank & Country")

    # 6. hard‑coded country fixes
    for idx, name in {14: 'Ethiopia',
                      18: 'Central African Republic',
                      24: 'San Marino'}.items():
        if idx in df.index:
            df.at[idx, 'Country'] = name

    # 7. split 'Nation Brand' into two numeric parts
    orig_nb = df['Nation Brand'].astype(str).copy()
    step1   = orig_nb.str.replace(r'[A-Za-z]', '', regex=True)    # strip letters
    step2   = step1.str.replace('2023', '', regex=False)         # remove "2023"
    cleaned = step2.str.strip()
    parts_nb= cleaned.str.split(r'\s+', n=1, expand=True)
    nb1     = pd.to_numeric(parts_nb[0], errors='coerce')
    nb2     = pd.to_numeric(parts_nb[1], errors='coerce')

    # record any initial failures
    bad_nb = [i for i in df.index if pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i])]

    # manual overrides for rows 11,12,14,24
    overrides_nb = {
        11: (4.4, 3.2),
        12: (3.5, 3.3),
        14: (4.6, 3.3),
        24: (2.5, 2.7),
    }
    for i, (v1, v2) in overrides_nb.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            if i in bad_nb:
                bad_nb.remove(i)

    # insert the two new columns in place of 'Nation Brand'
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)

    # report any remaining split failures
    for i in bad_nb:
        print(f"Warning: row {i}, Nation Brand={orig_nb.iat[i]!r} did not split into two numeric parts")

    # 8. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [83]:
df_2024_page_4 = clean_df_2024_page_4(dfs2024[4])
df_2024_page_4

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,101.0,101,Uzbekistan,Asia,34.1,-0.9,35.0,110270,+11.1,99290,...,5.2,3.3,3.1,2.8,3.2,2.9,3.6,2.5,3.0,-
1,102.0,78,Montenegro,Europe,34.0,-2.5,36.5,3890,-2.5,3990,...,5.4,3.4,3.1,2.9,3.0,3.0,3.7,2.6,3.2,-
2,103.0,102,Madagascar,Sub-Saharan Africa,33.9,-1.0,34.9,11120,-4.6,11660,...,5.7,3.3,3.2,2.7,2.8,2.7,3.8,2.4,3.2,-
3,104.0,85,Rwanda,Sub-Saharan Africa,33.9,-2.3,36.2,11390,+2.9,11070,...,5.0,3.4,2.9,3.2,3.1,2.9,3.4,2.5,3.4,-
4,105.0,91,Nepal,Asia,33.8,-1.8,35.6,27870,-3.6,28900,...,5.6,3.0,3.3,2.6,2.9,2.7,3.9,2.3,2.9,-
5,106.0,-,Armenia,Europe,33.5,-,-,17790,-,-,...,5.3,3.2,3.3,2.6,3.0,3.0,3.4,2.6,2.8,-
6,107.0,-,Brunei Darussalam,Asia,33.3,-,-,6530,-,-,...,5.0,4.1,2.7,3.3,3.4,2.8,3.4,2.5,3.1,-
7,108.0,87,Cte dIvoire,Sub-Saharan Africa,33.2,-2.7,35.9,50840,+19.8,42440,...,5.2,3.2,3.1,2.4,2.9,2.8,3.5,2.3,2.8,-
8,109.0,-,North Macedonia,Europe,33.2,-,-,9910,-,-,...,5.0,3.3,3.2,2.8,3.2,2.9,3.7,2.6,3.3,-
9,110.0,109,Cameroon,Sub-Saharan Africa,33.2,-1.2,34.4,18450,-6.9,19810,...,5.3,2.9,3.1,2.3,2.9,2.7,3.3,2.2,2.7,-


In [84]:
dfs2024[5].iloc[1]

Unnamed: 0                   127.0
Unnamed: 1        121 1 Å Zimbabwe
Unnamed: 2                     NaN
Unnamed: 3      Sub-Saharan Africa
Unnamed: 4                    31.5
Unnamed: 5                    -1.2
Unnamed: 6                    32.7
Unnamed: 7                  $1,420
Unnamed: 8                   +1.4%
Unnamed: 9                  $1,400
Unnamed: 10                      Ý
Nation Brand               4.2 3.2
Unnamed: 12                    5.0
Unnamed: 13                    2.7
Unnamed: 14                    2.6
Unnamed: 15                    2.2
Unnamed: 16                    2.6
Unnamed: 17                    2.6
Unnamed: 18                    3.2
Unnamed: 19                    2.1
Unnamed: 20                    2.6
Unnamed: 21                      -
Unnamed: 22                    NaN
Name: 1, dtype: object

In [85]:
dfs2024[5]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,126.0,120 1 Ä Guatemala,,LATAM & Caribbean,31.5,-1.8,33.3,"$70,140",+17.2%,"$59,860",...,3.0,2.9,2.3,3.0,2.8,3.4,2.3,3.0,-,
1,127.0,121 1 Å Zimbabwe,,Sub-Saharan Africa,31.5,-1.2,32.7,"$1,420",+1.4%,"$1,400",...,2.7,2.6,2.2,2.6,2.6,3.2,2.1,2.6,-,
2,128.0,- 3 Æ Andorra,,Europe,31.4,-,-,"$2,610",-,-,...,4.1,2.8,3.0,2.7,2.7,3.5,2.4,3.3,-,
3,,,,,,,,,,,...,,,,,,,,,,
4,129.0,- 3 Ç Syria,,MENA,31.2,-,-,"$4,530",-,-,...,2.0,2.5,1.7,2.4,2.5,2.7,1.8,1.9,-,
5,130.0,- 3 È Dominica,,LATAM & Caribbean,31.1,-,-,$320,-,-,...,3.2,3.1,2.7,3.1,2.7,3.5,2.4,2.8,-,
6,131.0,- 3 É Liberia,,Sub-Saharan Africa,30.8,-,-,$10,-,-,...,3.0,2.6,2.6,3.0,2.8,3.1,2.4,2.9,-,
7,132.0,114 1 Ê Mozambique,,Sub-Saharan Africa,30.8,-3.1,33.9,"$14,360",+10.2%,"$13,030",...,3.0,2.7,2.4,2.6,2.7,3.4,2.2,2.8,-,
8,,,,,,,,,,,...,,,,,,,,,,
9,133.0,- 3 Ë Namibia,,Sub-Saharan Africa,30.5,-,-,"$6,670",-,-,...,2.9,2.7,2.7,2.7,2.5,3.5,2.2,2.9,-,


In [86]:
def clean_df_2024_page_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame from dfs2024[5] with these steps:
      1. Drop Unnamed:2, Unnamed:10, Unnamed:22
      2. Strip $ from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Drop rows where Unnamed:3 is NaN
      5. Split Unnamed:1 into Rank & Country (ignore the middle number), 
         allow '-' as Rank, then override row 18 → (74, 'Seychelles')
      6. Split 'Nation Brand' into two numeric parts:
         a) remove all letters
         b) remove any '2023'
         c) split into two floats, with manual overrides for rows 11,12,14,24
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 2', 'Unnamed: 10', 'Unnamed: 22'], inplace=True)

    # 2. strip $ from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. drop rows with NaN in Unnamed:3
    df = df[df['Unnamed: 3'].notna()].reset_index(drop=True)

    # 5. split Unnamed:1 → Rank & Country
    orig1 = df['Unnamed: 1'].astype(str).copy()
    parts1 = orig1.str.extract(r'^\s*(\d+|-)\s+\d*\s*(.*)$', expand=True)
    rank    = parts1[0]
    country = parts1[1].str.strip().str.replace(r'[^A-Za-z\s]', '', regex=True)
    bad1 = [(i, orig1.iat[i]) for i in df.index if pd.isna(rank.iat[i]) or country.iat[i]==""]
    idx1 = df.columns.get_loc('Unnamed: 1')
    df.drop(columns=['Unnamed: 1'], inplace=True)
    df.insert(idx1,     'Rank',    rank)
    df.insert(idx1 + 1, 'Country', country)

    # log any failures (except we will override row 18 next)
    bad1 = [(i, orig1.iat[i]) 
            for i in df.index 
            if pd.isna(rank.iat[i]) or not country.iat[i]]

    # override row 18
    if 18 in df.index:
        df.at[18, 'Rank']    = 74
        df.at[18, 'Country'] = 'Seychelles'
        bad1 = [b for b in bad1 if b[0] != 18]

    for i, val in bad1:
        print(f"Warning: row {i}, Unnamed:1={val!r} could not split into Rank & Country")

    # 6. split 'Nation Brand' into two numeric parts
    orig_nb = df['Nation Brand'].astype(str).copy()
    # a) remove all letters
    step1 = orig_nb.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    cleaned = step2.str.strip()
    # c) split into two pieces
    parts_nb = cleaned.str.split(r'\s+', n=1, expand=True)
    nb1 = pd.to_numeric(parts_nb[0], errors='coerce')
    nb2 = pd.to_numeric(parts_nb[1], errors='coerce')

    # record failures
    bad_nb = [
        (i, orig_nb.iat[i])
        for i, toks in cleaned.str.split().items()
        if len(toks) != 2 or pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i])
    ]

    # manual overrides
    overrides_nb = {
        11: (2.5, 2.7),
        12: (2.8, 3.1),
        14: (3.4, 3.2),
        24: (2.2, 2.7),
    }
    for i, (v1, v2) in overrides_nb.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            bad_nb = [b for b in bad_nb if b[0] != i]

    # insert Nation_1, Nation_2
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)

    for i, val in bad_nb:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into two numeric parts")


    # 8. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [87]:
df_2024_page_5 = clean_df_2024_page_5(dfs2024[5])
df_2024_page_5

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,126.0,120,Guatemala,LATAM & Caribbean,31.5,-1.8,33.3,70140,+17.2,59860,...,5.0,3.0,2.9,2.3,3.0,2.8,3.4,2.3,3.0,-
1,127.0,121,Zimbabwe,Sub-Saharan Africa,31.5,-1.2,32.7,1420,+1.4,1400,...,5.0,2.7,2.6,2.2,2.6,2.6,3.2,2.1,2.6,-
2,128.0,-,Andorra,Europe,31.4,-,-,2610,-,-,...,5.2,4.1,2.8,3.0,2.7,2.7,3.5,2.4,3.3,-
3,129.0,-,Syria,MENA,31.2,-,-,4530,-,-,...,4.2,2.0,2.5,1.7,2.4,2.5,2.7,1.8,1.9,-
4,130.0,-,Dominica,LATAM & Caribbean,31.1,-,-,320,-,-,...,5.1,3.2,3.1,2.7,3.1,2.7,3.5,2.4,2.8,-
5,131.0,-,Liberia,Sub-Saharan Africa,30.8,-,-,10,-,-,...,4.9,3.0,2.6,2.6,3.0,2.8,3.1,2.4,2.9,-
6,132.0,114,Mozambique,Sub-Saharan Africa,30.8,-3.1,33.9,14360,+10.2,13030,...,4.9,3.0,2.7,2.4,2.6,2.7,3.4,2.2,2.8,-
7,133.0,-,Namibia,Sub-Saharan Africa,30.5,-,-,6670,-,-,...,5.0,2.9,2.7,2.7,2.7,2.5,3.5,2.2,2.9,-
8,134.0,117,Laos,Asia,30.5,-3.1,33.6,9200,-23.5,12030,...,4.7,2.9,2.6,2.3,2.8,2.8,3.1,2.3,2.7,-
9,135.0,-,Fiji,Oceania,30.5,-,-,3050,-,-,...,5.3,3.2,3.1,2.5,2.7,2.6,3.8,2.3,3.0,-


In [88]:
dfs2024[6].iloc[1]

Unnamed: 0            152.0
Unnamed: 1            - 3 õ
Unnamed: 2      Afghanistan
Unnamed: 3              NaN
Unnamed: 4             Asia
Unnamed: 5             28.6
Unnamed: 6                -
Unnamed: 7                -
Unnamed: 8           $5,080
Unnamed: 9                -
Unnamed: 10               -
Unnamed: 11             NaN
Nation Brand        6.1 3.1
Unnamed: 13             3.8
Unnamed: 14             1.9
Unnamed: 15             1.8
Unnamed: 16             1.6
Unnamed: 17             2.2
Unnamed: 18             2.1
Unnamed: 19             1.9
Unnamed: 20             1.5
Unnamed: 21             1.8
Unnamed: 22               -
Unnamed: 23             NaN
Name: 1, dtype: object

In [89]:
dfs2024[6]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,151.0,- 3 ô,South Sudan,,Sub-Saharan Africa,28.6,-,-,"$4,950",-,...,2.4,2.3,2.0,2.4,2.4,2.7,1.9,2.4,-,
1,152.0,- 3 õ,Afghanistan,,Asia,28.6,-,-,"$5,080",-,...,1.9,1.8,1.6,2.2,2.1,1.9,1.5,1.8,-,
2,153.0,- 3 ö,Mauritania,,Sub-Saharan Africa,28.5,-,-,"$16,820",-,...,2.6,2.6,2.5,2.6,2.5,3.4,1.9,2.5,-,
3,,,,,,,,,,,...,,,,,,,,,,
4,154.0,118 1 ÷,Trinidad and Tobago,,LATAM & Caribbean,28.1,-5.4,33.5,"$14,600",-1.7%,...,2.9,3.0,2.2,2.6,2.4,3.3,2.0,2.6,-,
5,155.0,- 3 ø,Guyana,,LATAM & Caribbean,28.0,-,-,"$9,260",-,...,2.8,2.7,2.2,2.6,2.4,3.0,2.1,2.6,-,
6,156.0,- 3 ù,Papua New Guinea,,Oceania,27.9,-,-,"$11,660",-,...,2.7,2.5,2.1,2.4,2.5,3.1,2.1,2.7,-,
7,157.0,- 3 ú,Togo,,Sub-Saharan Africa,27.7,-,-,"$4,270",-,...,2.7,2.5,2.1,2.5,2.4,3.0,2.0,2.5,-,
8,,,,,,,,,,,...,,,,,,,,,,
9,158.0,- 3 û,Gambia,,Sub-Saharan Africa,27.7,-,-,"$1,300",-,...,2.6,2.5,2.3,2.5,2.6,3.2,1.9,2.5,-,


In [90]:
def clean_df_2024_page_6(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the DataFrame with these steps:
      1. Drop Unnamed:3, Unnamed:11, Unnamed:23
      2. Strip $ (and commas) from Unnamed:8 & Unnamed:10
      3. Strip % from Unnamed:9
      4. Drop rows where Unnamed:4 is NaN
      5. Set all Unnamed:1 to '-' except row 3 → 118
      6. Split 'Nation Brand' into two numeric parts:
         a) remove all letters
         b) remove any '2023'
         c) split into two floats
         d) replace 'Nation Brand' with the two new columns,
            warning on any that don’t cleanly split
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 3', 'Unnamed: 11', 'Unnamed: 23'], inplace=True)

    # 2. strip $ and commas from Unnamed:8 & Unnamed:10
    for c in ['Unnamed: 8', 'Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:9
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 4. drop rows missing Unnamed:4
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 5. override Unnamed:1
    df['Unnamed: 1'] = '-'
    if 3 in df.index:
        df.at[3, 'Unnamed: 1'] = '118'

    # 6. split 'Nation Brand'
    orig_nb = df['Nation Brand'].astype(str).copy()
    # a) remove all letters
    step1 = orig_nb.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) collapse whitespace
    cleaned = step2.str.strip()

    # d) split into two parts and convert
    parts = cleaned.str.split(r'\s+', n=1, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')

    # find failures
    bad = []
    for i, txt in orig_nb.items():
        toks = cleaned.iat[i].split()
        if len(toks) != 2 or pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i]):
            bad.append((i, txt))

    # manual overrides
    overrides = {
        11: (3.2, 3.9),
        12: (2.5, 2.8),
        14: (2.1, 2.5),
        24: (2.0, 2.7),
    }
    for i, (v1, v2) in overrides.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            bad = [b for b in bad if b[0] != i]

    # replace the original column in-place
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)

    # report failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into two numeric parts")

    # 7. rename to the final schema
    df.columns = [
        'Rank 2024',
        'Rank 2023',
        'Nation Brand',
        'Region',
        'Index Score 2024',
        'Index Score Change',
        'Index Score 2023',
        'Brand Value 2024 (USD mn)',
        'Brand Value Change',
        'Brand Value 2023 (USD mn)',
        'Familiarity',
        'Influence',
        'Reputation',
        'Business & Trade',
        'Culture & Heritage',
        'Governance',
        'International Relations',
        'Media & Communication',
        'People & Values',
        'Education & Science',
        'Sustainable Future',
        'Medals'
    ]

    return df

In [91]:
df_2024_page_6 = clean_df_2024_page_6(dfs2024[6])
df_2024_page_6

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,151.0,-,South Sudan,Sub-Saharan Africa,28.6,-,-,4950,-,-,...,4.4,2.4,2.3,2.0,2.4,2.4,2.7,1.9,2.4,-
1,152.0,-,Afghanistan,Asia,28.6,-,-,5080,-,-,...,3.8,1.9,1.8,1.6,2.2,2.1,1.9,1.5,1.8,-
2,153.0,-,Mauritania,Sub-Saharan Africa,28.5,-,-,16820,-,-,...,4.6,2.6,2.6,2.5,2.6,2.5,3.4,1.9,2.5,-
3,154.0,118,Trinidad and Tobago,LATAM & Caribbean,28.1,-5.4,33.5,14600,-1.7,14850,...,4.7,2.9,3.0,2.2,2.6,2.4,3.3,2.0,2.6,-
4,155.0,-,Guyana,LATAM & Caribbean,28.0,-,-,9260,-,-,...,4.6,2.8,2.7,2.2,2.6,2.4,3.0,2.1,2.6,-
5,156.0,-,Papua New Guinea,Oceania,27.9,-,-,11660,-,-,...,4.7,2.7,2.5,2.1,2.4,2.5,3.1,2.1,2.7,-
6,157.0,-,Togo,Sub-Saharan Africa,27.7,-,-,4270,-,-,...,4.4,2.7,2.5,2.1,2.5,2.4,3.0,2.0,2.5,-
7,158.0,-,Gambia,Sub-Saharan Africa,27.7,-,-,1300,-,-,...,4.5,2.6,2.5,2.3,2.5,2.6,3.2,1.9,2.5,-
8,159.0,-,Equatorial Guinea,Sub-Saharan Africa,27.6,-,-,3200,-,-,...,4.5,2.7,2.5,2.3,2.5,2.4,2.9,2.0,2.6,-
9,160.0,-,Malawi,Sub-Saharan Africa,27.5,-,-,10190,-,-,...,4.5,2.5,2.4,2.2,2.4,2.4,3.2,1.8,2.5,-


In [92]:
combined2024 = pd.concat([df_2024_page_0, df_2024_page_1, df_2024_page_2, df_2024_page_3, df_2024_page_4, df_2024_page_5, df_2024_page_6], 
                     axis=0,            # stack rows
                     ignore_index=True) # reset the row index

In [93]:
combined2024

Unnamed: 0,Rank 2024,Rank 2023,Nation Brand,Region,Index Score 2024,Index Score Change,Index Score 2023,Brand Value 2024 (USD mn),Brand Value Change,Brand Value 2023 (USD mn),...,Reputation,Business & Trade,Culture & Heritage,Governance,International Relations,Media & Communication,People & Values,Education & Science,Sustainable Future,Medals
0,1.0,1.0,USA,North America,78.8,+4.0,74.8,32271140,+6.5,30309110,...,7.3,8.9,7.4,6.1,8.7,6.6,4.4,8.4,6.1,17
1,2.0,2.0,United Kingdom,Europe,71.8,+4.5,67.3,4036790,-15.8,4796830,...,7.6,8.3,6.9,6.6,8.3,5.9,4.9,6.2,6.2,12
2,3.0,5.0,China,Asia,71.2,+6.2,65.0,19960020,-13.5,23085110,...,6.8,8.8,5.7,4.8,7.1,4.4,3.6,7.6,5.5,8
3,4.0,4.0,Japan,Asia,70.6,+5.4,65.2,4406090,-1.0,4448780,...,7.7,8.9,6.8,6.5,7.0,4.7,5.5,7.9,7.1,13
4,5.0,3.0,Germany,Europe,69.8,+4.0,65.8,4985350,-1.8,5075970,...,7.5,8.5,5.8,6.6,7.9,5.0,4.8,6.6,6.8,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,171.0,-,Haiti,LATAM & Caribbean,26.6,-,-,8550,-,-,...,4.5,2.1,2.4,1.8,2.1,2.1,2.9,1.7,2.2,-
171,172.0,-,Saint Lucia,LATAM & Caribbean,26.3,-,-,1760,-,-,...,4.1,3.0,2.6,2.3,2.5,2.4,2.9,2.2,2.5,-
172,173.0,-,Burundi,Sub-Saharan Africa,26.2,-,-,3110,-,-,...,4.1,2.6,2.2,2.1,2.5,2.4,2.8,1.8,2.4,-
173,174.0,-,São Tomé and Príncipe,Sub-Saharan Africa,26.1,-,-,400,-,-,...,4.0,2.7,2.4,2.3,2.4,2.3,2.7,1.9,2.4,-


### 2025 ###

In [94]:
# define the PDF, page and area coordinates
pdf_path = "https://static.brandirectory.com/reports/brand-finance-soft-power-index-2025-digital.pdf"
page_number = "57-64"


dfs2025 = []
area = [100,  25, 900, 1500]  
for page in range(57, 65): 
    tables = tabula.read_pdf(
        pdf_path,
        pages=page,
        area=area,
        guess=False,
        stream=True,
        multiple_tables=False
    )
    # always one table in list
    dfs2025.append(tables[0])

In [95]:
dfs2025[0].iloc[0]

Unnamed: 0                1.0
Unnamed: 1              1 0 ö
Unnamed: 2      United States
Unnamed: 3                NaN
Unnamed: 4      North America
Unnamed: 5               79.5
Unnamed: 6               +0.7
Unnamed: 7               78.8
Unnamed: 8        $37,328,559
Unnamed: 9             +15.7%
Unnamed: 10       $32,271,140
Unnamed: 11               NaN
Nation Brand    9.4  7.2  8.0
Unnamed: 13               9.2
Unnamed: 14               8.9
Unnamed: 15               9.2
Unnamed: 16               7.3
Unnamed: 17               5.8
Unnamed: 18               6.8
Unnamed: 19               6.0
Unnamed: 20               4.4
Unnamed: 21              15.0
Unnamed: 22               NaN
Unnamed: 23               NaN
Name: 0, dtype: object

In [96]:
dfs2025[0]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,1.0,1 0 ö,United States,,North America,79.5,+0.7,78.8,"$37,328,559",+15.7%,...,8.9,9.2,7.3,5.8,6.8,6.0,4.4,15.0,,
1,2.0,3 2 ÷,China,,Asia,72.8,+1.6,71.2,"$20,530,402",+2.9%,...,7.5,8.6,6.1,5.1,4.6,5.7,4.0,9.0,,
2,3.0,2 1 ø,United Kingdom,,Europe,72.4,+0.6,71.8,"$4,446,486",+10.1%,...,8.3,7.3,6.9,6.4,6.0,6.1,5.0,14.0,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,4 0 ù,Japan,,Asia,71.5,+0.9,70.6,"$4,210,121",-4.4%,...,7.0,8.7,7.0,6.5,4.6,7.2,5.7,14.0,,
5,5.0,5 0 ú,Germany,,Europe,70.1,+0.3,69.8,"$5,000,069",+0.3%,...,7.9,7.5,5.8,6.4,4.8,6.7,4.9,8.0,,
6,6.0,6 0 û,France,,Europe,68.5,+1.2,67.3,"$3,886,788",+10.3%,...,7.7,6.2,8.0,5.3,5.1,5.7,4.8,6.0,,
7,7.0,7 0 ü,Canada,,North America,65.2,+0.8,64.4,"$2,729,373",+2.2%,...,6.8,6.3,5.1,6.4,4.6,6.3,5.8,14.0,,
8,,,,,,,,,,,...,,,,,,,,,,
9,8.0,8 0 ý,Switzerland,,Europe,64.9,+2.0,62.9,"$1,143,210",+7.3%,...,6.9,6.2,5.5,6.8,4.4,6.7,5.9,19.0,,


In [97]:
def clean_df_2025_page_0(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 page-0 DataFrame:
      1. Drop Unnamed:3, Unnamed:11, Unnamed:22, Unnamed:23
      2. Strip $ & commas from Unnamed:8 & Unnamed:10
      3. Strip % from Unnamed:9
      4. Keep only the first number in Unnamed:1
      5. Drop rows where Unnamed:4 is NaN
      6. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove "2023"
         c) split into three floats
         d) apply manual overrides for rows 9,11,12,14
    """
    df = df.copy()

    # 1.
    df.drop(columns=['Unnamed: 3','Unnamed: 11','Unnamed: 22','Unnamed: 23'],
            inplace=True)

    # 2.
    for c in ['Unnamed: 8','Unnamed: 10']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3.
    df['Unnamed: 9'] = df['Unnamed: 9'].astype(str).str.rstrip('%')

    # 4.
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5.
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 6. a) strip letters
    orig = df['Nation Brand'].astype(str)
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    #    b) remove "2023"
    step2 = step1.str.replace('2023', '', regex=False)
    #    c) collapse spaces
    cleaned = step2.str.replace(r'\s+', ' ', regex=True).str.strip()

    #    d) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # 6d) initial collection of split failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]

    # 6e) apply your manual overrides
    overrides = {
        9:  (6.6, 7.1, 5.9),
        11: (7.6, 6.8, 5.1),
        12: (8.4, 7.3, 5.4),
        14: (7.3, 7.3, 4.8),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # **NEW** 6f) remove the overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]

    # 6g) replace 'Nation Brand' with the three new columns
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # 6h) now print only the *true* remaining failures
    for i, raw in bad:
        print(f"Warning: row {i}, Nation Brand={raw!r} did not split into three numeric parts")

    # 7. override Unnamed:2 at row 18
    if 18 in df.index:
        df.at[18, 'Unnamed: 2'] = 'Belgium'
        df.at[18, 'Unnamed: 1'] = 20
    
    # 8. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df

In [98]:
df_2025_page_0 = clean_df_2025_page_0(dfs2025[0])
df_2025_page_0

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,1.0,1.0,United States,North America,79.5,+0.7,78.8,37328559,+15.7,32271140,...,8.0,9.2,8.9,9.2,7.3,5.8,6.8,6.0,4.4,15.0
1,2.0,3.0,China,Asia,72.8,+1.6,71.2,20530402,+2.9,19960020,...,7.6,8.8,7.5,8.6,6.1,5.1,4.6,5.7,4.0,9.0
2,3.0,2.0,United Kingdom,Europe,72.4,+0.6,71.8,4446486,+10.1,4036790,...,6.7,8.3,8.3,7.3,6.9,6.4,6.0,6.1,5.0,14.0
3,4.0,4.0,Japan,Asia,71.5,+0.9,70.6,4210121,-4.4,4406090,...,6.1,9.2,7.0,8.7,7.0,6.5,4.6,7.2,5.7,14.0
4,5.0,5.0,Germany,Europe,70.1,+0.3,69.8,5000069,+0.3,4985350,...,6.2,8.9,7.9,7.5,5.8,6.4,4.8,6.7,4.9,8.0
5,6.0,6.0,France,Europe,68.5,+1.2,67.3,3886788,+10.3,3522360,...,6.2,8.0,7.7,6.2,8.0,5.3,5.1,5.7,4.8,6.0
6,7.0,7.0,Canada,North America,65.2,+0.8,64.4,2729373,+2.2,2670820,...,5.7,7.9,6.8,6.3,5.1,6.4,4.6,6.3,5.8,14.0
7,8.0,8.0,Switzerland,Europe,64.9,+2.0,62.9,1143210,+7.3,1065370,...,5.3,8.7,6.9,6.2,5.5,6.8,4.4,6.7,5.9,19.0
8,9.0,9.0,Italy,Europe,62.4,+0.4,62.0,2403023,+3.3,2326270,...,5.5,7.3,6.0,4.8,8.1,4.5,4.3,4.8,5.5,7.0
9,10.0,10.0,United Arab Emirates,MENA,60.4,+0.7,59.7,1223085,+15.2,1061770,...,5.9,7.8,6.3,5.3,4.6,5.4,4.2,5.2,4.6,1.0


In [99]:
dfs2025[1].iloc[0]

Unnamed: 0                 26
Unnamed: 1             25 1 +
Unnamed: 2            Türkiye
Unnamed: 3             Europe
Unnamed: 4               52.9
Unnamed: 5               -0.8
Unnamed: 6               53.7
Unnamed: 7           $455,253
Unnamed: 8              -8.3%
Unnamed: 9           $496,420
Unnamed: 10                 D
Nation Brand    7.4  6.5  5.3
Unnamed: 12               5.6
Unnamed: 13               5.3
Unnamed: 14               3.8
Unnamed: 15               5.6
Unnamed: 16               3.8
Unnamed: 17               3.7
Unnamed: 18               3.6
Unnamed: 19               4.3
Unnamed: 20               0.0
Unnamed: 21               NaN
Unnamed: 22               NaN
Name: 0, dtype: object

In [100]:
dfs2025[1]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,26,25 1 +,Türkiye,Europe,52.9,-0.8,53.7,"$455,253",-8.3%,"$496,420",...,5.3,3.8,5.6,3.8,3.7,3.6,4.3,0.0,,
1,27,"27 0 ,",Portugal,Europe,51.1,+1.0,50.1,"$295,884",+11.9%,"$264,410",...,4.6,3.8,5.4,4.1,3.7,4.3,5.1,0.0,,
2,28,28 0 -,Ireland,Europe,50.5,+0.6,49.9,"$742,827",-13.1%,"$854,400",...,4.8,4.5,4.7,4.7,3.7,5.0,5.2,0.0,,
3,29,30 2 .,Luxembourg,Europe,50.0,+1.0,49.0,"$135,448",+7.4%,"$126,120",...,5.2,4.8,4.2,5.4,3.7,5.5,4.9,0.0,,
4,30,29 1 /,India,Asia,49.8,-,49.8,"$2,770,819",-5.9%,"$2,944,480",...,4.4,4.6,5.6,2.7,3.1,2.8,3.5,1.0,,
5,31,31 0 0,Brazil,LATAM,48.8,-,48.8,"$974,485",+6.9%,"$911,210",...,4.2,3.1,5.4,2.8,3.5,3.5,4.7,2.0,,
6,32,33 2 1,Poland,Europe,48.7,+0.1,48.6,"$1,061,190",+22.9%,"$863,580",...,4.9,4.2,4.2,4.3,3.5,4.4,4.6,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,33,32 1 2,Israel,MENA,47.8,-0.9,48.7,"$409,585",-1.2%,"$414,380",...,4.9,5.1,3.2,3.2,3.7,3.3,2.9,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [101]:
def clean_df_2025_page_1(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 page‑1 DataFrame:
      1. Drop Unnamed:10, Unnamed:21, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,24
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 46
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]

    # f) manual overrides for known rows
    overrides = {
        9:  (5.8, 6.9, 3.5),
        11: (4.8, 6.8, 3.5),
        12: (7.7, 6.2, 4.3),
        24: (5.9, 6.0, 3.8),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]

    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 46
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 46.0

    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df

In [102]:
df_2025_page_1 = clean_df_2025_page_1(dfs2025[1])
df_2025_page_1

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,26.0,25.0,Türkiye,Europe,52.9,-0.8,53.7,455253,-8.3,496420,...,5.3,5.6,5.3,3.8,5.6,3.8,3.7,3.6,4.3,0.0
1,27.0,27.0,Portugal,Europe,51.1,+1.0,50.1,295884,11.9,264410,...,4.5,5.4,4.6,3.8,5.4,4.1,3.7,4.3,5.1,0.0
2,28.0,28.0,Ireland,Europe,50.5,+0.6,49.9,742827,-13.1,854400,...,4.0,6.1,4.8,4.5,4.7,4.7,3.7,5.0,5.2,0.0
3,29.0,30.0,Luxembourg,Europe,50.0,+1.0,49.0,135448,7.4,126120,...,4.1,6.6,5.2,4.8,4.2,5.4,3.7,5.5,4.9,0.0
4,30.0,29.0,India,Asia,49.8,-,49.8,2770819,-5.9,2944480,...,5.2,4.6,4.4,4.6,5.6,2.7,3.1,2.8,3.5,1.0
5,31.0,31.0,Brazil,LATAM,48.8,-,48.8,974485,6.9,911210,...,4.7,4.6,4.2,3.1,5.4,2.8,3.5,3.5,4.7,2.0
6,32.0,33.0,Poland,Europe,48.7,+0.1,48.6,1061190,22.9,863580,...,4.3,5.3,4.9,4.2,4.2,4.3,3.5,4.4,4.6,0.0
7,33.0,32.0,Israel,MENA,47.8,-0.9,48.7,409585,-1.2,414380,...,5.0,4.6,4.9,5.1,3.2,3.2,3.7,3.3,2.9,0.0
8,34.0,36.0,Greece,Europe,46.8,+1.2,45.6,140080,18.3,118440,...,4.0,4.5,4.0,3.4,6.0,3.5,3.1,3.9,4.8,2.0
9,35.0,34.0,Iceland,Europe,46.2,+0.4,45.8,33920,4.0,32600,...,3.5,5.5,4.2,4.3,4.0,5.0,3.4,5.5,5.1,0.0


In [103]:
dfs2025[2].iloc[0]

Unnamed: 0                 51
Unnamed: 1             51 0 ]
Unnamed: 2            Bahrain
Unnamed: 3               MENA
Unnamed: 4               40.4
Unnamed: 5               +0.4
Unnamed: 6               40.0
Unnamed: 7            $28,362
Unnamed: 8              +3.3%
Unnamed: 9            $27,450
Unnamed: 10                 v
Nation Brand    3.6  5.8  3.8
Unnamed: 12               4.8
Unnamed: 13               4.2
Unnamed: 14               3.3
Unnamed: 15               3.2
Unnamed: 16               3.8
Unnamed: 17               3.2
Unnamed: 18               3.7
Unnamed: 19               4.1
Unnamed: 20               0.0
Unnamed: 21               NaN
Unnamed: 22               NaN
Name: 0, dtype: object

In [104]:
dfs2025[2]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,51,51 0 ],Bahrain,MENA,40.4,+0.4,40.0,"$28,362",+3.3%,"$27,450",...,4.2,3.3,3.2,3.8,3.2,3.7,4.1,0.0,,
1,52,53 2 ̂,Vietnam,Asia,39.9,+0.3,39.6,"$519,623",+2.5%,"$507,060",...,3.3,3.0,3.7,2.8,2.9,3.0,3.9,0.0,,
2,53,52 1 _,Philippines,Asia,39.9,+0.1,39.8,"$628,149",+19.5%,"$525,620",...,3.3,2.9,3.9,2.7,3.0,3.2,4.4,0.0,,
3,54,59 2 ̀,Chile,LATAM,39.5,+0.7,38.8,"$284,641",+4.6%,"$272,150",...,3.5,3.1,3.9,3.0,3.2,3.5,4.2,0.0,,
4,55,58 2 a,Romania,Europe,39.5,+0.7,38.8,"$232,608",+7.7%,"$215,950",...,3.6,3.3,3.7,3.2,3.0,3.5,3.9,0.0,,
5,56,56 0 b,Slovenia,Europe,39.5,+0.5,39.0,"$99,161",+8.7%,"$91,220",...,3.7,3.4,3.5,3.5,3.3,4.0,4.2,0.0,,
6,57,55 1 c,Maldives,Asia,39.3,+0.1,39.2,"$3,650",-,"$3,650",...,3.1,2.8,4.0,3.0,3.2,3.6,4.7,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,58,63 2 d,Jordan,MENA,39.1,+0.6,38.5,"$30,400",+9.3%,"$27,810",...,4.0,3.1,3.5,3.4,3.2,3.2,3.9,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [105]:
def clean_df_2025_page_2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 page‑1 DataFrame:
      1. Drop Unnamed:10, Unnamed:21, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,24
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 46
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]


    # f) manual overrides for known rows
    overrides = {
        9:  (4.3, 6.1, 3.6),
        11: (6.6, 4.6, 4.3),
        12: (4.0, 6.2, 3.5),
        14: (3.4, 5.8, 3.4),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]
    
    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 75
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 75.0

    
    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [106]:
df_2025_page_2 = clean_df_2025_page_2(dfs2025[2])
df_2025_page_2

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,51.0,51.0,Bahrain,MENA,40.4,+0.4,40.0,28362,+3.3,27450,...,3.8,4.8,4.2,3.3,3.2,3.8,3.2,3.7,4.1,0.0
1,52.0,53.0,Vietnam,Asia,39.9,+0.3,39.6,519623,+2.5,507060,...,3.7,4.0,3.3,3.0,3.7,2.8,2.9,3.0,3.9,0.0
2,53.0,52.0,Philippines,Asia,39.9,+0.1,39.8,628149,+19.5,525620,...,3.6,3.7,3.3,2.9,3.9,2.7,3.0,3.2,4.4,0.0
3,54.0,59.0,Chile,LATAM,39.5,+0.7,38.8,284641,+4.6,272150,...,3.5,4.0,3.5,3.1,3.9,3.0,3.2,3.5,4.2,0.0
4,55.0,58.0,Romania,Europe,39.5,+0.7,38.8,232608,+7.7,215950,...,3.5,3.8,3.6,3.3,3.7,3.2,3.0,3.5,3.9,0.0
5,56.0,56.0,Slovenia,Europe,39.5,+0.5,39.0,99161,+8.7,91220,...,3.6,4.2,3.7,3.4,3.5,3.5,3.3,4.0,4.2,0.0
6,57.0,55.0,Maldives,Asia,39.3,+0.1,39.2,3650,-,3650,...,3.4,4.0,3.1,2.8,4.0,3.0,3.2,3.6,4.7,0.0
7,58.0,63.0,Jordan,MENA,39.1,+0.6,38.5,30400,+9.3,27810,...,3.7,3.9,4.0,3.1,3.5,3.4,3.2,3.2,3.9,0.0
8,59.0,54.0,Georgia,Europe,39.1,-0.2,39.3,26440,+11.7,23660,...,3.5,3.9,3.6,3.3,4.1,3.2,3.3,3.6,4.3,0.0
9,60.0,60.0,Slovakia,Europe,39.0,+0.2,38.8,133588,+3.9,128560,...,3.6,3.9,3.5,3.5,3.4,3.3,3.0,3.7,4.0,0.0


In [107]:
dfs2025[3].iloc[0]

Unnamed: 0                 76
Unnamed: 1               71 1
Unnamed: 2             Latvia
Unnamed: 3             Europe
Unnamed: 4               36.5
Unnamed: 5               -0.7
Unnamed: 6               37.2
Unnamed: 7            $46,092
Unnamed: 8              +6.4%
Unnamed: 9            $43,300
Unnamed: 10                 ̈
Nation Brand    3.3  5.6  3.4
Unnamed: 12               3.9
Unnamed: 13               3.6
Unnamed: 14               3.2
Unnamed: 15               3.1
Unnamed: 16               3.4
Unnamed: 17               3.0
Unnamed: 18               3.8
Unnamed: 19               3.8
Unnamed: 20               0.0
Unnamed: 21               NaN
Unnamed: 22               NaN
Name: 0, dtype: object

In [108]:
dfs2025[3]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,76,71 1,Latvia,Europe,36.5,-0.7,37.2,"$46,092",+6.4%,"$43,300",...,3.6,3.2,3.1,3.4,3.0,3.8,3.8,0.0,,
1,77,79 2,Nigeria,Sub-Saharan Africa,36.4,+0.1,36.3,"$140,900",-14.9%,"$165,640",...,3.0,2.4,3.3,2.1,3.0,2.6,3.3,0.0,,
2,78,73 1,Algeria,MENA,36.4,-0.4,36.8,"$105,935",+21.8%,"$86,940",...,3.5,2.7,3.2,2.9,2.9,3.0,3.8,0.0,,
3,79,77 1,Tunisia,MENA,36.3,-0.3,36.6,"$22,893",+21.3%,"$18,880",...,3.1,2.7,3.7,2.7,2.9,3.0,4.0,0.0,,
4,80,87 2,Belarus,Europe,36.2,+1.2,35.0,"$35,550",-19.9%,"$44,360",...,3.5,3.2,3.0,3.0,3.0,3.1,3.6,0.0,,
5,81,84 2,Azerbaijan,Europe,36.0,+0.7,35.3,"$41,410",+4.2%,"$39,750",...,3.5,2.8,3.6,3.2,2.9,3.1,4.0,0.0,,
6,82,117 2,El Salvador,LATAM,35.8,+3.2,32.6,"$13,210",+1.3%,"$13,040",...,3.7,2.9,3.1,3.8,3.4,3.2,4.0,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,83,83 0,Dominican Republic,LATAM,35.8,+0.3,35.5,"$104,930",+11.1%,"$94,410",...,3.2,2.7,3.8,2.8,3.0,2.9,4.3,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [109]:
def clean_df_2025_page_3(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 page‑1 DataFrame:
      1. Drop Unnamed:10, Unnamed:21, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,14
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 108
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]


    # f) manual overrides for known rows
    overrides = {
        9:  (2.5, 5.6, 2.8),
        11: (4.1, 5.5, 3.4),
        12: (6.2, 4.8, 3.6),
        14: (4.3, 5.4, 3.4),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
           nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]
    
    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 108
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 108.0

    
    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [110]:
df_2025_page_3 = clean_df_2025_page_3(dfs2025[3])
df_2025_page_3

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,76.0,71.0,Latvia,Europe,36.5,-0.7,37.2,46092,6.4,43300,...,3.4,3.9,3.6,3.2,3.1,3.4,3.0,3.8,3.8,0.0
1,77.0,79.0,Nigeria,Sub-Saharan Africa,36.4,+0.1,36.3,140900,-14.9,165640,...,3.8,2.9,3.0,2.4,3.3,2.1,3.0,2.6,3.3,0.0
2,78.0,73.0,Algeria,MENA,36.4,-0.4,36.8,105935,21.8,86940,...,3.5,3.2,3.5,2.7,3.2,2.9,2.9,3.0,3.8,0.0
3,79.0,77.0,Tunisia,MENA,36.3,-0.3,36.6,22893,21.3,18880,...,3.5,3.3,3.1,2.7,3.7,2.7,2.9,3.0,4.0,0.0
4,80.0,87.0,Belarus,Europe,36.2,+1.2,35.0,35550,-19.9,44360,...,3.5,3.6,3.5,3.2,3.0,3.0,3.0,3.1,3.6,0.0
5,81.0,84.0,Azerbaijan,Europe,36.0,+0.7,35.3,41410,4.2,39750,...,3.5,3.5,3.5,2.8,3.6,3.2,2.9,3.1,4.0,0.0
6,82.0,117.0,El Salvador,LATAM,35.8,+3.2,32.6,13210,1.3,13040,...,3.3,3.5,3.7,2.9,3.1,3.8,3.4,3.2,4.0,0.0
7,83.0,83.0,Dominican Republic,LATAM,35.8,+0.3,35.5,104930,11.1,94410,...,3.2,3.5,3.2,2.7,3.8,2.8,3.0,2.9,4.3,0.0
8,84.0,78.0,Paraguay,LATAM,35.5,-0.8,36.3,37620,3.7,36280,...,3.3,3.4,3.2,2.9,3.1,2.7,2.9,3.0,3.9,0.0
9,85.0,82.0,Liechtenstein,Europe,35.4,-0.1,35.5,9320,6.0,8790,...,2.8,5.1,3.5,3.3,2.8,4.2,2.7,3.9,3.8,0.0


In [111]:
dfs2025[4].iloc[0]

Unnamed: 0                101
Unnamed: 1            105 2 Á
Unnamed: 2              Nepal
Unnamed: 3               Asia
Unnamed: 4               33.5
Unnamed: 5               -0.3
Unnamed: 6               33.8
Unnamed: 7            $28,450
Unnamed: 8              +2.1%
Unnamed: 9            $27,870
Unnamed: 10                 Ú
Nation Brand    4.6  5.6  3.0
Unnamed: 12               2.8
Unnamed: 13               2.9
Unnamed: 14               2.4
Unnamed: 15               3.4
Unnamed: 16               2.6
Unnamed: 17               2.7
Unnamed: 18               2.9
Unnamed: 19               4.2
Unnamed: 20               0.0
Unnamed: 21               NaN
Unnamed: 22               NaN
Name: 0, dtype: object

In [112]:
dfs2025[4]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,101,105 2 Á,Nepal,Asia,33.5,-0.3,33.8,"$28,450",+2.1%,"$27,870",...,2.9,2.4,3.4,2.6,2.7,2.9,4.2,0.0,,
1,102,94 1 Â,Senegal,Sub-Saharan Africa,33.5,-1.1,34.6,"$17,250",+22.0%,"$14,140",...,3.1,2.4,3.2,2.6,2.7,2.8,3.8,0.0,,
2,103,100 1 Ã,Bolivia,LATAM,33.4,-0.8,34.2,"$19,790",+1.2%,"$19,560",...,3.1,2.6,3.0,2.4,2.9,2.9,3.7,0.0,,
3,104,96 1 Ä,Bangladesh,Asia,33.4,-1.2,34.6,"$404,200",-23.7%,"$529,750",...,2.8,2.3,2.6,2.2,2.6,2.5,3.1,0.0,,
4,105,97 1 Å,Mauritius,Sub-Saharan Africa,33.3,-1.0,34.3,"$14,720",+18.5%,"$12,420",...,3.0,2.8,3.4,2.8,3.0,3.2,4.0,0.0,,
5,106,103 1 Æ,Madagascar,Sub-Saharan Africa,32.8,-1.1,33.9,"$8,520",+4.5%,"$8,150",...,2.9,2.4,3.3,2.5,2.5,3.2,4.1,0.0,,
6,107,102 1 Ç,Montenegro,Europe,32.6,-1.4,34.0,"$4,600",+18.3%,"$3,890",...,3.1,2.7,3.3,2.7,2.9,3.0,3.7,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,108,122 2 È,Mongolia,Asia,32.4,0.4,32.0,"$13,630",+1.5%,"$13,430",...,2.8,2.4,3.1,2.6,2.7,2.9,3.7,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [113]:
def clean_df_2025_page_4(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 page‑1 DataFrame:
      1. Drop Unnamed:10, Unnamed:21, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,14
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 109
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]


    # f) manual overrides for known rows
    overrides = {
        9:  (4.2, 5.4, 3.2),
        11: (3.4, 5.4, 3.1),
        12: (4.3, 5.2, 3.0),
        14: (3.5, 5.2, 3.2),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
           nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]
    
    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 109
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 109.0

    
    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [114]:
df_2025_page_4 = clean_df_2025_page_4(dfs2025[4])
df_2025_page_4

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,101.0,105.0,Nepal,Asia,33.5,-0.3,33.8,28450,2.1,27870.0,...,3.0,2.8,2.9,2.4,3.4,2.6,2.7,2.9,4.2,0.0
1,102.0,94.0,Senegal,Sub-Saharan Africa,33.5,-1.1,34.6,17250,22.0,14140.0,...,3.3,2.8,3.1,2.4,3.2,2.6,2.7,2.8,3.8,0.0
2,103.0,100.0,Bolivia,LATAM,33.4,-0.8,34.2,19790,1.2,19560.0,...,3.2,2.9,3.1,2.6,3.0,2.4,2.9,2.9,3.7,0.0
3,104.0,96.0,Bangladesh,Asia,33.4,-1.2,34.6,404200,-23.7,529750.0,...,3.4,3.0,2.8,2.3,2.6,2.2,2.6,2.5,3.1,0.0
4,105.0,97.0,Mauritius,Sub-Saharan Africa,33.3,-1.0,34.3,14720,18.5,12420.0,...,3.0,3.3,3.0,2.8,3.4,2.8,3.0,3.2,4.0,0.0
5,106.0,103.0,Madagascar,Sub-Saharan Africa,32.8,-1.1,33.9,8520,4.5,8150.0,...,2.9,2.9,2.9,2.4,3.3,2.5,2.5,3.2,4.1,0.0
6,107.0,102.0,Montenegro,Europe,32.6,-1.4,34.0,4600,18.3,3890.0,...,3.0,3.1,3.1,2.7,3.3,2.7,2.9,3.0,3.7,0.0
7,108.0,122.0,Mongolia,Asia,32.4,0.4,32.0,13630,1.5,13430.0,...,2.8,2.8,2.8,2.4,3.1,2.6,2.7,2.9,3.7,0.0
8,109.0,104.0,Rwanda,Sub-Saharan Africa,32.3,-1.6,33.9,7450,-1.1,7530.0,...,3.3,2.8,3.0,2.3,2.7,2.8,2.5,3.3,3.4,0.0
9,110.0,110.0,Cameroon,Sub-Saharan Africa,32.3,-0.9,33.2,17730,10.1,16100.0,...,3.2,2.6,2.8,2.2,2.8,2.2,2.6,2.5,3.5,0.0


In [115]:
dfs2025[5].iloc[0]

Unnamed: 0                126
Unnamed: 1            118 1 ó
Unnamed: 2            Moldova
Unnamed: 3             Europe
Unnamed: 4               30.6
Unnamed: 5               -1.8
Unnamed: 6               32.4
Unnamed: 7            $13,210
Unnamed: 8              +7.4%
Unnamed: 9            $12,300
Nation Brand    2.9  5.0  2.9
Unnamed: 11               2.9
Unnamed: 12               2.9
Unnamed: 13               2.5
Unnamed: 14               2.8
Unnamed: 15               2.5
Unnamed: 16               2.7
Unnamed: 17               2.7
Unnamed: 18               3.4
Unnamed: 19               0.0
Unnamed: 20               NaN
Unnamed: 21               NaN
Name: 0, dtype: object

In [116]:
dfs2025[5]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,126,118 1 ó,Moldova,Europe,30.6,-1.8,32.4,"$13,210",+7.4%,"$12,300",...,2.9,2.5,2.8,2.5,2.7,2.7,3.4,0.0,,
1,127,129 2,Syria,MENA,30.5,-0.7,31.2,"$4,680",+3.3%,"$4,530",...,2.2,1.8,2.4,1.6,2.4,1.8,2.6,0.0,,
2,128,116 1 ô,Angola,Sub-Saharan Africa,30.4,-2.3,32.7,"$47,440",+18.2%,"$40,130",...,2.7,2.1,2.6,2.2,2.6,2.6,3.3,0.0,,
3,129,125 1 õ,San Marino,Europe,30.4,-1.1,31.5,$810,+22.7%,$660,...,2.8,2.6,3.0,3.1,2.6,3.1,3.6,0.0,,
4,130,140 2 ö,Mali,Sub-Saharan Africa,30.4,+0.4,30.0,"$6,320",-5.7%,"$6,700",...,2.6,2.1,2.7,2.2,2.6,2.4,3.6,0.0,,
5,131,120 1 ÷,Honduras,LATAM,30.3,-1.9,32.2,"$24,810",+6.1%,"$23,390",...,2.8,2.3,2.7,2.2,2.7,2.6,3.4,0.0,,
6,132,135 2 ø,Fiji,Oceania,30.2,-0.3,30.5,"$3,140",+3.0%,"$3,050",...,2.7,2.5,3.2,2.5,2.6,3.0,3.9,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,133,139 2 ù,Libya,MENA,30.1,-,30.1,"$15,950",+11.1%,"$14,360",...,2.5,2.0,2.1,1.9,2.3,2.1,2.8,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [117]:
def clean_df_2025_page_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 DataFrame:
      1. Drop  Unnamed:20, Unnamed:21
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,14
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 143
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 20', 'Unnamed: 21'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]


    # f) manual overrides for known rows
    overrides = {
        9:  (2.6, 5.2, 2.6),
        11: (3.3, 4.9, 3.0),
        12: (4.0, 4.8, 3.2),
        14: (2.8, 5.1, 2.6),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
           nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]
    
    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 143
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 143.0

    
    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [118]:
df_2025_page_5 = clean_df_2025_page_5(dfs2025[5])
df_2025_page_5

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,126.0,118.0,Moldova,Europe,30.6,-1.8,32.4,13210,+7.4,12300,...,2.9,2.9,2.9,2.5,2.8,2.5,2.7,2.7,3.4,0.0
1,127.0,129.0,Syria,MENA,30.5,-0.7,31.2,4680,+3.3,4530,...,3.4,1.9,2.2,1.8,2.4,1.6,2.4,1.8,2.6,0.0
2,128.0,116.0,Angola,Sub-Saharan Africa,30.4,-2.3,32.7,47440,+18.2,40130,...,3.1,2.6,2.7,2.1,2.6,2.2,2.6,2.6,3.3,0.0
3,129.0,125.0,San Marino,Europe,30.4,-1.1,31.5,810,+22.7,660,...,2.5,3.7,2.8,2.6,3.0,3.1,2.6,3.1,3.6,0.0
4,130.0,140.0,Mali,Sub-Saharan Africa,30.4,+0.4,30.0,6320,-5.7,6700,...,3.1,2.4,2.6,2.1,2.7,2.2,2.6,2.4,3.6,0.0
5,131.0,120.0,Honduras,LATAM,30.3,-1.9,32.2,24810,+6.1,23390,...,2.9,2.8,2.8,2.3,2.7,2.2,2.7,2.6,3.4,0.0
6,132.0,135.0,Fiji,Oceania,30.2,-0.3,30.5,3140,+3.0,3050,...,2.5,3.1,2.7,2.5,3.2,2.5,2.6,3.0,3.9,0.0
7,133.0,139.0,Libya,MENA,30.1,-,30.1,15950,+11.1,14360,...,3.2,2.2,2.5,2.0,2.1,1.9,2.3,2.1,2.8,0.0
8,134.0,138.0,Turkmenistan,Asia,30.1,-0.1,30.2,44520,-6.7,47720,...,3.0,2.7,2.9,2.4,2.8,2.6,2.5,2.8,3.5,0.0
9,135.0,137.0,Barbados,LATAM,30.0,-0.2,30.2,4110,-25.4,5510,...,2.6,3.2,2.8,2.3,3.3,2.5,2.5,2.7,3.8,0.0


In [119]:
dfs2025[6].iloc[0]

Unnamed: 0                      151
Unnamed: 1                  154 2 '
Unnamed: 2      Trinidad and Tobago
Unnamed: 3                    LATAM
Unnamed: 4                     27.6
Unnamed: 5                     -0.5
Unnamed: 6                     28.1
Unnamed: 7                  $14,560
Unnamed: 8                    -0.3%
Unnamed: 9                  $14,600
Unnamed: 10                       @
Nation Brand          2.5  4.7  2.5
Unnamed: 12                     2.7
Unnamed: 13                     2.5
Unnamed: 14                     2.2
Unnamed: 15                     3.1
Unnamed: 16                     2.3
Unnamed: 17                     2.4
Unnamed: 18                     2.4
Unnamed: 19                     3.5
Unnamed: 20                     0.0
Unnamed: 21                     NaN
Unnamed: 22                     NaN
Name: 0, dtype: object

In [120]:
dfs2025[6]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,151,154 2 ',Trinidad and Tobago,LATAM,27.6,-0.5,28.1,"$14,560",-0.3%,"$14,600",...,2.5,2.2,3.1,2.3,2.4,2.4,3.5,0.0,,
1,152,136 1 (,Myanmar,Asia,27.6,-2.7,30.3,"$32,850",-9.4%,"$36,260",...,2.2,1.8,2.3,1.8,2.3,2.1,2.6,0.0,,
2,153,152 1 ),Afghanistan,Asia,27.5,-1.1,28.6,"$5,570",+9.6%,"$5,080",...,2.1,1.5,1.7,1.5,1.9,1.6,1.9,0.0,,
3,154,185 2 *,Eswatini,Sub-Saharan Africa,27.4,3.3,24.1,"$1,890",+13.9%,"$1,660",...,2.5,2.3,2.6,2.6,2.5,2.5,3.3,0.0,,
4,155,150 1 +,Cape Verde,Sub-Saharan Africa,27.0,-1.6,28.6,"$1,090",+17.2%,$930,...,2.4,1.9,2.9,2.3,2.4,2.4,3.5,0.0,,
5,156,"158 2 ,",Gambia,Sub-Saharan Africa,27.0,-0.7,27.7,"$1,240",+22.8%,"$1,010",...,2.4,2.0,2.3,2.0,2.2,2.3,3.3,0.0,,
6,157,155 1 -,Guyana,LATAM,26.9,-1.1,28.0,"$9,150",-1.2%,"$9,260",...,2.5,2.1,2.5,2.1,2.4,2.4,3.2,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,158,160 2 .,Malawi,Sub-Saharan Africa,26.5,-1.0,27.5,"$6,620",+0.5%,"$6,590",...,2.4,1.8,2.3,2.1,2.3,2.5,3.2,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [121]:
def clean_df_2025_page_6(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 DataFrame:
      1. Drop Unnamed:10, Unnamed:21, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into three numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,14
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 169
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. clean & split 'Nation Brand'
    orig = df['Nation Brand'].astype(str)

    # a) remove all letters
    step1 = orig.str.replace(r'[A-Za-z]', '', regex=True)
    # b) remove any '2023'
    step2 = step1.str.replace('2023', '', regex=False)
    # c) remove anything non‑digit/dot/space
    step3 = step2.str.replace(r'[^0-9.\s]', ' ', regex=True)
    # d) collapse multiple spaces and trim
    cleaned = step3.str.replace(r'\s+', ' ', regex=True).str.strip()

    # e) split into three parts
    parts = cleaned.str.split(' ', n=2, expand=True)
    nb1 = pd.to_numeric(parts[0], errors='coerce')
    nb2 = pd.to_numeric(parts[1], errors='coerce')
    nb3 = pd.to_numeric(parts[2], errors='coerce')

    # detect initial failures
    bad = [
        (i, orig.iat[i])
        for i, toks in enumerate(cleaned.str.split())
        if len(toks) != 3
           or pd.isna(nb1.iat[i])
           or pd.isna(nb2.iat[i])
           or pd.isna(nb3.iat[i])
    ]


    # f) manual overrides for known rows
    overrides = {
        9:  (2.0, 4.2, 2.3),
        11: (2.4, 4.3, 2.6),
        12: (2.4, 4.3, 2.6),
        14: (2.5, 4.3, 2.5),
    }
    for i, (v1, v2, v3) in overrides.items():
        if i in df.index:
           nb1.iat[i], nb2.iat[i], nb3.iat[i] = v1, v2, v3

    # remove overridden rows from bad
    bad = [b for b in bad if b[0] not in overrides]
    
    # replace the original 'Nation Brand' column
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)
    df.insert(idx_nb + 2, 'Nation_3', nb3)

    # f) report any true remaining failures
    for i, val in bad:
        print(f"Warning: row {i}, Nation Brand={val!r} did not split into three numeric parts")

    # 8. override Unnamed:1 at row 18 to 169
    if 18 in df.index:
        df.at[18, 'Unnamed: 1'] = 169.0

    
    # 9. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [122]:
df_2025_page_6 = clean_df_2025_page_6(dfs2025[6])
df_2025_page_6

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,151.0,154.0,Trinidad and Tobago,LATAM,27.6,-0.5,28.1,14560,-0.3,14600,...,2.5,2.7,2.5,2.2,3.1,2.3,2.4,2.4,3.5,0.0
1,152.0,136.0,Myanmar,Asia,27.6,-2.7,30.3,32850,-9.4,36260,...,2.9,2.3,2.2,1.8,2.3,1.8,2.3,2.1,2.6,0.0
2,153.0,152.0,Afghanistan,Asia,27.5,-1.1,28.6,5570,9.6,5080,...,3.0,1.7,2.1,1.5,1.7,1.5,1.9,1.6,1.9,0.0
3,154.0,185.0,Eswatini,Sub-Saharan Africa,27.4,3.3,24.1,1890,13.9,1660,...,2.7,2.8,2.5,2.3,2.6,2.6,2.5,2.5,3.3,0.0
4,155.0,150.0,Cape Verde,Sub-Saharan Africa,27.0,-1.6,28.6,1090,17.2,930,...,2.6,2.4,2.4,1.9,2.9,2.3,2.4,2.4,3.5,0.0
5,156.0,158.0,Gambia,Sub-Saharan Africa,27.0,-0.7,27.7,1240,22.8,1010,...,2.7,2.5,2.4,2.0,2.3,2.0,2.2,2.3,3.3,0.0
6,157.0,155.0,Guyana,LATAM,26.9,-1.1,28.0,9150,-1.2,9260,...,2.6,2.5,2.5,2.1,2.5,2.1,2.4,2.4,3.2,0.0
7,158.0,160.0,Malawi,Sub-Saharan Africa,26.5,-1.0,27.5,6620,0.5,6590,...,2.7,2.4,2.4,1.8,2.3,2.1,2.3,2.5,3.2,0.0
8,159.0,156.0,Papua New Guinea,Oceania,26.3,-1.6,27.9,11470,-1.6,11660,...,2.4,2.4,2.4,2.1,2.3,2.0,2.3,2.3,3.1,0.0
9,160.0,167.0,Grenada,LATAM,26.2,-0.7,26.9,540,3.8,520,...,2.3,2.9,2.7,2.5,2.6,2.2,2.4,2.5,3.1,0.0


In [123]:
dfs2025[7].iloc[0]

Unnamed: 0                     176
Unnamed: 1                 176 0 Y
Unnamed: 2                    Chad
Unnamed: 3      Sub-Saharan Africa
Unnamed: 4                    24.3
Unnamed: 5                    -1.4
Unnamed: 6                    25.7
Unnamed: 7                  $5,010
Unnamed: 8                  +13.1%
Unnamed: 9                  $4,430
Unnamed: 10                      k
Nation Brand              2.5  4.1
Unnamed: 12                    2.5
Unnamed: 13                    2.1
Unnamed: 14                    2.3
Unnamed: 15                    1.7
Unnamed: 16                    2.0
Unnamed: 17                    1.7
Unnamed: 18                    2.1
Unnamed: 19                    2.1
Unnamed: 20                    2.8
Unnamed: 21                    0.0
Unnamed: 22                    NaN
Name: 0, dtype: object

In [124]:
dfs2025[6]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,151,154 2 ',Trinidad and Tobago,LATAM,27.6,-0.5,28.1,"$14,560",-0.3%,"$14,600",...,2.5,2.2,3.1,2.3,2.4,2.4,3.5,0.0,,
1,152,136 1 (,Myanmar,Asia,27.6,-2.7,30.3,"$32,850",-9.4%,"$36,260",...,2.2,1.8,2.3,1.8,2.3,2.1,2.6,0.0,,
2,153,152 1 ),Afghanistan,Asia,27.5,-1.1,28.6,"$5,570",+9.6%,"$5,080",...,2.1,1.5,1.7,1.5,1.9,1.6,1.9,0.0,,
3,154,185 2 *,Eswatini,Sub-Saharan Africa,27.4,3.3,24.1,"$1,890",+13.9%,"$1,660",...,2.5,2.3,2.6,2.6,2.5,2.5,3.3,0.0,,
4,155,150 1 +,Cape Verde,Sub-Saharan Africa,27.0,-1.6,28.6,"$1,090",+17.2%,$930,...,2.4,1.9,2.9,2.3,2.4,2.4,3.5,0.0,,
5,156,"158 2 ,",Gambia,Sub-Saharan Africa,27.0,-0.7,27.7,"$1,240",+22.8%,"$1,010",...,2.4,2.0,2.3,2.0,2.2,2.3,3.3,0.0,,
6,157,155 1 -,Guyana,LATAM,26.9,-1.1,28.0,"$9,150",-1.2%,"$9,260",...,2.5,2.1,2.5,2.1,2.4,2.4,3.2,0.0,,
7,,,,,,,,,,,...,,,,,,,,,,
8,158,160 2 .,Malawi,Sub-Saharan Africa,26.5,-1.0,27.5,"$6,620",+0.5%,"$6,590",...,2.4,1.8,2.3,2.1,2.3,2.5,3.2,0.0,,
9,,,,,,,,,,,...,,,,,,,,,,


In [125]:
def clean_df_2025_page_7(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the 2025 DataFrame:
      1. Drop Unnamed:10, Unnamed:22
      2. Strip $ & commas from Unnamed:7 & Unnamed:9
      3. Strip % from Unnamed:8
      4. Keep only the first number in Unnamed:1
      5. Leave only numbers in Unnamed:0
      6. Drop rows where Unnamed:4 is NaN
      7. Clean & split 'Nation Brand' into two numeric columns:
         a) remove all letters
         b) remove any '2023'
         c) strip non‑digit/dot/space
         d) collapse spaces and split into three parts
         e) apply manual overrides for rows 9,11,12,14
         f) report any true remaining failures
      8. Override Unnamed:1 at row 18 to 169
      9. Rename to the final 22‑column schema
    """
    df = df.copy()

    # 1. drop unwanted columns
    df.drop(columns=['Unnamed: 10', 'Unnamed: 22'], inplace=True)

    # 2. strip $ and commas from Unnamed:7 & Unnamed:9
    for c in ['Unnamed: 7', 'Unnamed: 9']:
        df[c] = df[c].astype(str).str.replace(r'[\$,]', '', regex=True)

    # 3. strip % from Unnamed:8
    df['Unnamed: 8'] = df['Unnamed: 8'].astype(str).str.rstrip('%')

    # 4. keep only the first numeric token in Unnamed:1
    df['Unnamed: 1'] = (
        df['Unnamed: 1']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 5. leave only numbers in Unnamed:0
    df['Unnamed: 0'] = (
        df['Unnamed: 0']
          .astype(str)
          .str.extract(r'(\d+(\.\d+)?)', expand=False)[0]
          .astype(float)
    )

    # 6. drop rows where Unnamed:4 is NaN
    df = df[df['Unnamed: 4'].notna()].reset_index(drop=True)

    # 7. split 'Nation Brand' into two numeric parts
    orig_nb = df['Nation Brand'].astype(str).copy()
    step1   = orig_nb.str.replace(r'[A-Za-z]', '', regex=True)    # strip letters
    step2   = step1.str.replace('2025', '', regex=False)         # remove "2025"
    cleaned = step2.str.strip()
    parts_nb= cleaned.str.split(r'\s+', n=1, expand=True)
    nb1     = pd.to_numeric(parts_nb[0], errors='coerce')
    nb2     = pd.to_numeric(parts_nb[1], errors='coerce')

    # record any initial failures
    bad_nb = [i for i in df.index if pd.isna(nb1.iat[i]) or pd.isna(nb2.iat[i])]

    # manual overrides for rows 9,11,12,14
    overrides_nb = {
        9: (2.2, 3.9),
        11: (1.5, 3.3),
        12: (1.4, 3.2),
        14: (1.4, 3.2),
    }
    for i, (v1, v2) in overrides_nb.items():
        if i in df.index:
            nb1.iat[i], nb2.iat[i] = v1, v2
            if i in bad_nb:
                bad_nb.remove(i)

    # insert the two new columns in place of 'Nation Brand'
    idx_nb = df.columns.get_loc('Nation Brand')
    df.drop(columns=['Nation Brand'], inplace=True)
    df.insert(idx_nb,     'Nation_1', nb1)
    df.insert(idx_nb + 1, 'Nation_2', nb2)

    # report any remaining split failures
    for i in bad_nb:
        print(f"Warning: row {i}, Nation Brand={orig_nb.iat[i]!r} did not split into two numeric parts")
    
    # 8. rename to final schema
    df.columns = [
        'Rank 2025',
        'Rank 2024',
        'Nation Brand',
        'Region',
        'Index Score 2025',
        'Index Score Change',
        'Index Score 2024',
        'Brand Value 2025 (USD mn)',
        'Brand Value Change',
        'Brand Value 2024 (USD mn)',
        'Familiarity',
        'Reputation',
        'Influence',
        'Business & Trade',
        'International Relations',
        'Education & Science',
        'Culture & Heritage',
        'Governance',
        'Media & Communication',
        'Sustainable Future',
        'People & Values',
        'Medals'
    ]

    return df


In [126]:
df_2025_page_7 = clean_df_2025_page_7(dfs2025[7])
df_2025_page_7

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,176.0,176.0,Chad,Sub-Saharan Africa,24.3,-1.4,25.7,5010,+13.1,4430,...,2.5,2.1,2.3,1.7,2.0,1.7,2.1,2.1,2.8,0.0
1,177.0,189.0,Djibouti,Sub-Saharan Africa,24.2,0.5,23.7,2290,+9.0,2100,...,2.6,2.2,2.1,1.7,2.0,1.9,1.9,2.0,2.8,0.0
2,178.0,172.0,Saint Lucia,LATAM,23.9,-2.4,26.3,1860,+5.7,1760,...,2.2,2.5,2.3,2.1,2.4,2.1,2.2,2.2,3.0,0.0
3,179.0,165.0,Solomon Islands,Oceania,23.9,-3.0,26.9,610,+3.4,590,...,2.1,2.5,2.5,2.1,2.2,2.1,2.2,2.4,2.7,0.0
4,180.0,175.0,Timor-Leste,Asia,23.5,-2.2,25.7,860,-7.5,930,...,2.3,2.2,2.3,2.0,1.9,2.0,2.2,2.2,2.6,0.0
5,181.0,179.0,Somalia,Sub-Saharan Africa,23.5,-1.7,25.2,10,-,10,...,2.4,1.5,1.7,1.4,1.5,1.3,1.8,1.5,2.2,0.0
6,182.0,181.0,Suriname,LATAM,23.3,-1.5,24.8,2400,+9.1,2200,...,2.3,2.2,2.3,2.0,2.0,1.8,2.1,2.1,2.6,0.0
7,183.0,177.0,Tonga,Oceania,23.3,-2.2,25.5,220,-4.3,230,...,2.2,2.2,2.2,1.9,2.1,2.0,2.3,2.3,2.8,0.0
8,184.0,180.0,Lesotho,Sub-Saharan Africa,23.1,-1.9,25.0,1330,+3.9,1280,...,2.3,2.1,2.2,1.8,2.2,1.9,2.0,2.1,2.9,0.0
9,185.0,184.0,Eritrea,Sub-Saharan Africa,22.8,-1.4,24.2,540,+3.8,520,...,2.4,1.8,2.1,1.7,2.0,1.7,1.9,1.9,2.7,0.0


In [127]:
combined2025 = pd.concat([df_2025_page_0, df_2025_page_1, df_2025_page_2, df_2025_page_3, df_2025_page_4, df_2025_page_5, df_2025_page_6, df_2025_page_7], 
                     axis=0,            # stack rows
                     ignore_index=True) # reset the row index

In [128]:
combined2025

Unnamed: 0,Rank 2025,Rank 2024,Nation Brand,Region,Index Score 2025,Index Score Change,Index Score 2024,Brand Value 2025 (USD mn),Brand Value Change,Brand Value 2024 (USD mn),...,Influence,Business & Trade,International Relations,Education & Science,Culture & Heritage,Governance,Media & Communication,Sustainable Future,People & Values,Medals
0,1.0,1.0,United States,North America,79.5,+0.7,78.8,37328559,+15.7,32271140,...,8.0,9.2,8.9,9.2,7.3,5.8,6.8,6.0,4.4,15.0
1,2.0,3.0,China,Asia,72.8,+1.6,71.2,20530402,+2.9,19960020,...,7.6,8.8,7.5,8.6,6.1,5.1,4.6,5.7,4.0,9.0
2,3.0,2.0,United Kingdom,Europe,72.4,+0.6,71.8,4446486,+10.1,4036790,...,6.7,8.3,8.3,7.3,6.9,6.4,6.0,6.1,5.0,14.0
3,4.0,4.0,Japan,Asia,71.5,+0.9,70.6,4210121,-4.4,4406090,...,6.1,9.2,7.0,8.7,7.0,6.5,4.6,7.2,5.7,14.0
4,5.0,5.0,Germany,Europe,70.1,+0.3,69.8,5000069,+0.3,4985350,...,6.2,8.9,7.9,7.5,5.8,6.4,4.8,6.7,4.9,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,189.0,183.0,Palau,Oceania,20.1,-4.2,24.3,140,+7.7,130,...,1.9,2.1,1.9,1.8,1.8,1.7,2.0,1.9,2.1,0.0
189,190.0,188.0,Micronesia,Oceania,20.1,-3.6,23.7,180,+5.9,170,...,1.9,2.1,2.2,1.9,1.8,1.8,2.0,1.9,2.1,0.0
190,191.0,186.0,Tuvalu,Oceania,20.0,-3.9,23.9,30,+50.0,20,...,1.8,2.2,2.3,1.9,1.9,1.9,2.1,2.0,2.2,0.0
191,192.0,191.0,Vanuatu,Oceania,19.3,-3.9,23.2,530,-15.9,630,...,1.8,2.1,2.0,1.7,1.8,1.7,1.9,1.9,2.2,0.0


### Combine all the data ###

In [129]:
temp_combined2020 = combined2020
temp_combined2021 = combined2021
temp_combined2022 = combined2022
temp_combined2023 = combined2023
temp_combined2024 = combined2024
temp_combined2025 = combined2025


In [130]:
years = [2020, 2021, 2022, 2023, 2024, 2025]
dfs = {
    "combined2020": temp_combined2020,
    "combined2021": temp_combined2021,
    "combined2022": temp_combined2022,
    "combined2023": temp_combined2023,
    "combined2024": temp_combined2024,
    "combined2025": temp_combined2025
}

# Step 1: Rename if needed
for name, df in dfs.items():
    if 'Nation Brand' in df.columns:
        df.rename(columns={'Nation Brand': 'Nation'}, inplace=True)

# Step 1.5: Trim stray whitespace
for df in dfs.values():
    df['Nation'] = df['Nation'].astype(str).str.strip()

# Lower‑case exception map
ISO_EXCEPTIONS = {
    'russia': 'Russian Federation',
    'turkey': 'Türkiye',
    'cote divoire': "Côte d’Ivoire",
    "cote d'ivoire": "Côte d’Ivoire",
    'cte divoire': "Côte d’Ivoire",
    'bosnia & herzegovina': 'Bosnia and Herzegovina',
    'congo': 'Republic of the Congo',
    'micronesia': 'Federated States of Micronesia',
    'saint vincent and the grenadines': 'Saint Vincent and the Grenadines',
}

def clean_country_name(raw: str) -> str:
    """
    1. Strip, normalize apostrophes
    2. Remove digits/symbols, collapse spaces
    3. Lowercase key for matching
    4. Unify all DR Congo forms to the requested label
    5. Unify Côte d’Ivoire variants via exceptions
    6. Fallback: title-case
    """
    s = raw.strip().replace("’", "'")
    s = re.sub(r"[^A-Za-z&'\-\s]", "", s)
    s = " ".join(s.split())
    key = s.lower()

    # 4) All variants of DR Congo → exact label
    if re.search(r"\b(democratic republic.*congo|dr congo|dem rep congo|congo democratic republic)\b", key):
        return "Congo Democratic Republic Of The"

    # 5) Côte d’Ivoire variants
    if key in ISO_EXCEPTIONS:
        return ISO_EXCEPTIONS[key]

    # 6) Fallback
    return s.title()

def safe_iso_lookup(country: str, df_name: str, idx: int) -> str:
    fixed = clean_country_name(country)
    try:
        return pycountry.countries.lookup(fixed).name
    except LookupError:
        print(f"Could not find ISO name for '{country}' (cleaned to '{fixed}') "
              f"in DF '{df_name}', row {idx}")
        return fixed

# Step 2: Clean + ISO lookup
for df_name, df in dfs.items():
    df['Nation'] = [
        safe_iso_lookup(n, df_name, i)
        for i, n in df['Nation'].items()
    ]

# Step 3: Suffix non‑Nation columns with year
suffixed = []
for (df_name, df), year in zip(dfs.items(), years):
    col_map = {col: f"{col}_{year}" for col in df.columns if col != 'Nation'}
    suffixed.append(df.rename(columns=col_map))

# Step 4: Outer merge
gspi_merged_df = reduce(
    lambda left, right: pd.merge(left, right, on='Nation', how='outer'),
    suffixed
)

# Quick check
gspi_merged_df

Could not find ISO name for 'Cote dIvoire' (cleaned to 'Côte d’Ivoire') in DF 'combined2021', row 85
Could not find ISO name for 'Dem. Rep. Congo' (cleaned to 'Congo Democratic Republic Of The') in DF 'combined2021', row 104
Could not find ISO name for 'Cote dIvoire' (cleaned to 'Côte d’Ivoire') in DF 'combined2022', row 95
Could not find ISO name for 'Dem. Rep. Congo' (cleaned to 'Congo Democratic Republic Of The') in DF 'combined2022', row 115
Could not find ISO name for 'Cote d'Ivoire' (cleaned to 'Côte d’Ivoire') in DF 'combined2023', row 86
Could not find ISO name for 'Dem. Rep. Congo' (cleaned to 'Congo Democratic Republic Of The') in DF 'combined2023', row 106
Could not find ISO name for 'Cte dIvoire' (cleaned to 'Côte d’Ivoire') in DF 'combined2024', row 107
Could not find ISO name for 'DR Congo' (cleaned to 'Congo Democratic Republic Of The') in DF 'combined2024', row 113
Could not find ISO name for 'Cape Verde' (cleaned to 'Cape Verde') in DF 'combined2024', row 149
Could not

Unnamed: 0,Rank_2020,Nation,Region_2020,Index Score_2020,Familiarity_2020,Influence_2020,Reputation_2020,Business & Trade_2020,Governance_2020,International Relations_2020,...,Influence_2025,Business & Trade_2025,International Relations_2025,Education & Science_2025,Culture & Heritage_2025,Governance_2025,Media & Communication_2025,Sustainable Future_2025,People & Values_2025,Medals_2025
0,,Afghanistan,,,,,,,,,...,3.0,1.7,2.1,1.5,1.7,1.5,1.9,1.6,1.9,0.0
1,,Albania,,,,,,,,,...,3.1,3.0,3.1,2.9,3.0,2.7,2.8,3.0,3.7,0.0
2,54.0,Algeria,Middle East & North Africa,29.0,4.2,2.8,5.5,2.2,1.7,1.9,...,3.5,3.2,3.5,2.7,3.2,2.9,2.9,3.0,3.8,0.0
3,,Andorra,,,,,,,,,...,2.5,4.1,2.9,2.7,3.0,3.2,2.8,3.3,3.8,0.0
4,,Angola,,,,,,,,,...,3.1,2.6,2.7,2.1,2.6,2.2,2.6,2.6,3.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,55.0,"Venezuela, Bolivarian Republic of",Latin America & Caribbean,28.8,5.1,3.2,4.8,1.8,1.3,1.9,...,3.6,2.6,2.6,2.2,3.0,2.0,2.5,2.3,3.3,0.0
190,50.0,Viet Nam,Asia,31.3,5.5,3.1,5.6,2.6,1.8,1.8,...,3.7,4.0,3.3,3.0,3.7,2.8,2.9,3.0,3.9,0.0
191,,Yemen,,,,,,,,,...,3.1,2.6,2.8,2.2,2.5,2.3,2.6,2.3,3.4,0.0
192,,Zambia,,,,,,,,,...,3.2,2.7,2.9,2.3,2.7,2.4,2.6,2.8,3.6,0.0


In [131]:
for val in gspi_merged_df['Nation']:
    print(val)

Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia, Plurinational State of
Bosnia and Herzegovina
Botswana
Brazil
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Cape Verde
Central African Republic
Chad
Chile
China
Colombia
Comoros
Congo
Congo Democratic Republic Of The
Costa Rica
Croatia
Cuba
Cyprus
Czechia
Côte d’Ivoire
Denmark
Djibouti
Dominica
Dominican Republic
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Fiji
Finland
France
Gabon
Gambia
Georgia
Germany
Ghana
Greece
Grenada
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hungary
Iceland
India
Indonesia
Iran, Islamic Republic of
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Kiribati
Korea, Democratic People's Republic of
Korea, Republic of
Kuwait
Kyrgyzstan
Lao People's Democratic Republic
Latvia
Lebanon
Lesotho
Lib

## Join all data ##

In [132]:
gspi_merged_df

Unnamed: 0,Rank_2020,Nation,Region_2020,Index Score_2020,Familiarity_2020,Influence_2020,Reputation_2020,Business & Trade_2020,Governance_2020,International Relations_2020,...,Influence_2025,Business & Trade_2025,International Relations_2025,Education & Science_2025,Culture & Heritage_2025,Governance_2025,Media & Communication_2025,Sustainable Future_2025,People & Values_2025,Medals_2025
0,,Afghanistan,,,,,,,,,...,3.0,1.7,2.1,1.5,1.7,1.5,1.9,1.6,1.9,0.0
1,,Albania,,,,,,,,,...,3.1,3.0,3.1,2.9,3.0,2.7,2.8,3.0,3.7,0.0
2,54.0,Algeria,Middle East & North Africa,29.0,4.2,2.8,5.5,2.2,1.7,1.9,...,3.5,3.2,3.5,2.7,3.2,2.9,2.9,3.0,3.8,0.0
3,,Andorra,,,,,,,,,...,2.5,4.1,2.9,2.7,3.0,3.2,2.8,3.3,3.8,0.0
4,,Angola,,,,,,,,,...,3.1,2.6,2.7,2.1,2.6,2.2,2.6,2.6,3.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,55.0,"Venezuela, Bolivarian Republic of",Latin America & Caribbean,28.8,5.1,3.2,4.8,1.8,1.3,1.9,...,3.6,2.6,2.6,2.2,3.0,2.0,2.5,2.3,3.3,0.0
190,50.0,Viet Nam,Asia,31.3,5.5,3.1,5.6,2.6,1.8,1.8,...,3.7,4.0,3.3,3.0,3.7,2.8,2.9,3.0,3.9,0.0
191,,Yemen,,,,,,,,,...,3.1,2.6,2.8,2.2,2.5,2.3,2.6,2.3,3.4,0.0
192,,Zambia,,,,,,,,,...,3.2,2.7,2.9,2.3,2.7,2.4,2.6,2.8,3.6,0.0


In [133]:
donations_df

Mechanism,Donor,Total Doses,Bilateral,Multilateral,Private,Through African Union,Through COVAX
1,Algeria,1700000.0,1700000.0,0.0,0.0,0.0,0.0
3,Argentina,4272000.0,4272000.0,0.0,0.0,0.0,0.0
6,Australia,40526290.0,40511890.0,0.0,0.0,0.0,14400.0
7,Austria,9701520.0,4181520.0,0.0,0.0,0.0,5520000.0
8,Azerbaijan,270000.0,270000.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
103,United Kingdom,52722250.0,5780570.0,0.0,0.0,0.0,46941680.0
104,United States,696916480.0,79988820.0,0.0,0.0,0.0,616927660.0
106,Uruguay,12000.0,12000.0,0.0,0.0,0.0,0.0
107,Uzbekistan,600000.0,600000.0,0.0,0.0,0.0,0.0


In [134]:
covid_data_df

Unnamed: 0,location,total_cases_2020-01,total_cases_2020-07,total_cases_2021-01,total_cases_2021-07,total_cases_2022-01,total_cases_2022-07,total_cases_2023-01,total_cases_2023-07,total_cases_2024-01,...,total_deaths_2020-01,total_deaths_2020-07,total_deaths_2021-01,total_deaths_2021-07,total_deaths_2022-01,total_deaths_2022-07,total_deaths_2023-01,total_deaths_2023-07,total_deaths_2024-01,total_deaths_2024-07
0,Afghanistan,0.0,36036.0,55023.0,143871.0,161666.0,185580.0,208420.0,224224.0,231310.0,...,0.0,1246.0,2400.0,6425.0,7407.0,7747.0,7866.0,7935.0,7981.0,7998.0
1,Albania,0.0,4570.0,76350.0,132828.0,255741.0,309278.0,333219.0,334090.0,334863.0,...,0.0,128.0,1358.0,2456.0,3334.0,3538.0,3596.0,3604.0,3605.0,3605.0
2,Algeria,0.0,26764.0,107122.0,160868.0,249310.0,267374.0,271369.0,271852.0,272010.0,...,0.0,1134.0,2888.0,4042.0,6555.0,6876.0,6881.0,6881.0,6881.0,6881.0
3,American Samoa,0.0,0.0,0.0,0.0,18.0,7766.0,8320.0,8341.0,8359.0,...,0.0,0.0,0.0,0.0,0.0,33.0,34.0,34.0,34.0,34.0
4,Andorra,0.0,897.0,9885.0,14498.0,35556.0,45508.0,47839.0,48015.0,48015.0,...,0.0,52.0,101.0,127.0,145.0,153.0,159.0,159.0,159.0,159.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,"Virgin Islands, British",0.0,8.0,141.0,2389.0,5875.0,7131.0,7305.0,7347.0,7427.0,...,0.0,1.0,1.0,25.0,49.0,63.0,64.0,64.0,64.0,64.0
227,Wallis and Futuna,0.0,0.0,5.0,453.0,453.0,761.0,3427.0,3550.0,3550.0,...,0.0,0.0,0.0,7.0,7.0,7.0,7.0,8.0,8.0,9.0
229,Yemen,0.0,1678.0,2124.0,7008.0,10998.0,11877.0,11945.0,11945.0,11945.0,...,0.0,475.0,616.0,1373.0,2007.0,2151.0,2159.0,2159.0,2159.0,2159.0
230,Zambia,0.0,4328.0,53352.0,191527.0,304656.0,329483.0,339743.0,349287.0,349304.0,...,0.0,140.0,745.0,3250.0,3914.0,4015.0,4041.0,4069.0,4069.0,4077.0


In [135]:
df_vdem_only_contries

Unnamed: 0,Country,Liberal democracy index_2019,Liberal democracy index_2020,Liberal democracy index_2021,Liberal democracy index_2022,Liberal democracy index_2023,Liberal democracy index_2024,Regimes of the World classification_2019,Regimes of the World classification_2020,Regimes of the World classification_2021,Regimes of the World classification_2022,Regimes of the World classification_2023,Regimes of the World classification_2024
0,Afghanistan,0.165,0.168,0.041,0.012,0.016,0.016,1.0,1.0,1.0,0.0,0.0,0.0
1,Albania,0.424,0.440,0.438,0.433,0.398,0.396,2.0,2.0,2.0,2.0,2.0,2.0
2,Algeria,0.145,0.146,0.132,0.115,0.115,0.116,1.0,1.0,1.0,1.0,1.0,1.0
3,Angola,0.162,0.160,0.153,0.154,0.157,0.164,1.0,1.0,1.0,1.0,1.0,1.0
4,Argentina,0.626,0.644,0.640,0.661,0.695,0.553,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,"Venezuela, Bolivarian Republic of",0.062,0.058,0.057,0.056,0.054,0.049,1.0,1.0,1.0,1.0,1.0,1.0
169,Viet Nam,0.119,0.108,0.110,0.109,0.113,0.124,1.0,1.0,0.0,0.0,0.0,0.0
170,Yemen,0.038,0.034,0.032,0.041,0.045,0.047,0.0,0.0,0.0,0.0,0.0,0.0
171,Zambia,0.305,0.293,0.339,0.439,0.432,0.392,1.0,1.0,1.0,2.0,2.0,2.0


In [136]:
world_bank_data

Unnamed: 0,iso3,Country,gdp_per_capita_2019,gdp_per_capita_2020,gdp_per_capita_2021,gdp_per_capita_2022,gdp_per_capita_2023,gdp_per_capita_2024,population_2020,land_area_sq_km_2020
0,ABW,Aruba,31096.2,22855.9,27200.1,30559.5,33984.8,,108587,180
2,AFG,Afghanistan,496.603,510.787,356.496,357.261,413.758,,3.9069e+007,652230
4,AGO,Angola,2189.86,1449.92,1925.87,2929.69,2309.53,2122.08,3.34511e+007,1.2467e+006
5,ALB,Albania,5460.43,5370.78,6413.28,6846.43,8575.17,10011.6,2.83785e+006,27400
6,AND,Andorra,41257.8,37361.1,42425.7,42414.1,46812.4,49303.7,77380,470
...,...,...,...,...,...,...,...,...,...,...
259,WSM,Samoa,4351.94,4099.66,3947.65,3869.47,4330.18,4898.77,211944,2780
261,YEM,Yemen,623.376,559.565,522.174,615.702,426.354,433.174,3.61349e+007,527970
262,ZAF,South Africa,6533.71,5580.6,6843.4,6523.41,6022.54,6253.37,6.05624e+007,1.21309e+006
263,ZMB,Zambia,1258.99,951.644,1127.16,1447.12,1330.73,1235.08,1.90594e+007,743390


In [137]:
for val in covid_data_df['location']:
    print(val)

Afghanistan
Albania
Algeria
American Samoa
Andorra
Angola
Anguilla
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia, Plurinational State of
Bosnia and Herzegovina
Botswana
Brazil
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cayman Islands
Central African Republic
Chad
Chile
China
Colombia
Comoros
Congo
Cook Islands
Costa Rica
Croatia
Cuba
Cyprus
Czechia
Côte d'Ivoire
Denmark
Djibouti
Dominica
Dominican Republic
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France
French Guiana
French Polynesia
Gabon
Gambia
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hungary
Iceland
India
Indonesia
Iran, Islamic Republic of
Iraq
Ireland
Isle of Man
Israel
Italy
Jamaica
Japan
Jersey
Jordan
Kazakhstan
Kenya
Ki

In [138]:
elcano_df

Unnamed: 0,COUNTRY,ENERGY_2019,PRIMARY_GOODS_2019,MANUFACTURES_2019,SERVICES_2019,INVESTMENTS_2019,TROOPS_2019,MILITARY_EQUIPMENT_2019,MIGRATIONS_2019,TOURISM_2019,...,DEV_COOPERATION_CONT_2024,CLIMATE_CONT_2024,ECONOMIC_CONT_2024,MILITARY_CONT_2024,SOFT_CONT_2024,ECONOMIC_SHARE_2024,MILITARY_SHARE_2024,SOFT_SHARE_2024,GLOBAL_SHARE_2024,GOBAL_PERCENTILE_2024
0,Afghanistan,0.015945,0.099791,0.008567,0.286874,0.009539,0.000000,0.969775,0.193565,0.000000,...,0.000000,0.134592,0.273698,0.000000,0.726301,0.000046,0.000000,0.000267,0.000092,0.026667
1,Albania,0.047098,0.068180,0.314348,1.433636,0.075642,0.448568,0.000000,0.063783,0.946100,...,0.000000,0.040548,0.479370,0.053277,0.467351,0.000330,0.000099,0.000700,0.000376,0.313333
2,Algeria,8.442406,4.704664,0.512515,1.247113,0.366926,0.006705,28.127822,0.314995,0.438859,...,0.000000,0.034594,0.363747,0.532686,0.103565,0.001720,0.006799,0.001066,0.002585,0.673333
3,Angola,6.820430,4.855170,0.053621,0.230657,0.815540,0.000000,2.424436,0.854808,0.036007,...,0.000000,0.068891,0.669548,0.173371,0.157079,0.001026,0.000717,0.000524,0.000837,0.473333
4,Argentina,0.553576,4.629735,2.182548,5.959289,5.673580,0.559388,15.903342,2.984973,1.146452,...,0.006304,0.068310,0.343527,0.169565,0.486907,0.001855,0.002471,0.005721,0.002950,0.693333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,"Venezuela, Bolivarian Republic of",6.142827,3.481291,0.985472,0.186545,3.349223,0.000000,10.913437,1.746935,0.066399,...,0.000000,0.147482,0.303723,0.334703,0.361573,0.000587,0.001746,0.001521,0.001056,0.540000
146,Viet Nam,0.858603,4.498912,39.236671,5.789978,1.433308,0.017894,4.416165,0.066829,2.559655,...,0.000000,0.079811,0.778797,0.007027,0.214174,0.006451,0.000157,0.003861,0.004526,0.773333
147,Yemen,0.061052,0.061836,0.009995,0.102181,0.089884,0.000000,0.000000,0.499105,0.071740,...,0.000000,0.051053,0.069330,0.000000,0.930669,0.000023,0.000000,0.000672,0.000181,0.140000
148,Zambia,0.021834,0.951853,0.212296,0.371746,0.234988,2.249612,0.000000,0.237497,0.177063,...,0.000000,0.138822,0.263897,0.430860,0.305241,0.000157,0.000690,0.000394,0.000324,0.266667


In [139]:
# 1. Define the things to ignore everywhere
ignore = {'Curaçao', 'Hong Kong', 'Macao', 'Virgin Islands, U.S.', 'Palestine, State of'}

# 2. Define your canonical name mapping
name_map = {
    "Côte d Ivoire": "Côte d'Ivoire",
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Cape Verde": "Cabo Verde",
    "Congo, The Democratic Republic of the": "Congo Democratic Republic Of The",
    "Congo Democratic Republic Of The": "Congo Democratic Republic Of The",
    "So Tom And Prncipe": "Sao Tome and Principe",
    "Trkiye": "Türkiye",
    "East Timor": "Timor-Leste",
    "Taiwan": "Taiwan, Province of China",
}

# 3. Build and normalize your master table
master = (
    covid_data_df
    .loc[~covid_data_df['location'].isin(ignore)]
    .copy()
)
master['location'] = master['location'].replace(name_map)

# 4. Guarantee Timor‑Leste, Taiwan and Congo exist in master
for extra in ("Timor-Leste", "Taiwan, Province of China", "Congo Democratic Republic Of The"):
    if extra not in master['location'].values:
        master = pd.concat([master, pd.DataFrame({'location': [extra]})], ignore_index=True)

# 5. List of secondary tables with their join‑keys
to_merge = [
    ("gspi",        gspi_merged_df,        "Nation"),
    ("donations",          donations_df,          "Donor"),
    ("vdem", df_vdem_only_contries, "Country"),
    ("world_bank_data",       world_bank_data,       "Country"),
    ("elcano",       elcano_df,       "COUNTRY"),
]

# 6. Filter, normalize and left‑join each table onto master
merged = master.copy()
master_keys = set(master['location'])

for name, df, key in to_merge:
    # a) drop ignored locales
    df = df.loc[~df[key].isin(ignore)].copy()
    # b) normalize names
    df[key] = df[key].replace(name_map)
    # c) report any still‑missing keys
    missing = sorted(set(df[key].dropna()) - master_keys)
    if missing:
        print(f"[NOTE] {name!r} has {len(missing)} '{key}' value(s) not in master: {missing}")
    # d) perform the left‑merge
    merged = pd.merge(
        merged,
        df,
        how="left",
        left_on="location",
        right_on=key,
        suffixes=(None, f"_{name}")
    )

# 7. Drop redundant key columns from the right‑hand tables
for _, _, key in to_merge:
    if key in merged.columns:
        merged = merged.drop(columns=[key])

# 8. Rename 'location' to 'Country'
merged = merged.rename(columns={'location': 'Country'})

# 9. Inspect and/or save
print("Merged shape:", merged.shape)
print(merged.head())

# (Optional) Save to CSV
#merged.to_csv("covid_master_merged.csv", index=False)

[NOTE] 'elcano' has 2 'COUNTRY' value(s) not in master: ['Czech Republic', 'United States of America']
Merged shape: (227, 463)
          Country  total_cases_2020-01  total_cases_2020-07  \
0     Afghanistan                  0.0              36036.0   
1         Albania                  0.0               4570.0   
2         Algeria                  0.0              26764.0   
3  American Samoa                  0.0                  0.0   
4         Andorra                  0.0                897.0   

   total_cases_2021-01  total_cases_2021-07  total_cases_2022-01  \
0              55023.0             143871.0             161666.0   
1              76350.0             132828.0             255741.0   
2             107122.0             160868.0             249310.0   
3                  0.0                  0.0                 18.0   
4               9885.0              14498.0              35556.0   

   total_cases_2022-07  total_cases_2023-01  total_cases_2023-07  \
0             18

In [140]:
cols_to_drop = [
    'Country_world_bank_data',
    'Rank 2020_2021', 'Index Score 2020_2021',
    'Rank 2021_2022', 'Index Score 2021_2022',
    'Rank 2022_2023', 'Index Score 2022_2023',
    'Rank 2023_2024', 'Index Score 2023_2024',
    'Rank 2024_2025', 'Index Score 2024_2025',
    'Private',
]
merged = merged.drop(columns=cols_to_drop, errors='ignore')

# 2. rename “Rank 2021_2021” → “Rank 2021” and “Index Score 2025_2025” → “Index Score 2025”
pattern = re.compile(r'^(Rank|Index Score)\s+(\d{4})_(\d{4})$')
rename_map = {}
for col in merged.columns:
    m = pattern.match(col)
    if m and m.group(2) == m.group(3):
        # use group(1) = metric name, group(2) = year
        rename_map[col] = f"{m.group(1)} {m.group(2)}"

merged = merged.rename(columns=rename_map)

# sanity‐check
print("Renamed columns:")
print(rename_map)

Renamed columns:
{'Rank 2021_2021': 'Rank 2021', 'Index Score 2021_2021': 'Index Score 2021', 'Rank 2022_2022': 'Rank 2022', 'Index Score 2022_2022': 'Index Score 2022', 'Rank 2023_2023': 'Rank 2023', 'Index Score 2023_2023': 'Index Score 2023', 'Rank 2024_2024': 'Rank 2024', 'Index Score 2024_2024': 'Index Score 2024', 'Rank 2025_2025': 'Rank 2025', 'Index Score 2025_2025': 'Index Score 2025'}


In [141]:
merged.to_csv("merged.csv")

In [142]:
# ——— CLEANUP STEPS ———

# 1) Rename Region_2020 → Region and drop all other Region_* cols
if 'Region_2020' in merged.columns:
    merged.rename(columns={'Region_2020': 'Region'}, inplace=True)
to_drop = [c for c in merged.columns if c.startswith('Region_') and c != 'Region']
merged.drop(columns=to_drop, inplace=True)

# 2) Drop 'value_2024' (if present)
merged.drop(columns=['value_2024'], errors='ignore', inplace=True)

# 3) Cast to float64 everywhere except Region, iso3, Country
keep = {'Region', 'iso3', 'Country'}
for c in merged.columns:
    if c not in keep:
        # turn any dash into missing
        merged[c] = merged[c].replace('-', pd.NA)
        # coerce to numeric
        merged[c] = pd.to_numeric(merged[c], errors='coerce')

# ——— DESCRIPTIVE SUMMARY + EXPORT ———

# select numeric only
num = merged.select_dtypes(include='number')

# build the table
summary = pd.DataFrame({
    'N':       num.count(),
    'Missing': num.isna().sum(),
    'Mean':    num.mean(),
    'Std':     num.std(),
    'Min':     num.min(),
    '25%':     num.quantile(0.25),
    '50%':     num.quantile(0.50),
    '75%':     num.quantile(0.75),
    'Max':     num.max(),
}).round(3)

# export HTML
html_file = 'final_data_summary.html'
html = summary.to_html(index=True, border=0, classes=['table', 'table-striped'])
html = html.replace(
    '<table',
    '<table><caption>Final Data Descriptive Statistics</caption>',
    1
)
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(html)
print(f"Wrote HTML to {html_file}")

# export LaTeX
latex_file = 'final_data_summary.tex'
latex_body = summary.to_latex(index=True, longtable=False, float_format="%.3f")
latex_full = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{Final Data Descriptive Statistics}\n"
    "\\label{tab:final_data_summary}\n"
    f"{latex_body}\n"
    "\\end{table}"
)
with open(latex_file, 'w', encoding='utf-8') as f:
    f.write(latex_full)
print(f"Wrote LaTeX to {latex_file}")

# preview HTML inline
display(HTML(html))

Wrote HTML to final_data_summary.html
Wrote LaTeX to final_data_summary.tex


Unnamed: 0,N,Missing,Mean,Std,Min,25%,50%,75%,Max
total_cases_2020-01,224,3,9.246,132.623,0.0,0.0,0.0,0.0,1985.0
total_cases_2020-07,224,3,145968.496,818061.561,0.0,260.25,2827.5,30132.5,8687607.0
total_cases_2021-01,224,3,928757.326,5108126.24,0.0,1952.5,24585.5,213783.75,54123948.0
total_cases_2021-07,224,3,1628325.143,8222515.296,0.0,6063.0,70215.0,424718.5,82979733.0
total_cases_2022-01,224,3,3269137.33,17162443.446,0.0,16694.25,148625.0,913243.5,199795181.0
total_cases_2022-07,224,3,4983086.451,26631118.842,0.0,27803.5,210686.0,1305500.25,342549520.0
total_cases_2023-01,224,3,6172666.558,32100553.459,0.0,29349.75,223304.5,1465032.25,413027842.0
total_cases_2023-07,224,3,6315235.911,32949996.377,0.0,29613.0,227289.0,1483868.25,424868998.0
total_cases_2024-01,224,3,6364927.594,33239676.246,0.0,29689.0,230832.0,1500675.25,429650833.0
total_cases_2024-07,224,3,6375033.371,33286692.45,0.0,31174.5,232098.5,1500876.5,430368190.0


## Start Data anylsis ##

In [2]:
vaccine_donation_power_measurement = pd.read_csv(
    'merged.csv',
    index_col=0      # use first column as index
)

In [3]:
vaccine_donation_power_measurement

Unnamed: 0,Country,total_cases_2020-01,total_cases_2020-07,total_cases_2021-01,total_cases_2021-07,total_cases_2022-01,total_cases_2022-07,total_cases_2023-01,total_cases_2023-07,total_cases_2024-01,...,DEV_COOPERATION_CONT_2024,CLIMATE_CONT_2024,ECONOMIC_CONT_2024,MILITARY_CONT_2024,SOFT_CONT_2024,ECONOMIC_SHARE_2024,MILITARY_SHARE_2024,SOFT_SHARE_2024,GLOBAL_SHARE_2024,GOBAL_PERCENTILE_2024
0,Afghanistan,0.0,36036.0,55023.0,143871.0,161666.0,185580.0,208420.0,224224.0,231310.0,...,0.0,0.134592,0.273698,0.000000,0.726301,0.000046,0.000000,0.000267,0.000092,0.026667
1,Albania,0.0,4570.0,76350.0,132828.0,255741.0,309278.0,333219.0,334090.0,334863.0,...,0.0,0.040548,0.479370,0.053277,0.467351,0.000330,0.000099,0.000700,0.000376,0.313333
2,Algeria,0.0,26764.0,107122.0,160868.0,249310.0,267374.0,271369.0,271852.0,272010.0,...,0.0,0.034594,0.363747,0.532686,0.103565,0.001720,0.006799,0.001066,0.002585,0.673333
3,American Samoa,0.0,0.0,0.0,0.0,18.0,7766.0,8320.0,8341.0,8359.0,...,,,,,,,,,,
4,Andorra,0.0,897.0,9885.0,14498.0,35556.0,45508.0,47839.0,48015.0,48015.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Zambia,0.0,4328.0,53352.0,191527.0,304656.0,329483.0,339743.0,349287.0,349304.0,...,0.0,0.138822,0.263897,0.430860,0.305241,0.000157,0.000690,0.000394,0.000324,0.266667
223,Zimbabwe,0.0,2434.0,33271.0,97277.0,229415.0,256378.0,262302.0,265693.0,266288.0,...,0.0,0.237614,0.325493,0.017003,0.657503,0.000106,0.000015,0.000467,0.000178,0.126667
224,Timor-Leste,,,,,,,,,,...,,,,,,,,,,
225,"Taiwan, Province of China",,,,,,,,,,...,,,,,,,,,,


In [4]:
print(vaccine_donation_power_measurement.columns.tolist())


['Country', 'total_cases_2020-01', 'total_cases_2020-07', 'total_cases_2021-01', 'total_cases_2021-07', 'total_cases_2022-01', 'total_cases_2022-07', 'total_cases_2023-01', 'total_cases_2023-07', 'total_cases_2024-01', 'total_cases_2024-07', 'total_deaths_2020-01', 'total_deaths_2020-07', 'total_deaths_2021-01', 'total_deaths_2021-07', 'total_deaths_2022-01', 'total_deaths_2022-07', 'total_deaths_2023-01', 'total_deaths_2023-07', 'total_deaths_2024-01', 'total_deaths_2024-07', 'Rank_2020', 'Region_2020', 'Index Score_2020', 'Familiarity_2020', 'Influence_2020', 'Reputation_2020', 'Business & Trade_2020', 'Governance_2020', 'International Relations_2020', 'Culture & Heritage_2020', 'Media & Communication_2020', 'Education & Science_2020', 'People & Values_2020', 'Medals - General Public_2020', 'Medals - Specialist Audiences_2020', 'Rank 2021', 'Region_2021', 'Index Score 2021', 'Index Score Change_2021', 'Familiarity_2021', 'Reputation_2021', 'Influence_2021', 'Business & Trade_2021', '

# Reg 1: Baisc #
We estimated the following cross‐sectional OLS model, for each country $i$:
$$
\begin{aligned}
\Delta \mathrm{SoftPower}_i 
&= \mathrm{SoftPower}_{i,2024} \;-\; \mathrm{SoftPower}_{i,2020},\\[6pt]
\Delta \mathrm{SoftPower}_i 
&= \alpha 
  \;+\; \beta \,\ln\!\bigl(1 + \mathrm{TotalDoses}_i\bigr)
  \;+\; \varepsilon_i.
\end{aligned}
$$


Process for **Principal component analysis (PAC)** composite score:


1. **Data vector**
   $\displaystyle
     \mathbf{x}_{i,t}   = \bigl(\,\text{IndexScore}_{i,t},\;\ln(1+\text{SOFT}_{i,t-1_{[t=2020]}})\bigr)^\top
   $

2. **First principal component**
   $\displaystyle
     \mathbf{v} = (v_1,v_2)^\top   = \arg\max_{\|\mathbf{v}\|=1}\mathrm{Var}(X\mathbf{v})
   $

3. **PCA composite**
   $\displaystyle
     \text{CompPCA}_{i,t} = v_1\,\text{IndexScore}_{i,t} + v_2\,\ln(1+\text{SOFT}_{i,t-1_{[t=2020]}}).
   $

4. **Five‐year change**
   $\displaystyle
     \Delta\text{CompPCA}_i   = \text{CompPCA}_{i,2024} - \text{CompPCA}_{i,2020}.
   $


Process to build the **Z-score average**:

1. **Log‐transform** the SoftScore to tame its skew:

   $$
     \text{LogSoft}_i = \ln\bigl(1 + \text{SoftScore}_i\bigr).
   $$

2. **Compute z-scores** for each component (mean = 0, SD = 1):

   $$
     z_{\text{Index},i}
     = \frac{\text{IndexScore}_i - \overline{\text{IndexScore}}}
            {\mathrm{SD}(\text{IndexScore})},
     \qquad
     z_{\text{LogSoft},i}
     = \frac{\text{LogSoft}_i - \overline{\text{LogSoft}}}
            {\mathrm{SD}(\text{LogSoft})}.
   $$

3. **Average those two z-scores** to get the composite:

   $$
       \text{Composite}_{i}
       = \frac{1}{2}\,\Bigl(z_{\text{Index},i} + z_{\text{LogSoft},i}\Bigr).
   $$


In [5]:
# ── 1) Five-year changes ─────────────────────────────────────────────────────
vdp = vaccine_donation_power_measurement.copy()

# Endpoint deltas
vdp['soft_delta']  = vdp['SOFT_2024']        - vdp['SOFT_2019']
vdp['index_delta'] = vdp['Index Score 2024'] - vdp['Index Score_2020']

# Log-transform Soft at endpoints (for PCA component)
vdp['LogSoft_2019'] = np.log1p(vdp['SOFT_2019'])
vdp['LogSoft_2024'] = np.log1p(vdp['SOFT_2024'])

# Doses (log1p so zero donors map to 0)
vdp['log_doses'] = np.log1p(vdp['Total Doses'])

# ── 2) PCA composite at endpoints (raw-scale PCA; sklearn centers only) ─────
# Note: Not standardizing means Index scale can dominate the PCA by design.
df_end = vdp[['Index Score_2020','LogSoft_2019','Index Score 2024','LogSoft_2024','Total Doses']].dropna().copy()

# Pool endpoints with common column names
df_2020 = df_end[['Index Score_2020','LogSoft_2019']].rename(
    columns={'Index Score_2020':'Index','LogSoft_2019':'LogSoft'}
)
df_2024 = df_end[['Index Score 2024','LogSoft_2024']].rename(
    columns={'Index Score 2024':'Index','LogSoft_2024':'LogSoft'}
)
pca_data = pd.concat([df_2020, df_2024], ignore_index=True)

# Fit PCA (centers, no scaling)
pca = PCA(n_components=1, random_state=0).fit(pca_data[['Index','LogSoft']])
comp_vec = pca.components_[0]         # (2,)
mean_vec = pca.mean_                  # (2,)

# Fix sign so loading on Index is positive
if comp_vec[0] < 0:
    comp_vec = -comp_vec

# Project endpoints manually: (X - mean) @ comp_vec
X20 = df_2020[['Index','LogSoft']].values - mean_vec
X24 = df_2024[['Index','LogSoft']].values - mean_vec
df_end['comp_pca_2020']  = X20 @ comp_vec
df_end['comp_pca_2024']  = X24 @ comp_vec
df_end['comp_pca_delta'] = df_end['comp_pca_2024'] - df_end['comp_pca_2020']

# Merge PCA delta back to vdp (index alignment)
vdp = vdp.join(df_end['comp_pca_delta'], how='left')

# ── 3) Regressions ───────────────────────────────────────────────────────────
df2 = vdp[['log_doses','index_delta','soft_delta','comp_pca_delta']].dropna()

X = sm.add_constant(df2['log_doses'])
m1 = sm.OLS(df2['index_delta'],    X).fit()  # ΔIndex    ~ log_doses
m2 = sm.OLS(df2['soft_delta'],     X).fit()  # ΔSOFT     ~ log_doses
m3 = sm.OLS(df2['comp_pca_delta'], X).fit()  # ΔComp_PCA ~ log_doses

# ── 4) Show with Stargazer ───────────────────────────────────────────────────
stargazer = Stargazer([m1, m2, m3])
stargazer.title("Cross–Sectional OLS (2019→2024): Doses vs. ΔIndex, ΔSOFT, ΔComp_PCA")
stargazer.custom_columns(
    ["ΔIndex (log doses)", "ΔSOFT (log doses)", "ΔComp_PCA (log doses)"],
    [1, 1, 1]
)
stargazer.rename_covariates({'log_doses': 'log(Total Doses + 1)', 'const': 'Constant'})
stargazer.significant_digits(3)

display(HTML("<h3>Cross–Sectional OLS (2019→2024): Results</h3>"))
display(HTML(stargazer.render_html()))


0,1,2,3
,,,
,,,
,ΔIndex (log doses),ΔSOFT (log doses),ΔComp_PCA (log doses)
,(1),(2),(3)
,,,
Constant,4.362*,-49.259**,4.351*
,(2.356),(18.865),(2.352)
log(Total Doses + 1),0.329**,3.679***,0.328**
,(0.154),(1.230),(0.153)
Observations,48,48,48


# Reg 2: Disaggregated Models by Channel of Donation

$$
Y_{it} \;=\; 
\beta_{0}
\;+\;\beta_{1}\,\mathrm{Bilateral}_{it}^{\mathrm{pc}}
\;+\;\beta_{2}\,\mathrm{COVAX}_{it}^{\mathrm{pc}}
\;+\;\beta_{3}\,\mathrm{Multilateral}_{it}^{\mathrm{pc}}
\;+\;\beta_{4}\,\mathrm{AU}_{it}^{\mathrm{pc}}
\;+\;\beta_{5}\,\ln\bigl(\mathrm{Cases}_{it}\bigr)
\;+\;\beta_{6}\,\mathrm{GDP}_{it}^{\mathrm{pc}}
\;+\;\beta_{7}\,\mathrm{COVIDResp}_{it}
\;+\;\alpha_{i}
\;+\;\gamma_{t}
\;+\;\varepsilon_{it}\,.
$$

In [6]:
# ── 1) Prepare DataFrame for panel 2020–2024 ─────────────────────────────────
vdp = vaccine_donation_power_measurement.copy()
vdp['bilateral_pc']    = vdp['Bilateral']             / vdp['population_2020']
vdp['covax_pc']        = vdp['Through COVAX']         / vdp['population_2020']
vdp['multilateral_pc'] = vdp['Multilateral']          / vdp['population_2020']
vdp['au_pc']           = vdp['Through African Union'] / vdp['population_2020']

years  = [2020, 2021, 2022, 2023, 2024]
panels = []
for yr in years:
    df_y = vdp[['Country', 'bilateral_pc', 'covax_pc', 'multilateral_pc', 'au_pc']].copy()
    df_y['year']       = yr
    df_y['IndexScore'] = vdp['Index Score_2020'] if yr == 2020 else vdp[f'Index Score {yr}']
    df_y['Soft']       = vdp['SOFT_2019']        if yr == 2020 else vdp[f'SOFT_{yr}']
    df_y['LogSoft']    = np.log1p(df_y['Soft'])
    df_y['ln_cases']   = np.log1p(vdp[f'total_cases_{yr}-01'])
    df_y['gdp_pc']     = vdp.get(f'gdp_per_capita_{yr}', np.nan)
    df_y['covid_resp'] = vdp.get(f'COVID-19 Response_{yr}', np.nan)
    panels.append(df_y)

panel_df = pd.concat(panels, ignore_index=True)

# ── 2) PCA composite only (fit on endpoints; center; sign-fix on Index) ─────
endpts = panel_df.loc[panel_df['year'].isin([2020, 2024]), ['IndexScore','LogSoft']].dropna()
pca    = PCA(n_components=1, random_state=0).fit(endpts)

# Compute scores for all rows with complete data; keep NaN otherwise
panel_df['Comp_PCA'] = np.nan
mask = panel_df[['IndexScore','LogSoft']].notna().all(axis=1)
scores = pca.transform(panel_df.loc[mask, ['IndexScore','LogSoft']])[:, 0]
# Ensure the loading on Index is positive
if pca.components_[0, 0] < 0:
    scores = -scores
panel_df.loc[mask, 'Comp_PCA'] = scores

# ── 3) Run FE regressions for 3 outcomes (Index, Soft, PCA) ─────────────────
formula = (
    '{dep} ~ bilateral_pc + covax_pc + multilateral_pc + au_pc '
    '+ ln_cases + gdp_pc + covid_resp '
    '+ C(Country) + C(year)'
)

fe_models = []
for dep in ['IndexScore', 'Soft', 'Comp_PCA']:
    fe_models.append(smf.ols(formula.format(dep=dep), data=panel_df).fit())

# ── 4) Summarize with Stargazer ─────────────────────────────────────────────
model_names = ["IndexScore FE", "Soft FE", "Comp_PCA FE"]
stargazer = Stargazer(fe_models)
stargazer.custom_columns(model_names, [1, 1, 1])
stargazer.show_model_numbers(False)
stargazer.title("Disaggregated FE by Donation Channel (2020–2024)")
stargazer.rename_covariates({
    'Intercept':       'Constant',
    'bilateral_pc':    'Bilateral PC',
    'covax_pc':        'COVAX PC',
    'multilateral_pc': 'Multilateral PC',
    'au_pc':           'African Union PC',
    'ln_cases':        'ln(Cases)',
    'gdp_pc':          'GDP per Capita',
    'covid_resp':      'COVID-19 Response'
})
stargazer.add_line("Country FE", ["Yes"]*3)
stargazer.add_line("Year FE",    ["Yes"]*3)
stargazer.covariate_order([
    'Intercept',
    'bilateral_pc',
    'covax_pc',
    'multilateral_pc',
    'au_pc',
    'ln_cases',
    'gdp_pc',
    'covid_resp'
])
stargazer.significant_digits(3)
stargazer.significance_levels([0.1, 0.05, 0.01])

html_table = stargazer.render_html()
display(HTML("<h3>Disaggregated FE by Donation Channel (2020–2024)</h3>"))
display(HTML(html_table))

0,1,2,3
,,,
,,,
,IndexScore FE,Soft FE,Comp_PCA FE
,,,
Constant,17.036***,21.150**,-7.589***
,(1.669),(8.001),(1.681)
Bilateral PC,-1.440*,-9.516**,-1.598*
,(0.738),(3.944),(0.828)
COVAX PC,7.879***,34.125***,6.668***
,(1.757),(8.409),(1.767)


# Reg 3: with coverities #
$$
\Delta\mathrm{SoftPower}_i
= \alpha + \beta\,\ln(1+\mathrm{TotalDoses}_i)
+ \gamma_1\,\mathrm{libdem}_{i,2019}
+ \gamma_2\,\mathrm{gdp\_pc}_{i,2019}
+ \gamma_3\,\mathrm{case\_rate}_{i,2021}
+ \varepsilon_i
$$

In [17]:
vdp = vaccine_donation_power_measurement.copy()

# --- Endpoint changes (Index & SOFT) -----------------------------------------
vdp['soft_delta']  = vdp['SOFT_2024'] - vdp['SOFT_2019']
vdp['index_delta'] = vdp['Index Score 2024'] - vdp['Index Score_2020']

# --- PCA composite built from endpoints (no z-scores) ------------------------
vdp['LogSoft_2019'] = np.log1p(vdp['SOFT_2019'])
vdp['LogSoft_2024'] = np.log1p(vdp['SOFT_2024'])

mask = vdp[['Index Score_2020','LogSoft_2019','Index Score 2024','LogSoft_2024']].notna().all(axis=1)
pca_df = pd.concat([
    vdp.loc[mask, ['Index Score_2020','LogSoft_2019']].rename(
        columns={'Index Score_2020': 'Index', 'LogSoft_2019': 'LogSoft'}
    ),
    vdp.loc[mask, ['Index Score 2024','LogSoft_2024']].rename(
        columns={'Index Score 2024': 'Index', 'LogSoft_2024': 'LogSoft'}
    )
], ignore_index=True)

pca = PCA(n_components=1).fit(pca_df)
w_idx, w_ls = pca.components_[0]

vdp.loc[mask, 'comp_pca_2020'] = w_idx * vdp.loc[mask, 'Index Score_2020'] + w_ls * vdp.loc[mask, 'LogSoft_2019']
vdp.loc[mask, 'comp_pca_2024'] = w_idx * vdp.loc[mask, 'Index Score 2024'] + w_ls * vdp.loc[mask, 'LogSoft_2024']
vdp['comp_pca_delta'] = vdp['comp_pca_2024'] - vdp['comp_pca_2020']

# --- Controls & key regressor ------------------------------------------------
vdp['log_doses']      = np.log1p(vdp['Total Doses'])
vdp['libdem_2019']    = vdp['Liberal democracy index_2019']
vdp['gdp_pc_2019']    = vdp['gdp_per_capita_2019']
vdp['case_rate_2021'] = vdp['total_cases_2021-01'] / vdp['population_2020']

# --- Analysis dataset (keep Country for diagnostics) -------------------------
cols = [
    'index_delta','soft_delta','comp_pca_delta',
    'log_doses','libdem_2019','gdp_pc_2019','case_rate_2021'
]
df = vdp.dropna(subset=cols)[['Country'] + cols].copy()

# --- Mean-centering & interaction --------------------------------------------
df['log_doses_c']           = df['log_doses']   - df['log_doses'].mean()
df['libdem_2019_c']         = df['libdem_2019'] - df['libdem_2019'].mean()
df['logdoses_x_libdem_c']   = df['log_doses_c'] * df['libdem_2019_c']

# --- Regressions with HC3 robust SEs -----------------------------------------
dvs = ['index_delta','soft_delta','comp_pca_delta']
dv_names = ['ΔIndex','ΔSOFT','ΔComp_PCA']
models = []

for dv in dvs:
    X = sm.add_constant(
        df[['log_doses_c','libdem_2019_c','gdp_pc_2019','case_rate_2021','logdoses_x_libdem_c']]
    )
    model = sm.OLS(df[dv], X).fit(cov_type='HC3')  # robust SEs
    models.append(model)

# --- Stargazer table (robust by construction) --------------------------------
stargazer = Stargazer(models)
stargazer.title("Extended OLS (HC3) with Centered Interaction: ΔSoft Power vs. log Doses, Democracy & Controls")
stargazer.custom_columns(dv_names, [1, 1, 1])
stargazer.rename_covariates({
    'const':                 'Constant',
    'log_doses_c':           'log(Total Doses + 1) (centered)',
    'libdem_2019_c':         'Liberal Democracy (2019, centered)',
    'logdoses_x_libdem_c':   'Interaction: log(Doses) × Liberal Democracy',
    'gdp_pc_2019':           'GDP per Capita (2019)',
    'case_rate_2021':        'Case Rate (Jan 2021)'
})
stargazer.covariate_order([
    'const',
    'log_doses_c',
    'libdem_2019_c',
    'logdoses_x_libdem_c',
    'gdp_pc_2019',
    'case_rate_2021'
])
stargazer.significant_digits(3)
display(HTML(stargazer.render_html()))

0,1,2,3
,,,
,,,
,ΔIndex,ΔSOFT,ΔComp_PCA
,(1),(2),(3)
,,,
Constant,8.076***,8.772,8.056***
,(0.866),(6.068),(0.864)
log(Total Doses + 1) (centered),0.507***,4.363,0.506***
,(0.127),(3.162),(0.126)
"Liberal Democracy (2019, centered)",-5.146***,-27.905,-5.139***


# Reg bonus (appendix B) : Country Fixed‐Effects Panel Regression

\begin{equation}
Y_{it}
= \beta_{0}
+ \beta_{1}\,\ln(\mathrm{Cases}_{it})
+ \beta_{2}\,\mathrm{GDPpc}_{it}
+ \beta_{3}\,\mathrm{COVIDResp}_{it}
+ \beta_{3}\,\mathrm{DonationsPC}_{it}
+ \alpha_{i}
+ \gamma_{t}
+ \varepsilon_{it},
\end{equation}

where

* $\alpha_{i}$ are country fixed‐effects,
* $\gamma_{t}$ are year fixed‐effects,
* and $\varepsilon_{it}$ is the idiosyncratic error term.

In [7]:
# ── 0) Copy data ─────────────────────────────────────────────────────────────
vdp = vaccine_donation_power_measurement.copy()

# ── 1) Pivot to long format ──────────────────────────────────────────────────
years = [2020, 2021, 2022, 2023, 2024]
panels = []
for yr in years:
    df = vdp[['Country']].copy()
    df['year']       = yr
    df['IndexScore'] = vdp['Index Score_2020'] if yr == 2020 else vdp[f'Index Score {yr}']
    df['Soft']       = vdp['SOFT_2019']        if yr == 2020 else vdp[f'SOFT_{yr}']
    df['LogSoft']    = np.log1p(df['Soft'])
    df['dose_pc']    = vdp['Total Doses'] / vdp['population_2020']      # donations
    df['ln_cases']   = np.log1p(vdp[f'total_cases_{yr}-01'])
    df['gdp_pc']     = vdp.get(f'gdp_per_capita_{yr}', np.nan)
    df['covid_resp'] = vdp.get(f'COVID-19 Response_{yr}', np.nan)
    panels.append(df)

panel_df = pd.concat(panels, ignore_index=True)

# ── 2) (removed) z-score composite ───────────────────────────────────────────
# [deleted: zIndex, zLogSoft, Comp_z]

# ── 3) PCA composite ─────────────────────────────────────────────────────────
endpts = panel_df[panel_df['year'].isin([2020, 2024])][['IndexScore','LogSoft']].dropna()
pca   = PCA(n_components=1).fit(endpts)
w_idx, w_ls = pca.components_[0]
panel_df['Comp_PCA'] = w_idx*panel_df['IndexScore'] + w_ls*panel_df['LogSoft']

# ── 4) Fixed‐effects regressions INCLUDING donations ─────────────────────────
formula_base = (
    '{dep} ~ ln_cases + dose_pc + gdp_pc + covid_resp '
    '+ C(Country) + C(year)'
)

fe_models   = []
model_names = ["IndexScore FE","Soft FE","Comp_PCA FE"]
for dep in ['IndexScore','Soft','Comp_PCA']:
    fe_models.append(smf.ols(formula_base.format(dep=dep), data=panel_df).fit())

# ── 5) Summarize with Stargazer ─────────────────────────────────────────────
stargazer = Stargazer(fe_models)
stargazer.custom_columns(model_names, [1,1,1])
stargazer.show_model_numbers(False)

stargazer.title("Country & Time Fixed-Effects (with Donations)")
stargazer.rename_covariates({
    'Intercept':  'Constant',
    'ln_cases':   'ln(Cases)',
    'dose_pc':    'Vax Donations per Capita',
    'gdp_pc':     'GDP per Capita',
    'covid_resp': 'COVID-19 Response'
})
stargazer.add_line("Country FE", ["Yes"]*3)
stargazer.add_line("Year FE",    ["Yes"]*3)
stargazer.covariate_order([
    'Intercept',
    'ln_cases',
    'dose_pc',
    'gdp_pc',
    'covid_resp'
])
stargazer.significant_digits(3)
stargazer.significance_levels([0.1, 0.05, 0.01])

html = stargazer.render_html()
display(HTML("<h3>Country & Time Fixed-Effects (with Donations)</h3>"))
display(HTML(html))


0,1,2,3
,,,
,,,
,IndexScore FE,Soft FE,Comp_PCA FE
,,,
Constant,16.684***,19.801**,18.386***
,(1.642),(7.888),(1.657)
ln(Cases),-0.168,-0.921,-0.179
,(0.156),(0.716),(0.150)
Vax Donations per Capita,5.716***,23.210***,5.233***
,(1.529),(7.421),(1.559)
