In [57]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import pycountry
import plotly.express as px

In [58]:
# Fetch the page and parse it.
url = "https://data.worldobesity.org/tables/prevalence-of-adult-overweight-obesity-2/?regionid=-1&incomegroupid=-1&msr=msr&breakdown=c"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of silently parsing an error page.
response.raise_for_status()
page = response.text
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid the "no parser specified" warning).
soup = bs(page, "html.parser")

In [59]:
# Collect every <table> element on the page and show how many there are
tables = soup.find_all(name="table")
len(tables)

1

In [60]:
# Inspect the first table to check that it is the one holding the data
tables[0]

<table class="results" id="results"><thead><tr><th class="sorttable_sorted">Country</th><th class="nosort">Income group</th><th class="">Year</th><th class="">Area</th><th class="">Age</th><th class="">Males<br/>Overweight<br/>(BMI 25-29.9kg/m²)</th><th class="">Males<br/>Obesity<br/>(BMI ≥30kg/m²)</th><th class="">Females<br/>Overweight<br/>(BMI 25-29.9kg/m²)</th><th class="">Females<br/>Obesity<br/>(BMI ≥30kg/m²)</th><th class="">All adults<br/>Overweight<br/>(BMI 25-29.9kg/m²)</th><th class="">All adults<br/>Obesity<br/>(BMI ≥30kg/m²)</th></tr></thead><tbody><tr><td><span class="flag" style="background-image:url('gfx/flags-iso/AF.svg');"></span>Afghanistan</td><td>Low income</td><td>2018</td><td>National</td><td class="agebox">18-69</td><td>25.90</td><td>11.90</td><td>25.60</td><td>23.50</td><td>25.80</td><td>17.00</td></tr><tr class="selfreport"><td><span class="flag" style="background-image:url('gfx/flags-iso/AL.svg');"></span>Albania</td><td>Upper-middle income</td><td>2022</td><

In [61]:
# Convert the HTML table into a DataFrame.
# NOTE: 'Income grouop' is misspelled, but the name is kept unchanged because
# it is the column header that ends up in the exported data.
column_names = [
    'Country',
    'Income grouop',
    'Year',
    'Area',
    'Age',
    'Males_Overweight',
    'Males_Obesity',
    'Females_Overweight',
    'Females_Obesity',
    'All_adults_Overweight',
    'All_adults_Obesity'
]

# Gather one dict per data row and build the frame in a single call.
# Concatenating inside the loop re-copies the whole frame on every iteration,
# and concatenating onto an empty frame is deprecated in recent pandas.
records = []
for row in tables[0].find_all('tr')[1:]:  # [1:] skips the header row
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if len(cells) < len(column_names):
        # A row without a full set of <td> cells cannot be mapped onto the
        # columns (previously this raised IndexError); skip it.
        continue
    # zip() pairs the cells with the column names in order; any extra
    # trailing cells are ignored, as before.
    records.append(dict(zip(column_names, cells)))

# Passing `columns` fixes the column order and keeps the schema even when no
# rows were found.
df = pd.DataFrame(records, columns=column_names)

In [62]:
# Standard country codes and null handling

# Empty strings in the descriptive columns become NaN. The descriptive columns
# are the first five (Country, income group, Year, Area, Age); the earlier
# slice `:4` stopped one column short and never touched Age.
df.iloc[:, :5] = df.iloc[:, :5].replace('', np.nan)

cols = [
    'Males_Overweight',
    'Males_Obesity',
    'Females_Overweight',
    'Females_Obesity',
    'All_adults_Overweight',
    'All_adults_Obesity'
]
for col in cols:
    # '-' (and an empty cell) is treated as a missing measurement. Selecting the
    # numeric columns by name instead of by position keeps this cell safe to
    # re-run after extra columns have been appended to the frame.
    # Any other non-numeric text still raises in astype(float) so bad data
    # does not go unnoticed.
    df[col] = df[col].replace(['-', ''], np.nan).astype(float)

def country_to_iso3(name):
    """Return the ISO alpha-3 code for a country name, or None if unknown.

    Args:
        name: Country name (or any identifier accepted by
            ``pycountry.countries.lookup``). Non-string values such as
            NaN or None are treated as "no country".

    Returns:
        The three-letter code as a string, or None when the input is not a
        string or pycountry has no matching record.
    """
    # Missing values arrive as NaN/None; there is nothing to look up.
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # Catch only the "not found" error. A bare `except:` would also hide
        # KeyboardInterrupt and genuine programming errors.
        return None
    
# Add the ISO alpha-3 code for every country name (None when it cannot be resolved)
df["iso_alpha"] = df["Country"].map(country_to_iso3)

df

Unnamed: 0,Country,Income grouop,Year,Area,Age,Males_Overweight,Males_Obesity,Females_Overweight,Females_Obesity,All_adults_Overweight,All_adults_Obesity,iso_alpha
0,Afghanistan,Low income,2018,National,18-69,25.9,11.9,25.6,23.5,25.8,17.0,AFG
1,Albania,Upper-middle income,2022,National,16+,51.0,11.3,37.5,13.4,44.2,12.4,ALB
2,Algeria,Upper-middle income,2016-2017,National,18-69,34.2,14.1,33.2,30.1,33.8,21.8,DZA
3,American Samoa,High income,2017-2018,National,18+,,77.1,,82.7,,80.2,ASM
4,Andorra,High income,2017-2018,National,18-75,42.5,13.0,27.0,14.4,35.2,13.6,AND
...,...,...,...,...,...,...,...,...,...,...,...,...
243,Wales,,2022-2023,National,16+,40.2,25.0,30.1,27.2,35.2,26.0,
244,Wallis and Futuna,,2019,National,18-69,23.8,66.7,16.6,73.7,20.0,70.4,WLF
245,Yemen,Low income,2013,National,15-49,,,15.7,8.0,,,YEM
246,Zambia,Lower-middle income,2017,National,18-69,13.2,3.0,20.2,12.3,16.7,7.5,ZMB


In [63]:
# Output files: the same data is written once as CSV and once as JSON
csv_path, json_path = "datos.csv", "datos.json"

# CSV without the positional index column
df.to_csv(csv_path, index=False)

# JSON as a list of row objects, pretty-printed, keeping non-ASCII characters as-is
df.to_json(json_path, orient="records", indent=2, force_ascii=False)

In [64]:
# Example using plotly: world choropleth coloured by the All_adults_Obesity column

choropleth_options = dict(
    locations="iso_alpha",          # column holding the ISO-3 country codes
    locationmode="ISO-3",
    color="All_adults_Obesity",
    color_continuous_scale="blues",
    title="Mapa de calor mundial",
)
fig = px.choropleth(df, **choropleth_options)
fig.show()