# 10/30/2024

In [1]:
import pandas as pd
import plotly.express as px

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/plotly deprecation and
# copy-related warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Remove pandas' cap on the number of rows shown when a frame/series is displayed.
pd.set_option("display.max_rows", None)

## Define reusable functions

In [2]:
# Quartile labels, ordered from the lowest values (FIRST) to the highest (FOURTH).
FIRST = "First Quartile (<=25%)"
SECOND = "Second Quartile (25-50%)"
THIRD = "Third Quartile (50-75%)"
FOURTH = "Fourth Quartile (>75%)"

def assign_quartile(value, x, y, z):
    """Return the quartile label for ``value`` given three ascending cut points.

    Parameters
    ----------
    value : number
        The observation to classify.
    x, y, z : number
        The 25th, 50th and 75th percentile cut points, in that order.

    Returns
    -------
    str or None
        FIRST if ``value <= x``, SECOND if ``x < value <= y``, THIRD if
        ``y < value <= z``, otherwise FOURTH. A missing value (NaN/None)
        returns None: every comparison against NaN is False, so without this
        guard a missing value would be labelled FOURTH.
    """
    if pd.isna(value):
        return None
    # Each test is only reached when the previous one failed, so the lower
    # bound of every band is already guaranteed.
    if value <= x:
        return FIRST
    if value <= y:
        return SECOND
    if value <= z:
        return THIRD
    return FOURTH

## Load and Explore Data

In [3]:
# World Development Indicators CSV, fetched from GitHub (raw.githubusercontent.com).
WDI_CSV_URL = (
    "https://raw.githubusercontent.com/wcj365/python-stats-dataviz/"
    "refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv"
)
df = pd.read_csv(WDI_CSV_URL)

# Dimensions first, then a random three-row peek at the contents.
print(df.shape)
df.sample(3)

(4123, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
3849,2015,Uganda,864.180059,61.086,37477356.0,UGA,Sub-Saharan Africa,Low income,IDA
3501,2009,St. Vincent and the Grenadines,6503.097772,75.083,109840.0,VCT,Latin America & Caribbean,Upper middle income,Blend
1664,2015,Hungary,12717.038597,75.568293,9843028.0,HUN,Europe & Central Asia,High income,Not classified


In [4]:
# Long indicator headers -> short column names used in the rest of the notebook.
column_names_dict = dict([
    ("GDP per capita (current US$)", "GDP per Capita"),
    ("Life expectancy at birth, total (years)", "Life Expectancy"),
    ("Population, total", "Population"),
])

df = df.rename(columns=column_names_dict)
df.sample(3)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
938,2011,Cuba,6106.006792,77.76,11298710.0,CUB,Latin America & Caribbean,Upper middle income,Not classified
3399,2021,South Sudan,,54.975,10748272.0,SSD,Sub-Saharan Africa,Low income,IDA
1673,2005,Iceland,56794.850159,81.502439,296734.0,ISL,Europe & Central Asia,High income,Not classified


In [5]:
# Transpose a random three-row sample so each column name is shown as a row.
df.sample(3).T

Unnamed: 0,651,1390,1918
Year,2009,2007,2022
Country,Cameroon,Germany,Kazakhstan
GDP per Capita,1445.860249,41640.08087,11492.031939
Life Expectancy,56.102,79.534146,
Population,19319274.0,82266372.0,19621972.0
Country Code,CMR,DEU,KAZ
Region,Sub-Saharan Africa,Europe & Central Asia,Europe & Central Asia
Income Group,Lower middle income,High income,Upper middle income
Lending Type,Blend,Not classified,IBRD


## Homework Question 3 (bonus, will cover next week)

Step 1 - Create a column "Life Expectancy Quartile" similar to the GDP per Capita Quartile above.
Step 2 - Create a column "health and wealth status" based on the following definition:
- rich and healthy (1st GDP per capita quartile and 1st life expectancy quartile)
- poor and unhealthy (4th GDP per capita quartile and 4th life expectancy quartile)
- rich and unhealthy (1st GDP per capita quartile and 4th life expectancy quartile)
- poor and healthy (4th GDP per capita quartile and 1st life expectancy quartile)
- Other

Note: in this list "1st" means the top-ranked quartile. The code below labels quartiles from the lowest values upward, so the richest / longest-lived group is "Fourth Quartile (>75%)" and the poorest / shortest-lived group is "First Quartile (<=25%)".

In [6]:
# Re-check the frame's dimensions and look at five random rows.
print(df.shape)
df.sample(5)

(4123, 9)


Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
3741,2021,Trinidad and Tobago,16056.302041,72.971,1525663.0,TTO,Latin America & Caribbean,High income,IBRD
2549,2007,Mozambique,508.044896,52.566,21280513.0,MOZ,Sub-Saharan Africa,Low income,IDA
1610,2018,Haiti,1489.578406,64.019,11012421.0,HTI,Latin America & Caribbean,Lower middle income,IDA
563,2016,Bulgaria,7570.931655,74.812195,7127822.0,BGR,Europe & Central Asia,Upper middle income,IBRD
3910,2019,United Kingdom,42662.535374,81.404878,66836327.0,GBR,Europe & Central Asia,High income,Not classified


In [7]:
# Restrict the data to the rows whose Year is 2020.
is_2020 = df["Year"].eq(2020)
df_2020 = df.loc[is_2020]
print(df_2020.shape)
df_2020.sample(5)

(217, 9)


Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
282,2020,Bahrain,23433.187236,79.174,1477469.0,BHR,Middle East & North Africa,High income,Not classified
111,2020,Angola,1450.905112,62.261,33428486.0,AGO,Sub-Saharan Africa,Lower middle income,IBRD
2182,2020,Liechtenstein,165287.186767,81.658537,38756.0,LIE,Europe & Central Asia,High income,Not classified
3645,2020,Tanzania,1104.164429,66.408,61704518.0,TZA,Sub-Saharan Africa,Lower middle income,IDA
814,2020,Colombia,5304.289129,74.769,50930662.0,COL,Latin America & Caribbean,Upper middle income,IBRD


In [8]:
# Keep only rows that have both indicators. The explicit .copy() makes df_2020 an
# independent frame, so the columns added to it in later cells are written to it
# directly instead of to a filtered slice (the usual SettingWithCopy situation).
df_2020 = df_2020.dropna(subset=["GDP per Capita", "Life Expectancy"]).copy()
df_2020.shape

(202, 9)

In [9]:
# Step 1 (preparation): summarize 'Life Expectancy' to see its 25% / 50% / 75% values.
# This cell only displays the statistics; it does not add any column to df_2020.
df_2020['Life Expectancy'].describe()



Unnamed: 0,Life Expectancy
count,202.0
mean,72.314599
std,7.451962
min,52.777
25%,66.77975
50%,72.8715
75%,77.98175
max,85.497561


In [10]:
# Life-expectancy cut points (25th, 50th and 75th percentiles), taken with a single
# quantile call instead of recomputing describe() once per percentile.
# NOTE: x, y and z are re-bound to the GDP cut points in a later cell.
x, y, z = df_2020['Life Expectancy'].quantile([0.25, 0.5, 0.75])

In [11]:
# Label every row with its life-expectancy quartile, using the cut points x, y, z
# currently in scope.
df_2020['Life Expectancy Quartile'] = df_2020['Life Expectancy'].apply(
    lambda years: assign_quartile(years, x, y, z)
)
df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,Life Expectancy Quartile
2619,2020,Nauru,10124.700622,63.437,12315.0,NRU,East Asia & Pacific,High income,IBRD,First Quartile (<=25%)
1080,2020,Dominican Republic,7167.914974,72.889,10999664.0,DOM,Latin America & Caribbean,Upper middle income,IBRD,Third Quartile (50-75%)
681,2020,Canada,43349.677856,81.670488,38007166.0,CAN,North America,High income,Not classified,Fourth Quartile (>75%)
947,2020,Cuba,9499.572504,77.567,11300698.0,CUB,Latin America & Caribbean,Upper middle income,Not classified,Third Quartile (50-75%)
54,2020,Algeria,3354.153164,74.453,43451666.0,DZA,Middle East & North Africa,Lower middle income,IBRD,Third Quartile (50-75%)


In [12]:
# GDP-per-capita cut points (25th, 50th and 75th percentiles), taken with a single
# quantile call instead of recomputing describe() once per percentile.
# NOTE: this re-binds x, y and z, which previously held the life-expectancy cut points.
x, y, z = df_2020['GDP per Capita'].quantile([0.25, 0.5, 0.75])

print(x, y, z)

2188.047693164725 5920.260418960404 20761.21119943435


In [13]:
# Step 2 (first half): add a 'GDP Quartile' column that records which GDP-per-capita
# quartile each row falls in, using the cut points x, y, z currently in scope.
# The 'Health and Wealth Status' column itself is added in a later cell.
df_2020['GDP Quartile'] = df_2020['GDP per Capita'].apply(
    lambda gdp: assign_quartile(gdp, x, y, z)
)
df_2020.sample(10)


Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,Life Expectancy Quartile,GDP Quartile
3436,2020,Sri Lanka,3852.389091,76.393,21919000.0,LKA,South Asia,Lower middle income,IDA,Third Quartile (50-75%),Second Quartile (25-50%)
1688,2020,Iceland,58848.418124,83.063415,366463.0,ISL,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%)
453,2020,Bolivia,3068.812555,64.467,11936160.0,BOL,Latin America & Caribbean,Lower middle income,IBRD,First Quartile (<=25%),Second Quartile (25-50%)
1023,2020,Denmark,60836.592412,81.602439,5831404.0,DNK,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%)
111,2020,Angola,1450.905112,62.261,33428490.0,AGO,Sub-Saharan Africa,Lower middle income,IBRD,First Quartile (<=25%),First Quartile (<=25%)
2562,2020,Mozambique,456.581929,61.172,31178240.0,MOZ,Sub-Saharan Africa,Low income,IDA,First Quartile (<=25%),First Quartile (<=25%)
2220,2020,Luxembourg,116905.370397,82.143902,630419.0,LUX,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%)
1707,2020,India,1913.219733,70.15,1396387000.0,IND,South Asia,Lower middle income,IBRD,Second Quartile (25-50%),First Quartile (<=25%)
3531,2020,Sudan,608.33252,65.614,44440490.0,SDN,Sub-Saharan Africa,Low income,IDA,First Quartile (<=25%),First Quartile (<=25%)
3265,2020,Singapore,61273.991659,84.465854,5685807.0,SGP,East Asia & Pacific,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%)


In [14]:
def find_status(row):
    """Return the combined health/wealth label for one row.

    Looks up the pair (row['Life Expectancy Quartile'], row['GDP Quartile'])
    in a fixed table; any pair that is not listed yields 'Other'.
    """
    # Keys are (life-expectancy quartile, GDP quartile).
    status_by_quartiles = {
        (FIRST, FIRST): 'Poor and unhealthy',
        (FOURTH, FIRST): 'Poor but healthy',
        (FOURTH, FOURTH): 'Rich and healthy',
        (THIRD, FOURTH): 'Rich and somewhat healthy',
        (SECOND, FOURTH): 'Rich and somewhat unhealthy',
        (FIRST, FOURTH): 'Rich but unhealthy',
    }
    quartile_pair = (row['Life Expectancy Quartile'], row['GDP Quartile'])
    return status_by_quartiles.get(quartile_pair, 'Other')

# Classify every row; axis="columns" passes find_status one row at a time.
df_2020['Health and Wealth Status'] = df_2020.apply(find_status, axis="columns")
df_2020.sample(10)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,Life Expectancy Quartile,GDP Quartile,Health and Wealth Status
1878,2020,Japan,39986.928629,84.56,126261000.0,JPN,East Asia & Pacific,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy
1365,2020,"Gambia, The",704.030463,62.612,2573995.0,GMB,Sub-Saharan Africa,Low income,IDA,First Quartile (<=25%),First Quartile (<=25%),Poor and unhealthy
1042,2020,Djibouti,2921.738706,62.694,1090156.0,DJI,Middle East & North Africa,Lower middle income,IDA,First Quartile (<=25%),Second Quartile (25-50%),Other
548,2020,Brunei Darussalam,27179.352887,74.795,441725.0,BRN,East Asia & Pacific,High income,Not classified,Third Quartile (50-75%),Fourth Quartile (>75%),Rich and somewhat healthy
1935,2020,Kenya,1936.250755,62.675,51985780.0,KEN,Sub-Saharan Africa,Lower middle income,Blend,First Quartile (<=25%),First Quartile (<=25%),Poor and unhealthy
1061,2020,Dominica,7003.469891,73.649,71995.0,DMA,Latin America & Caribbean,Upper middle income,Blend,Third Quartile (50-75%),Third Quartile (50-75%),Other
3531,2020,Sudan,608.33252,65.614,44440486.0,SDN,Sub-Saharan Africa,Low income,IDA,First Quartile (<=25%),First Quartile (<=25%),Poor and unhealthy
491,2020,Botswana,5875.070606,65.647,2546402.0,BWA,Sub-Saharan Africa,Upper middle income,IBRD,First Quartile (<=25%),Second Quartile (25-50%),Other
1574,2020,Guinea-Bissau,710.258133,59.999,2015828.0,GNB,Sub-Saharan Africa,Low income,IDA,First Quartile (<=25%),First Quartile (<=25%),Poor and unhealthy
3189,2020,Senegal,1492.475903,68.006,16436120.0,SEN,Sub-Saharan Africa,Lower middle income,IDA,Second Quartile (25-50%),First Quartile (<=25%),Other


In [15]:
# Count how many rows of df_2020 carry each status label.
df_2020["Health and Wealth Status"].value_counts()

Unnamed: 0_level_0,count
Health and Wealth Status,Unnamed: 1_level_1
Other,113
Rich and healthy,42
Poor and unhealthy,38
Rich and somewhat healthy,7
Rich and somewhat unhealthy,2


# Countries that belong to each status category.

In [16]:
# Rows labelled "Rich and healthy": size of the group, then five random members.
is_rich_healthy = df_2020["Health and Wealth Status"].eq("Rich and healthy")
df_rich_healthy = df_2020.loc[is_rich_healthy]
print(df_rich_healthy.shape)
df_rich_healthy.sample(5)

(42, 12)


Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,Life Expectancy Quartile,GDP Quartile,Health and Wealth Status
2239,2020,"Macao SAR, China",37474.734595,85.184,676283.0,MAC,East Asia & Pacific,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy
358,2020,Belgium,45609.003494,80.695122,11538604.0,BEL,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy
3911,2020,United Kingdom,40217.009012,80.35122,67081234.0,GBR,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy
3322,2020,Slovenia,25558.429054,80.531707,2102419.0,SVN,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy
3569,2020,Sweden,52837.903978,82.356098,10353442.0,SWE,Europe & Central Asia,High income,Not classified,Fourth Quartile (>75%),Fourth Quartile (>75%),Rich and healthy


In [17]:
# List the distinct country names in the "Rich and healthy" group.
df_rich_healthy["Country"].unique()

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Bermuda', 'Canada',
       'Channel Islands', 'Cyprus', 'Czechia', 'Denmark', 'Estonia',
       'Faroe Islands', 'Finland', 'France', 'Germany',
       'Hong Kong SAR, China', 'Iceland', 'Ireland', 'Isle of Man',
       'Israel', 'Italy', 'Japan', 'Korea, Rep.', 'Liechtenstein',
       'Luxembourg', 'Macao SAR, China', 'Malta', 'Netherlands',
       'New Caledonia', 'New Zealand', 'Norway', 'Portugal',
       'Puerto Rico', 'Qatar', 'Singapore', 'Slovenia', 'Spain', 'Sweden',
       'Switzerland', 'United Arab Emirates', 'United Kingdom',
       'Virgin Islands (U.S.)'], dtype=object)

In [18]:
# Rows labelled "Poor and unhealthy": size of the group and its distinct countries.
is_poor_unhealthy = df_2020["Health and Wealth Status"].eq("Poor and unhealthy")
df_poor_unhealthy = df_2020.loc[is_poor_unhealthy]
print(df_poor_unhealthy.shape)
df_poor_unhealthy["Country"].unique()

(38, 12)


array(['Afghanistan', 'Angola', 'Benin', 'Burkina Faso', 'Burundi',
       'Cameroon', 'Central African Republic', 'Chad', 'Comoros',
       'Congo, Dem. Rep.', 'Congo, Rep.', 'Ethiopia', 'Gambia, The',
       'Ghana', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Kenya', 'Lesotho',
       'Liberia', 'Madagascar', 'Malawi', 'Mali', 'Mauritania',
       'Mozambique', 'Niger', 'Nigeria', 'Pakistan', 'Rwanda',
       'Sierra Leone', 'Somalia', 'Sudan', 'Tanzania', 'Togo', 'Uganda',
       'Yemen, Rep.', 'Zambia', 'Zimbabwe'], dtype=object)

In [19]:
# Rows labelled "Poor but healthy": size of the group and its distinct countries.
is_poor_healthy = df_2020["Health and Wealth Status"].eq("Poor but healthy")
df_poor_healthy = df_2020.loc[is_poor_healthy]
print(df_poor_healthy.shape)
df_poor_healthy["Country"].unique()

(0, 12)


array([], dtype=object)

In [20]:
# Rows labelled "Rich but unhealthy": size of the group and its distinct countries.
is_rich_unhealthy = df_2020["Health and Wealth Status"].eq("Rich but unhealthy")
df_rich_unhealthy = df_2020.loc[is_rich_unhealthy]
print(df_rich_unhealthy.shape)
df_rich_unhealthy["Country"].unique()

(0, 12)


array([], dtype=object)

In [21]:
# Display the "Rich but unhealthy" subset in full.
df_rich_unhealthy

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,Life Expectancy Quartile,GDP Quartile,Health and Wealth Status


In [22]:
# Rows labelled "Other", and how many distinct countries they cover.
is_other = df_2020["Health and Wealth Status"].eq("Other")
df_other = df_2020.loc[is_other]
df_other["Country"].nunique()

113

# Choropleth - from Greek "choros" (area, region) + "plethos" (multitude): a map whose regions are colored by a value.

Plot the health and wealth status of countries on a map using Plotly.

In [23]:
# Color each country by its health-and-wealth category.
# Countries are located by the three-letter code in "Country Code" rather than by
# display name: names in this dataset such as "Korea, Rep." or "Gambia, The" are
# not guaranteed to resolve under plotly's country-name matching, whereas codes
# are matched exactly. The readable name is still shown on hover.
fig = px.choropleth(
    df_2020,
    locations="Country Code",
    locationmode="ISO-3",
    color="Health and Wealth Status",
    hover_name="Country",
)

# Show the plot
fig.show()

## Homework Question 1:

Save the Plotly interactive chart to an HTML file.

Make sure to specify the option that reduces the size of the file.

In [24]:
import plotly.io as pio

# Build the titled choropleth that gets exported below.
# Countries are located by the three-letter code in "Country Code" rather than by
# display name, because names such as "Korea, Rep." or "Gambia, The" are not
# guaranteed to resolve under plotly's country-name matching.
fig = px.choropleth(
    df_2020,
    locations="Country Code",
    locationmode="ISO-3",
    color="Health and Wealth Status",
    hover_name="Country",
    title="Health and Wealth Status of Countries in 2020"
)

# Show the plot
fig.show()

# Save the chart to an HTML file. include_plotlyjs='cdn' makes the page load
# plotly.js from a CDN instead of embedding the library, which keeps the file small.
pio.write_html(fig, file="health_wealth_status_chart.html", include_plotlyjs='cdn')


# Homework Question 2:

Display country names on the choropleth. To keep the map from getting messy, try it on a dataframe that has only a limited number of countries. For example, filter your dataset to only the G7 and BRICS countries.

In [25]:
# G7 and BRICS members, spelled the way the "Country" column spells them.
# NOTE(review): the dataset uses World Bank style names (e.g. "Korea, Rep."), under
# which Russia is expected to appear as "Russian Federation" — the check below
# reports any name that did not match.
g7_brics_countries = ["United States", "Canada", "France", "Germany", "Italy", "Japan", "United Kingdom",
                      "Brazil", "Russian Federation", "India", "China", "South Africa"]

# Filter the dataframe to only include G7 and BRICS countries
df_g7_brics = df_2020[df_2020["Country"].isin(g7_brics_countries)]

# Surface silent mismatches instead of quietly plotting fewer countries.
missing_countries = sorted(set(g7_brics_countries) - set(df_g7_brics["Country"]))
if missing_countries:
    print(f"Not found in df_2020: {missing_countries}")

# Base map: one color per status, countries located by their three-letter code.
fig = px.choropleth(
    df_g7_brics,
    locations="Country Code",
    locationmode="ISO-3",
    color="Health and Wealth Status",
    hover_name="Country",
    title="Health and Wealth Status of G7 and BRICS Countries in 2020"
)

# Country-name labels: layout annotations are positioned in cartesian/paper
# coordinates and cannot be anchored to a country on a geo map, so the names are
# drawn with a text-only scattergeo trace located by the same country codes.
fig.add_scattergeo(
    locations=df_g7_brics["Country Code"],
    locationmode="ISO-3",
    text=df_g7_brics["Country"],
    mode="text",
    textfont=dict(size=10, color="black"),
    hoverinfo="skip",   # the choropleth layer already provides the hover box
    showlegend=False,   # keep the legend limited to the status colors
)

# Update layout for readability
fig.update_layout(margin={"r":0,"t":50,"l":0,"b":0}, showlegend=True)

# Show the plot
fig.show()




## Homework Question 3:

Explore the website - https://g7brics.streamlit.app/

