In [45]:
import pandas as pd
import altair as alt
import pycountry as pc
from pycountry_convert import (
    country_alpha2_to_continent_code,
    country_alpha3_to_country_alpha2,
    country_name_to_country_alpha2
)

# Dataset preparation

In [46]:
dataset_df = pd.read_csv("../data/raw/Life Expectancy Data.csv")

In [47]:
#dataset_df = pd.read_csv("../data/processed/life_expectancy_data_processed.csv")

In [48]:
dataset_df["year"].unique

<bound method Series.unique of 0       2015
1       2014
2       2013
3       2012
4       2011
        ... 
2933    2004
2934    2003
2935    2002
2936    2001
2937    2000
Name: year, Length: 2938, dtype: int64>

In [49]:
dataset_df=dataset_df.copy()
dataset_df.loc[dataset_df["country"] == "Bolivia (Plurinational State of)", "country"] = "Bolivia, Plurinational State of"
dataset_df.loc[dataset_df["country"] == "Iran (Islamic Republic of)", "country"] = "Iran, Islamic Republic of"
dataset_df.loc[dataset_df["country"] == "Micronesia (Federated States of)", "country"] = "Micronesia, Federated States of"
dataset_df.loc[dataset_df["country"] == "Republic of Korea", "country"] = "Korea, Republic of"
dataset_df.loc[dataset_df["country"] == "The former Yugoslav republic of Macedonia", "country"] = "North Macedonia"
dataset_df.loc[dataset_df["country"] == "Venezuela (Bolivarian Republic of)", "country"] = "Venezuela, Bolivarian Republic of"
continent = []
continent_map = {
    "AS": "Asia",
    "EU": "Europe",
    "AF": "Africa",
    "NA": "North America",
    "SA": "South America",
    "OC": "Oceania",
}
for c in dataset_df.country.tolist():
    if country_name_to_country_alpha2(c) == "TL":
        continent.append(continent_map[country_alpha2_to_continent_code("TP")])
    else:
        continent.append(continent_map[country_alpha2_to_continent_code(country_name_to_country_alpha2(c))])
continent_df = pd.DataFrame(data={"country": dataset_df.country.tolist(), "continent": continent})

In [50]:
dataset_df = dataset_df.assign(continent = continent_df.continent)

In [51]:
dataset_df

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_B,measles,...,total_expenditure,diphtheria,hiv_aids,gdp,population,thinness_1_19_years,thinness_5_9_years,income_composition_of_resources,schooling,continent
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,Asia
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,Asia
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,Asia
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,Asia
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2,Africa
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5,Africa
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0,Africa
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8,Africa


# Cards - For Summary

## Worldwide

In [52]:
temp = dataset_df[dataset_df["year"].isin([2000, 2015, 2010, 2005])]
temp = temp.groupby("year").mean()[["life_expectancy"]].T
temp = temp.assign(perc_last_5 = (temp[2015] - temp[2010])/temp[2010]*100)
temp = temp.assign(perc_last_10 = (temp[2015] - temp[2005])/temp[2005]*100)
temp = temp.assign(perc_last_15 = (temp[2015] - temp[2000])/temp[2000]*100)
temp

year,2000,2005,2010,2015,perc_last_5,perc_last_10,perc_last_15
life_expectancy,66.750273,68.20929,70.048634,71.61694,2.238882,4.995874,7.290857


## Continent wise

In [53]:
temp = dataset_df[dataset_df["year"].isin([2000, 2015, 2010, 2005])]
temp = temp.groupby(["continent", "year"]).mean()[["life_expectancy"]].reset_index()
temp = temp.pivot(index="continent", columns="year", values="life_expectancy")
temp = temp.assign(perc_last_5 = (temp[2015] - temp[2010])/temp[2010]*100)
temp = temp.assign(perc_last_10 = (temp[2015] - temp[2005])/temp[2005]*100)
temp = temp.assign(perc_last_15 = (temp[2015] - temp[2000])/temp[2000]*100)
temp

year,2000,2005,2010,2015,perc_last_5,perc_last_10,perc_last_15
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,54.72037,57.12963,60.075926,62.666667,4.312444,9.692058,14.521642
Asia,69.044681,70.351064,72.114894,73.185106,1.484038,4.028429,5.996734
Europe,75.148718,76.458974,78.138462,79.041026,1.155083,3.377041,5.179473
North America,72.680952,73.07619,73.142857,75.014286,2.558594,2.652157,3.210378
Oceania,69.42,70.72,72.05,72.16,0.152672,2.036199,3.946989
South America,72.0,72.258333,73.458333,75.225,2.404991,4.105639,4.479167


## By Continent - Trend

In [54]:
temp = dataset_df[dataset_df["year"].isin([2000, 2015, 2010, 2005])]
temp = temp.groupby(["status", "year"]).mean()[["life_expectancy"]].reset_index()
temp = temp.pivot(index="status", columns="year", values="life_expectancy")
temp = temp.assign(perc_last_5 = (temp[2015] - temp[2010])/temp[2010]*100)
temp = temp.assign(perc_last_10 = (temp[2015] - temp[2005])/temp[2005]*100)
temp = temp.assign(perc_last_15 = (temp[2015] - temp[2000])/temp[2000]*100)
temp

year,2000,2005,2010,2015,perc_last_5,perc_last_10,perc_last_15
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Developed,76.803125,78.590625,80.146875,80.709375,0.701836,2.695932,5.086056
Developing,64.619868,66.009272,67.908609,69.690066,2.623315,5.576178,7.846192


In [55]:
a = (
    alt.Chart(
        dataset_df.groupby(["continent", "year"])
        .mean()["life_expectancy"]
        .reset_index()
    )
    .mark_line()
    .encode(
        x="year",
        y=alt.Y("sum(life_expectancy)", scale=alt.Scale(zero=False)),
        color="continent",
        tooltip="continent",
    )
)
a

## Country vs same continent vs rest of the world - Trend

In [56]:
chosen_country = "Canada"
chosen_year = 2000
sel_continent = dataset_df[dataset_df["country"] == chosen_country].head(1).continent.tolist()[0]

In [57]:
temp = (
    dataset_df.groupby("year")
    .mean()["life_expectancy"]
    .reset_index()
    .assign(label="Worldwide")
)
temp = pd.concat(
    [
        temp,
        dataset_df[dataset_df["continent"] == sel_continent]
        .groupby("year")
        .mean()["life_expectancy"]
        .reset_index()
        .assign(label=sel_continent)
    ],
    ignore_index=True,
)
temp = pd.concat(
    [
        temp,
        dataset_df.loc[
            dataset_df["country"] == chosen_country, ["year", "life_expectancy"]
        ].assign(label=chosen_country),
    ],
    ignore_index=True,
)
alt.Chart(temp[temp["year"] >= chosen_year]).mark_line().encode(
    x="year",
    y="life_expectancy",
    color="label"
)

## Developed vs Developing

In [75]:
b = alt.Chart(dataset_df.groupby(["status", "year"]).mean()["life_expectancy"].reset_index()).mark_line().encode(
    x=alt.X("year:N", axis=alt.Axis(labelAngle=0)),
    y=alt.Y("sum(life_expectancy)",scale=alt.Scale(zero=False)),
    color="status",
    tooltip="status"
)
b

## Effect of other factors

In [59]:
alt.Chart(
    dataset_df.query("year == 2003")
).mark_circle(size=100).encode(
    x=alt.X("BMI"),
    y=alt.Y("life_expectancy", title="Life Expectancy", scale=alt.Scale(zero=False)),
    color="status",
    #size=alt.Value("5"),
    tooltip="country",
)

In [60]:
dataset_df.to_csv("../data/processed/life_expectancy_data_processed.csv", index=False)

In [61]:
dataset_df

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_B,measles,...,total_expenditure,diphtheria,hiv_aids,gdp,population,thinness_1_19_years,thinness_5_9_years,income_composition_of_resources,schooling,continent
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,Asia
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,Asia
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,Asia
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,Asia
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2,Africa
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5,Africa
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0,Africa
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8,Africa


In [69]:
continent = dataset_df.continent.unique().tolist()
continent = ["Asia"]

In [70]:
dataset_df[dataset_df["continent"].isin(continent)]

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_B,measles,...,total_expenditure,diphtheria,hiv_aids,gdp,population,thinness_1_19_years,thinness_5_9_years,income_composition_of_resources,schooling,continent
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,Asia
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,Asia
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,Asia
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,Asia
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2901,Yemen,2004,Developing,62.2,247.0,42,0.06,0.000000,43.0,12708,...,4.90,72.0,0.1,,,13.9,13.9,0.464,8.4,Asia
2902,Yemen,2003,Developing,61.9,249.0,43,0.04,0.000000,38.0,8536,...,5.00,61.0,0.1,,,14.0,13.9,0.457,8.2,Asia
2903,Yemen,2002,Developing,61.5,25.0,45,0.07,0.000000,31.0,890,...,4.22,65.0,0.1,,,14.0,14.0,0.450,8.0,Asia
2904,Yemen,2001,Developing,61.1,251.0,46,0.08,0.000000,19.0,485,...,4.34,73.0,0.1,,,14.0,14.0,0.444,7.9,Asia


In [76]:
dataset_df.columns

Index(['country', 'year', 'status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'alcohol', 'percentage_expenditure', 'hepatitis_B',
       'measles', 'BMI', 'under_five_deaths', 'polio', 'total_expenditure',
       'diphtheria', 'hiv_aids', 'gdp', 'population', 'thinness_1_19_years',
       'thinness_5_9_years', 'income_composition_of_resources', 'schooling',
       'continent'],
      dtype='object')