#  Dataviz project
## ARTAUD Lucas & SIVASUBRAMANIAM Iswarya DIA 1

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import plotly.express as px
import pycountry_convert as pc 
import plotly.graph_objects as go
import plotly.subplots as sp


## Dataset

In [2]:
# Load the life-expectancy dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Life Expectancy Data.csv")

In [3]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(2938, 22)

In [5]:
# Count the missing values in each column.
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [6]:
# Discard every row containing at least one missing value, modifying df in place.
df.dropna(axis=0, how="any", inplace=True)


## Context and motivation

#### The chosen dataset contains statistical data on factors influencing life expectancy. The data comes from the World Health Organization and covers a period of 15 years.  

#### Many studies in the past have explored the factors influencing life expectancy, centering on demographic variables, income composition, and mortality rates. However, these studies often neglected the impact of immunization and the Human Development Index. Additionally, some prior research relied on a one-year dataset for all countries, whereas conducting this research over a period of 15 years enables us to visualise the changes over time. This dataset allows us to make country-based observations to identify the main factors that contribute to lower life expectancy.  
#### This dataset covers health factors of 193 countries over 15 years (2000-2015). We have 22 columns and 2938 rows (before cleaning).

#### From our preliminary analysis, the Population, Hepatitis B and GDP columns contain most of the missing data. We decided to remove all rows with missing values in order to have clean data.

#### The motivation of our project is to analyze the different factors that influence life expectancy and to compare them on different scales (between continents, developed / developing countries, etc.).
#### We hope to learn more about the various factors affecting life expectancy and how those findings can guide public health interventions and policies. Targeted healthcare initiatives could be guided, for instance, by identifying particular regions or demographic groups experiencing challenges with life expectancy. Furthermore, knowledge of how social determinants, economic variables, and immunizations affect life expectancy can support evidence-based decision-making at the national and international levels. 

## Visualisation

### Creation of a new column continent

With the library pycountry_convert we are going to create a new column "Continent" that will correspond to the continent of the country.

In [7]:
# Lookup table: two-letter continent code -> full continent name.
continent_name = dict(
    AF='Africa',
    AS='Asia',
    EU='Europe',
    NA='North America',
    OC='Oceania',
    SA='South America',
    AN='Antarctica',
)

In [8]:
def convert(row):
    """Return the continent name for the country stored in ``row.Country``.

    Parameters
    ----------
    row : object with a ``Country`` attribute (e.g. a DataFrame row)
        The country name is resolved with ``pycountry_convert``.

    Returns
    -------
    str or None
        The matching entry of ``continent_name``, or ``None`` when the
        country name or its continent code cannot be resolved.
    """
    try:
        # Both lookups live inside the ``try`` so that a country name the
        # library does not recognise yields None instead of aborting the
        # whole ``DataFrame.apply`` call.
        country_code = pc.country_name_to_country_alpha2(row.Country, cn_name_format="default")
        continent_code = pc.country_alpha2_to_continent_code(country_code)
    except KeyError:
        # pycountry_convert reports unknown names/codes with KeyError; other
        # exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return None
    return continent_name.get(continent_code)

In [9]:
# Derive each row's continent from its country name by applying convert() row by row.
df['Continent'] = df.apply(convert, axis='columns')



In [10]:
# List the column labels; note that several carry leading/trailing spaces (e.g. 'Life expectancy ').
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling', 'Continent'],
      dtype='object')

### Global analysis

#### 1. Average life expectancy and population over the years

In [40]:
# Per-year means, each returned as a two-column DataFrame (Year + averaged value).
average_life_expectancy_yearly = df.groupby('Year', as_index=False)['Life expectancy '].mean()
average_population_yearly = df.groupby('Year', as_index=False)['Population'].mean()

In [53]:
# Two stacked line charts: yearly mean life expectancy on top, yearly mean population below.
fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=['Average Life Expectancy', 'Average Population'],
)

life_expectancy_trace = go.Scatter(
    x=average_life_expectancy_yearly['Year'],
    y=average_life_expectancy_yearly['Life expectancy '],
    mode='lines',
    name='Life Expectancy',
)
population_trace = go.Scatter(
    x=average_population_yearly['Year'],
    y=average_population_yearly['Population'],
    mode='lines',
    name='Population',
)

# Place each trace in its own subplot row.
fig.add_trace(life_expectancy_trace, row=1, col=1)
fig.add_trace(population_trace, row=2, col=1)

fig.update_layout(
    font={'size': 12},
    width=1200,
    height=600,
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
)
fig.show()

#### 2. Life Expectancy over the years of the top 10 and bottom 10 countries

In [16]:
# Mean life expectancy per country across all available years.
average_life_expectancy = df.groupby('Country', as_index=False)['Life expectancy '].mean()
# The ten countries with the highest mean ...
top10_countries = average_life_expectancy.nlargest(n=10, columns='Life expectancy ')
# ... and the ten countries with the lowest mean.
bottom10_countries = average_life_expectancy.nsmallest(n=10, columns='Life expectancy ')

In [17]:
# Restrict the original DataFrame to the rows of the top-10 and bottom-10 countries.
selected_countries = [*top10_countries['Country'], *bottom10_countries['Country']]
country_mask = df['Country'].isin(selected_countries)
filtered_df = df[country_mask]

In [26]:
# Line chart of life expectancy per year, one line per selected country.
# The title now matches the data: filtered_df holds the top 10 and bottom 10
# countries, not 5 of each.
fig = px.line(
    filtered_df,
    x='Year',
    y='Life expectancy ',
    color='Country',
    title='Life Expectancy Over the Years for the Top 10 and Bottom 10 Countries',
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Life Expectancy',
    legend_title='Country',
    font=dict(size=15),
    width=1500,
    height=1000,
    margin=dict(l=20, r=20, t=60, b=20),
)

fig.show()

#### 3. Violin Plot for Life Expectancy by Continent

In [37]:
# Violin plot (with embedded box plot) of life expectancy, one violin per continent.
# The title no longer mentions population, which this figure does not plot.
fig = px.violin(
    df,
    x='Continent',
    y='Life expectancy ',
    color='Continent',
    box=True,
    title='Violin Plot for Life Expectancy by Continent',
)

fig.update_layout(
    xaxis_title='Continent',
    yaxis_title='Life Expectancy',
    legend_title='Continent',
    font=dict(size=12),
    width=1200,
    height=800,
    margin=dict(l=20, r=20, t=60, b=20),
)

fig.show()

#### Correlation map in order to study the columns that are influencing the life expectancy

In [55]:
# Keep every numeric column for the correlation. Selecting only 'float64'
# would also discard the integer-typed columns, although they are numeric too.
numeric_columns = df.select_dtypes(include='number').columns
# Pairwise correlation between all numeric columns.
correlation_matrix = df[numeric_columns].corr()

In [64]:
# Render the correlation matrix as a heatmap.
heatmap_options = {
    'color_continuous_scale': 'inferno',
    'labels': {'x': 'Features', 'y': 'Features', 'color': 'Correlation'},
    'title': 'Correlation Heatmap',
    'width': 900,
    'height': 800,
}
fig = px.imshow(correlation_matrix, **heatmap_options)
fig.show()