# Life Expectancy Analysis

In [56]:
# Dependencies and setup

import pandas as pd
import requests
import matplotlib.pyplot as plt

### Life Expectancy Dataset loaded and cleaned

In [57]:
Life_Exp_Male_df = pd.read_csv('Resources/WHO Life Expectancy Male.csv')
Life_Exp_Female_df = pd.read_csv('Resources/WHO Life Expectancy Female.csv')

# Rows with little or no data were removed from CSV file, easier to do in Excel. 

# Columns need to be converted to row data
Life_Exp_Male_df = pd.melt(Life_Exp_Male_df, id_vars = 'Country', var_name = 'Year', value_name = 'Life Expectancy Male')
Life_Exp_Female_df = pd.melt(Life_Exp_Female_df, id_vars = 'Country', var_name = 'Year', value_name = 'Life Expectancy Female')

# Merge Male / Female Data 
Life_Exp_df = pd.merge(Life_Exp_Male_df, Life_Exp_Female_df, on = ['Country', 'Year'])

# Reduce decimal places to 1
Life_Exp_df['Life Expectancy Male'] = Life_Exp_df['Life Expectancy Male'].round(1)
Life_Exp_df['Life Expectancy Female'] = Life_Exp_df['Life Expectancy Female'].round(1)

# Change any NaN values to ""
Life_Exp_df.fillna('', inplace = True)

Life_Exp_df

Unnamed: 0,Country,Year,Life Expectancy Male,Life Expectancy Female
0,Aruba,1960,61,68
1,Africa Eastern and Southern,1960,42,46
2,Afghanistan,1960,32,33
3,Africa Western and Central,1960,36,39
4,Angola,1960,37,40
...,...,...,...,...
16060,Kosovo,2022,,
16061,"Yemen, Rep.",2022,,
16062,South Africa,2022,,
16063,Zambia,2022,,


### Population Dataset loaded and cleaned

In [58]:
Population_df = pd.read_csv('Resources/WHO Population.csv')

# Columns need to be converted to row data
Population_df = pd.melt(Population_df, id_vars = 'Country', var_name = 'Year', value_name = 'Population')

# Merge Population with Main dataset
Life_Exp_df = pd.merge(Life_Exp_df, Population_df, on = ['Country', 'Year'])

Life_Exp_df

Unnamed: 0,Country,Year,Life Expectancy Male,Life Expectancy Female,Population
0,Aruba,1960,61,68,54608
1,Africa Eastern and Southern,1960,42,46,130692579
2,Afghanistan,1960,32,33,8622466
3,Africa Western and Central,1960,36,39,97256290
4,Angola,1960,37,40,5357195
...,...,...,...,...,...
16060,Kosovo,2022,,,1761985
16061,"Yemen, Rep.",2022,,,33696614
16062,South Africa,2022,,,59893885
16063,Zambia,2022,,,20017675


In [59]:
# How many countries in dataset?
Life_Exp_df['Country'].nunique()

255

### GDP Dataset loaded and cleaned

In [60]:
GDP_df = pd.read_csv('Resources/WHO GDP.csv')

# Columns need to be converted to row data
GDP_df = pd.melt(GDP_df, id_vars = 'Country', var_name = 'Year', value_name = 'GDP')

# Change any NaN values to ""
GDP_df.fillna('', inplace = True)

Display_df = pd.DataFrame(GDP_df)
pd.set_option('display.float_format', '{:,.0f}'.format)

GDP_df

Unnamed: 0,Country,Year,GDP
0,Aruba,1960,
1,Africa Eastern and Southern,1960,21125015452
2,Africa Western and Central,1960,10447637853
3,Angola,1960,
4,Albania,1960,
...,...,...,...
14485,Samoa,2022,832421565
14486,"Yemen, Rep.",2022,
14487,South Africa,2022,405870000000
14488,Zambia,2022,29784454056


In [61]:
# How many countries in GDP dataset?
GDP_df['Country'].nunique()

230

### Clean Water loaded and cleaned

In [75]:
Clean_Water_df = pd.read_csv('Resources/WHO Access to Clean Water.csv')
Clean_Water_df.head()

Clean_Water_df = pd.melt(Clean_Water_df, id_vars = 'Country', var_name = 'Year', value_name = '% Population using Clean Water')

Clean_Water_df.head()

Unnamed: 0,Country,Year,% Population using Clean Water
0,Afghanistan,2000,11
1,Africa Western and Central,2000,15
2,Albania,2000,49
3,Andorra,2000,91
4,Armenia,2000,80


In [76]:
Clean_Water_df['Country'].nunique()

145

### Sanitation loaded and cleaned

In [74]:
Sanitation_df = pd.read_csv('Resources/WHO Access to Sanitation.csv')
Sanitation_df.head()

Sanitation_df = pd.melt(Sanitation_df, id_vars = 'Country', var_name = 'Year', value_name = '% Population using Sanitation')

Sanitation_df.head()

Unnamed: 0,Country,Year,% Population using Sanitation
0,Africa Western and Central,2000,16.0
1,Albania,2000,40.0
2,Andorra,2000,15.0
3,Arab World,2000,39.0
4,United Arab Emirates,2000,


In [73]:
Sanitation_df['Country'].nunique()

151

In [81]:
Clean_Water_Sanitation_df = pd.merge(Clean_Water_df, Sanitation_df, on = ['Country', 'Year'])
Clean_Water_Sanitation_df.head()

Unnamed: 0,Country,Year,% Population using Clean Water,% Population using Sanitation
0,Africa Western and Central,2000,15,16
1,Albania,2000,49,40
2,Andorra,2000,91,15
3,Armenia,2000,80,46
4,Austria,2000,96,100


In [82]:
Clean_Water_Sanitation_df['Country'].nunique()

116

### OECD GDP Growth Dataset loaded and cleaned

In [None]:
# Load data into dataframe
GDP_Growth_df = pd.read_csv('Resources/OECD GDP Growth Data.csv')

In [None]:
# Removing rows where '1980' column has the value "no data"
GDP_Growth_df = GDP_Growth_df[(GDP_Growth_df['2000'] != 'no data') & (~GDP_Growth_df['2000'].isna())]

# Remove columns outside of 2000 - 2015
GDP_Growth_df = GDP_Growth_df[['Country', '2000', '2001', '2002', '2003', '2004', '2005', '2006','2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']]

In [None]:
# Reshape the GDP_Growth_df so that years are listed in rows instead of columns

GDP_Growth_df_melted = pd.melt(GDP_Growth_df, id_vars = 'Country', var_name = 'Year', value_name = 'GDP Growth %')

# Convert the 'Year' column to integers
GDP_Growth_df_melted['Year'] = GDP_Growth_df_melted['Year'].astype(int)

GDP_Growth_df_melted.head()

In [None]:
# How many countries in dataset?
GDP_Growth_df_melted['Country'].nunique()

### Line Chart showing life expectancy over time

In [None]:

# Grouping by 'Year' and 'Status' and calculating average life expectancy
m_grouped_df = Life_Exp_df.groupby(['Year'])['Life Expectancy Male'].mean().reset_index()
f_grouped_df = Life_Exp_df.groupby(['Year'])['Life Expectancy Female'].mean().reset_index()

# Creating the line chart
plt.figure(figsize =(10, 6))  

# Line plot for Males
plt.plot(m_grouped_df['Year'], m_grouped_df['Life Expectancy'], label='males', marker='o')

# Line plot for Females
plt.plot(f_grouped_df['Year'], f_grouped_df['Life Expectancy'], label='females', marker='o')

plt.xlabel('Year')
plt.ylabel('Average Life Expectancy')
plt.title('Average Life Expectancy by Year and Sex')
plt.ylim(0, 100)
plt.legend()
plt.xticks(rotation=45) 

plt.show()

### Scatter Plot showing Life Expectancy vs. Population

In [None]:
# Group by 'Country' and calculate the average 'Life Expectancy' and total 'Population'
grouped_df = Life_Exp_df.groupby('Country').agg({
    'Life Expectancy': 'mean',
    'Population': 'max'
}).reset_index()

# Reverse the x and y-axis data
x = grouped_df['Life Expectancy']
y = grouped_df['Population']

# Convert population values to millions for labeling
y_labels_millions = [val / 1000000 for val in y]

plt.figure(figsize=(10, 6))  # Adjust the size of the plot as needed.
plt.scatter(x, y, alpha=0.5)  # 'alpha' controls the transparency of the points.

plt.xlabel('Life Expectancy')
plt.ylabel('Population (Millions)')  # Label the y-axis with population in millions
plt.title('Scatter Plot: Population vs. Life Expectancy by Country')

plt.grid(True)  # Add a grid to the plot.
plt.tight_layout()  # Improve spacing between elements.

# Format y-axis labels to display population in millions
plt.gca().get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

plt.show()  # Display the plot.

In [None]:
Outliers_df = grouped_df[Life_Exp_df['Population'] > 40000000]
Outliers_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress

# Assuming you have a DataFrame named 'Life_Exp_Report_df' with columns 'Country', 'BMI', and 'Life Expectancy'

# Calculate average BMI and Life Expectancy grouped by 'Country'
avg_data = Life_Exp_Report_df.groupby('Country').mean([['BMI', 'Life Expectancy']])

x_avg = avg_data['BMI']
y_avg = avg_data['Life Expectancy']

plt.figure(figsize=(10, 6))  
plt.scatter(x_avg, y_avg, alpha=0.5)  

plt.xlabel('BMI')
plt.ylabel('Life Expectancy')
plt.title('Scatter Plot: Average BMI vs. Average Life Expectancy by Country')

# Calculate the linear regression
slope, intercept, r_value, p_value, std_err = linregress(x_avg, y_avg)

# Create the linear regression line using the calculated slope and intercept
regression_line = slope * x_avg + intercept

# Plot the linear regression line
plt.plot(x_avg, regression_line, color='red', linewidth=2)

plt.grid(True)
plt.tight_layout()

# Format y-axis labels to display population in millions (if needed)
# plt.gca().get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

plt.show()

In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
# Load the built-in world dataset from geopandas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Generate sample data for visualization
data = {
    'country': ['United States', 'Canada', 'Mexico'],
    'gdp_value': ['United States', 'Canada', 'Mexico']
}
gdp_df = pd.DataFrame(data)
# Merge GeoDataFrame with DataFrame based on country names
merged_gdf = world.merge(gdp_df, left_on='name', right_on='country', how='left')
# Set up the figure and axis
fig, ax = plt.subplots(1, figsize=(12, 8))
# Plot the choropleth map
merged_gdf.plot(column='gdp_value', cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
# Customize the plot (optional)
ax.set_title('World GDP Choropleth Map')
ax.axis('off')
# Show the plot
plt.show()
