In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

In [None]:
# Load the raw data set from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='all_data.csv')

In [None]:
# Number of non-missing cells in each column.
data.notna().sum()

In [None]:
# Share of missing values per column, expressed in percent.
# The denominator is the total row count (len), not the non-null count of
# `Country`: Series.count() skips NaN, so a gap in `Country` would shrink the
# denominator and distort the figure for every column.
maxrows = len(data)
print('Missing Data %')
# isna().sum() counts the missing cells per column; the factor 100 makes the
# printed numbers match the '%' header instead of being 0-1 fractions.
print(data.isna().sum() / maxrows * 100)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Relabel the four columns by position (the order must match the CSV layout).
data.columns = [
    'Country1',
    'Year',
    'Life_expectancy1',
    'GDP',
]

In [None]:
# Show the current column labels (after the positional relabelling).
data.columns

In [None]:
# Drop the trailing "1" from the two temporary labels; rename() returns a new
# frame, which is bound back to `data`.
data = data.rename(
    columns={
        'Country1': 'Country',
        'Life_expectancy1': 'Life_expectancy',
    }
)

In [None]:
# Show the column labels after the rename to 'Country' / 'Life_expectancy'.
data.columns

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Print the frame summary: per-column non-null counts and dtypes.
data.info()

In [None]:
# Rescale GDP by a factor of 1e9 so later axis labels can read "billion dollars".
# NOTE: this overwrites the column in place, so the cell is not idempotent;
# running it a second time divides by 1e9 again.
data['GDP'] = data['GDP'].div(1_000_000_000)

In [None]:
# Preview the first five rows again, now with the rescaled GDP column.
data.head(n=5)

In [None]:
# Report how many distinct countries the sample holds and list them.
n_countries = data['Country'].nunique()
country_names = data['Country'].unique()
print(f'''There are {n_countries} countries in the sample and these are as follows:
{country_names}''')

In [None]:
# Shorten the long country label so it fits on tick labels and legends.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the Series returned by `data.Country` is a chained in-place update, which
# pandas warns about and which no longer reaches `data` under Copy-on-Write.
# Later cells select rows with Country == 'USA', so the rename must really land.
data['Country'] = data['Country'].replace('United States of America', 'USA')

In [None]:
# Optional check (disabled): data.Country.unique() should now list 'USA' instead of 'United States of America'.

In [None]:
# Report the first year, the last year and the number of distinct years.
first_year = data['Year'].min()
last_year = data['Year'].max()
n_years = data['Year'].nunique()
print(f'''The analysis starts from {first_year} and ends at {last_year} meaning that the dataset
is including {n_years} consecutive years of GDP and Life expectancy data.''')

In [None]:
# Plain-text preview of the first five GDP values after rescaling.
print(data['GDP'].head(n=5))

In [None]:
# Side-by-side bar charts of GDP per country for the first and the last year
# present in the data set.
first_year, last_year = data.Year.min(), data.Year.max()

# Explicit figure/axes and one loop replace two copy-pasted pyplot panels,
# which had drifted apart (title fontsize 12 on the left, 11 on the right).
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the installed version before changing it.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, year in zip(axes, (first_year, last_year)):
    sns.barplot(x='Country', y='GDP', data=data[data.Year == year], palette='Spectral', ax=ax)
    ax.set_title(f'Countries and GDP-s in year {year}', fontsize=12)
    ax.set_xlabel('Countries', fontsize=10)
    ax.set_ylabel('GDP (billion dollars)', fontsize=10)
    ax.tick_params(axis='x', labelrotation=30)
fig.subplots_adjust(wspace=0.4)
plt.show()
# Close instead of plt.clf(): once show() has released the figure, clf() would
# create a fresh empty figure that the notebook renders as a blank output.
plt.close(fig)

In [None]:
# Side-by-side bar charts of life expectancy per country for the first and the
# last year present in the data set.
first_year, last_year = data.Year.min(), data.Year.max()

# Both panels share the same y-range (overall min..max of the column) so they
# can be compared directly. The axis therefore does not start at zero: bar
# heights show the distance from the overall minimum, not the absolute value.
life_limits = [data.Life_expectancy.min(), data.Life_expectancy.max()]

# Explicit figure/axes and one loop replace two copy-pasted pyplot panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, year in zip(axes, (first_year, last_year)):
    sns.barplot(x='Country', y='Life_expectancy', data=data[data.Year == year], palette='rocket', ax=ax)
    ax.set_title(f'Countries and Life expectancy in {year}', fontsize=12)
    ax.set_xlabel('Countries', fontsize=10)
    ax.set_ylabel('Expected age (years)', fontsize=10)
    ax.tick_params(axis='x', labelrotation=30)
    ax.set_ylim(life_limits)
fig.subplots_adjust(wspace=0.4)
plt.show()
# Close instead of plt.clf(): once show() has released the figure, clf() would
# create a fresh empty figure that the notebook renders as a blank output.
plt.close(fig)

In [None]:
# Time series per country: GDP on the left, life expectancy on the right.
n_years = data.Year.nunique()
n_countries = data.Country.nunique()

# Both panels share the same x ticks; only the y column, labels and y ticks
# differ, so those are listed once per panel instead of repeating the code.
xtick_val = [1998, 2002, 2006, 2010, 2014]
panels = [
    # (column, title prefix, y-axis label, y ticks)
    ('GDP', 'GDP', 'GDP (billion dollars)', [0, 2500, 7500, 12500, 17500]),
    ('Life_expectancy', 'Life expectancy', 'Expected age (years)', [50, 60, 70, 80]),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, (column, title_prefix, ylabel, ytick_val) in zip(axes, panels):
    sns.lineplot(x='Year', y=column, hue='Country', data=data, ax=ax)
    ax.set_title(f'{title_prefix} over {n_years} years in {n_countries} countries')
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    ax.set_xticks(xtick_val)
    ax.tick_params(axis='x', labelrotation=25)
    ax.set_yticks(ytick_val)
fig.subplots_adjust(wspace=0.2)
plt.show()
# Close instead of plt.clf(): once show() has released the figure, clf() would
# create a fresh empty figure that the notebook renders as a blank output.
plt.close(fig)

In [None]:
# One sub-frame per country, selected by an exact match on the Country label.
# 'USA' is the shortened label introduced earlier in the notebook.
chile = data.loc[data['Country'] == 'Chile']
china = data.loc[data['Country'] == 'China']
ger = data.loc[data['Country'] == 'Germany']
mex = data.loc[data['Country'] == 'Mexico']
usa = data.loc[data['Country'] == 'USA']
zimb = data.loc[data['Country'] == 'Zimbabwe']

In [None]:
# 3x2 grid of GDP vs. life-expectancy scatter plots, one panel per country.
# The six copy-pasted panels are replaced by one loop over the sub-frames.
panel_frames = [chile, china, ger, mex, usa, zimb]

fig, axes = plt.subplots(3, 2, figsize=(12, 18))
for ax, frame in zip(axes.flat, panel_frames):
    sns.scatterplot(x='GDP', y='Life_expectancy', data=frame, ax=ax)
    # Use the scalar label, not the unique() array: interpolating the array
    # rendered the title as "['Chile']". An empty sub-frame gets a placeholder
    # instead of raising IndexError.
    label = frame.Country.iloc[0] if not frame.empty else 'n/a'
    ax.set_title(f'GDP (bn $) and Life expectancy (yrs) in {label}', fontsize=10)
    if frame is usa:
        # NOTE(review): this fixed window hides any USA point whose GDP lies
        # outside 12500-17500 bn -- confirm against the data that none do.
        ax.set_xlim([12500, 17500])

fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.show()
# Close instead of plt.clf(): once show() has released the figure, clf() would
# create a fresh empty figure that the notebook renders as a blank output.
plt.close(fig)

In [None]:
# Per-country sub-frames collected in a list so later cells can loop over them.
countries = [
    chile,
    china,
    ger,
    mex,
    usa,
    zimb,
]

In [None]:
# One stand-alone GDP vs. life-expectancy scatter plot per country.
for country in countries:
    # A dedicated figure per iteration, closed explicitly afterwards. The
    # previous plt.clf() after plt.show() created a new empty figure each time,
    # and the last one was rendered as a blank output under the cell.
    fig, ax = plt.subplots()
    sns.scatterplot(x='GDP', y='Life_expectancy', data=country, ax=ax)
    ax.set_title(f'GDP (bn $) and Life expectancy (yrs) in {country.Country.unique()[0]}')
    plt.show()
    plt.close(fig)

In [None]:
# Silence every warning category from this point on.
# This is a blanket filter: it also hides deprecation notices raised by the
# plotting cells that follow.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Pairwise relationships between the numeric columns, coloured by country.
sns.pairplot(data=data, hue="Country")
plt.show()
# plt.close() instead of plt.clf(): after show() there is no current figure,
# so clf() would create a new empty one that shows up as a blank output.
# close() does nothing when no figure is open.
plt.close()

In [None]:
# GDP vs. life expectancy for all countries in a single scatter plot.
# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots()
sns.scatterplot(x='GDP', y='Life_expectancy', data=data, hue='Country', ax=ax)
ax.set_title(f'GDP and Life expectancy in {data.Country.nunique()} countries')
ax.set_xlabel('GDP (billion dollars)')
ax.set_ylabel('Expected age (years)')
ax.set_xticks(list(range(0, 20000, 5000)))
ax.set_yticks(list(range(40, 90, 10)))
plt.show()
# Close instead of plt.clf(): once show() has released the figure, clf() would
# create a fresh empty figure that the notebook renders as a blank output.
plt.close(fig)

In [None]:
# Pearson correlation between GDP and life expectancy for the USA rows.
# pearsonr returns (correlation coefficient, p-value): despite its name, `r1`
# holds the p-value and `corr1` the coefficient.
usa_pearson = pearsonr(usa['GDP'], usa['Life_expectancy'])
corr1, r1 = usa_pearson

In [None]:
# Report the Pearson correlation computed for the USA subset.
# Wording fix: the statistic is a correlation *coefficient*, not a "quotient".
print(f'''There is a strong positive relationship between the GDP and Life expectancy in the USA according to this plot.
The correlation coefficient is {corr1}.''')

In [None]:
# Cross-check with NumPy: the off-diagonal entry of the 2x2 correlation matrix
# is the GDP / life-expectancy correlation for the USA rows.
np.corrcoef(usa['GDP'], usa['Life_expectancy'])[0, 1]

In [None]:
# Pearson correlation between GDP and life expectancy across all rows.
# pearsonr returns (correlation coefficient, p-value): `r_all` is the p-value.
overall_pearson = pearsonr(data['GDP'], data['Life_expectancy'])
corr_all, r_all = overall_pearson

In [None]:
# Report the Pearson correlation computed over all rows.
# The country count is read from the data instead of being hard-coded as 6
# (as the plot titles already do), and the statistic is called by its proper
# name: correlation *coefficient*, not "quotient".
print(f'''If we analyse the {data.Country.nunique()} countries there is also a positive linear relationship between the GDP and Life expectancy,
because the correlation coefficient is {corr_all}. This is a moderate relation.''')

In [None]:
# Mean life expectancy over all Chile rows.
chile['Life_expectancy'].mean()


In [None]:
# Mean life expectancy over all China rows.
china['Life_expectancy'].mean()

In [None]:
# Mean life expectancy over all Germany rows.
ger['Life_expectancy'].mean()

In [None]:
# Mean life expectancy over all Mexico rows.
mex['Life_expectancy'].mean()

In [None]:
# Mean life expectancy over all USA rows.
usa['Life_expectancy'].mean()

In [None]:
# Mean life expectancy over all Zimbabwe rows.
zimb['Life_expectancy'].mean()

In [None]:
# Print the mean life expectancy for every country sub-frame.
for country in countries:
    mean_age = country.Life_expectancy.mean()
    # Format with one decimal instead of int(): int() truncates toward zero,
    # so a mean of 78.9 was reported as 78.
    print(f'The average Life expectancy in years in {country.Country.unique()[0]} is {mean_age:.1f}')

In [None]:
# Distribution of life expectancy over all rows, with a KDE overlay.
sns.displot(data.Life_expectancy, kde=True, color='darkblue')
plt.title(f'Life expectancy in {data.Country.nunique()} countries in {data.Year.nunique()} years', fontsize = 12)
plt.show()
# plt.close() instead of plt.clf(): after show() there is no current figure,
# so clf() would create a new empty one that shows up as a blank output.
# close() does nothing when no figure is open.
plt.close()

In [None]:
# Closing summary of the analysis, emitted as plain text.
conclusions = '''Conclusions:
      
1. There is a positive relationship between GDP and Life expectancy of one country, meaning that
if the annual GDP grows, the expected age at birth will be also increasing regardless the country is
developed or developing.
2. The average expected ages are very close to each other of every nations in the sample except Zimbabwe.
3. Both the annual GDP and the Life expectancy have increased during the overall period under examination.
4. There are 2 significant increasings/outliers in the sample. The first one is the GDP of China, the second one
is the Life expectancy in Zimbabwe.'''
print(conclusions)

In [None]:
# China's GDP value for the year 2000 (first matching row).
china_2000 = (data['Year'] == 2000) & (data['Country'] == 'China')
a = data.loc[china_2000, 'GDP'].iloc[0]

In [None]:
# Display China's GDP for 2000 as stored in the (rescaled) GDP column.
a

In [None]:
# China's GDP value for the year 2015 (first matching row).
china_2015 = (data['Year'] == 2015) & (data['Country'] == 'China')
b = data.loc[china_2015, 'GDP'].iloc[0]

In [None]:
# Growth factor: China's 2015 GDP divided by its 2000 GDP.
b/a