# Notebook by M. Raza Khalid Saleemi
## Scatter Plot "Life Expectancy" vs " GDP per capita" with Population

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import imageio
import PIL
import matplotlib
from matplotlib.lines import Line2D
from platform import python_version
%matplotlib inline

In [2]:
print("Python version: %s" %python_version())
print("Conda version: %s" %PIL.__version__)
print("Pandas version: %s" %pd.__version__)
print("Numpy version: %s" %np.__version__)
print("matplotlib version: %s" %matplotlib.__version__)
print("imageio version: %s" %imageio.__version__)
print("PIL version: %s" %PIL.__version__)


Python version: 3.7.6
Conda version: 7.0.0
Pandas version: 1.0.1
Numpy version: 1.18.1
matplotlib version: 3.1.3
imageio version: 2.6.1
PIL version: 7.0.0


### Load required csv

In [3]:
df_total_pop = pd.read_csv("population_total.csv")
df_gdp_cap = pd.read_csv("income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
df_life_exp = pd.read_csv("life_expectancy_years.csv")
data_region = pd.read_csv('gapminder.csv') # For loading data for region

### Selecting columns of year from 1900 to 2018

In [4]:
df_total_pop.columns

Index(['country', '1800', '1801', '1802', '1803', '1804', '1805', '1806',
       '1807', '1808',
       ...
       '2091', '2092', '2093', '2094', '2095', '2096', '2097', '2098', '2099',
       '2100'],
      dtype='object', length=302)

In [5]:
df_gdp_cap.columns

Index(['country', '1800', '1801', '1802', '1803', '1804', '1805', '1806',
       '1807', '1808',
       ...
       '2031', '2032', '2033', '2034', '2035', '2036', '2037', '2038', '2039',
       '2040'],
      dtype='object', length=242)

In [6]:
df_life_exp.columns

Index(['country', '1800', '1801', '1802', '1803', '1804', '1805', '1806',
       '1807', '1808',
       ...
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018'],
      dtype='object', length=220)

In [7]:
#Dropping columns 1800 to 1899
for i in range(1800,1900):
    df_total_pop =df_total_pop .drop(columns = str(i))
    df_life_exp = df_life_exp.drop(columns= str(i))
    df_gdp_cap =df_gdp_cap.drop(columns= str(i))

In [8]:
#Dropping columns 2019 to 2100
for i in range(2019,2101):
    df_total_pop =df_total_pop .drop(columns = str(i))
for i in range(2019,2041):
    df_gdp_cap .drop(columns = str(i),inplace = True)

In [9]:
df_gdp_cap.head()

Unnamed: 0,country,1900,1901,1902,1903,1904,1905,1906,1907,1908,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Afghanistan,793,796,798,801,804,807,809,812,815,...,1530,1610,1660,1840,1810,1780,1750,1740,1800,1870
1,Albania,1160,1170,1190,1200,1220,1240,1250,1270,1290,...,9530,9930,10200,10400,10500,10700,11000,11400,11900,12400
2,Algeria,1750,1770,1790,1810,1830,1850,1870,1890,1910,...,12600,12900,13000,13200,13300,13500,13700,14000,13800,13700
3,Andorra,3200,3230,3270,3310,3350,3390,3430,3470,3510,...,41700,39000,42000,41900,43700,44900,46600,48200,49800,51500
4,Angola,958,962,967,971,975,979,984,988,992,...,5910,5900,5910,6000,6190,6260,6230,6030,5940,5850


### Removing countries from df_total_pop and df_gdp_cap which are not in df_life_exp

In [10]:
df_life_exp.shape

(187, 120)

In [11]:
df_total_pop.shape

(195, 120)

In [12]:
df_gdp_cap.shape

(193, 120)

In [13]:
print(list(df_total_pop['country'].unique()))

['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy See', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 

In [14]:
# removing country data in df_total_pop, which is not given in df_life_exp 
index = []
for j,x in enumerate(df_total_pop['country'].unique()):
    found = False
    for i,y in enumerate(df_life_exp['country'].unique()):
        if df_life_exp['country'].unique()[i] == x:
            
            found = True
            break
    if not found:
        index += [j]
        

df_total_pop.drop(index, inplace = True)
df_total_pop.shape

(187, 120)

In [15]:
# removing country data in df_gdp_cap, which is not given in df_life_exp 
index = []
for j,x in enumerate(df_gdp_cap['country'].unique()):
    found = False
    for i,y in enumerate(df_life_exp['country'].unique()):
        if df_life_exp['country'].unique()[i] == x:
            found = True
            break
    if not found:
        index += [j]

df_gdp_cap.drop(index, inplace=True)
df_gdp_cap.shape

(187, 120)

In [16]:
df_life_exp.shape[1]

120

### Adjusting the countries data sequence

In [17]:
list_countries_col = []

for i in range(df_life_exp.shape[0]):
    list_countries_col += [df_life_exp.loc[i,'country']]*(df_life_exp.shape[1]-1)
    
print(len(list_countries_col))
print(187*119)

22253
22253


In [18]:
years = list(df_life_exp.columns[1:].astype(int))
type(years)

list

In [19]:
list_year_col = [] 
for x in range(df_life_exp.shape[0]):
    list_year_col += years
#print(list_year_col)
print(len(list_year_col))
print(187*119)

22253
22253


In [20]:
list_life_exp_col = []

for i in range(df_life_exp.shape[0]):
    list_life_exp_col += list(df_life_exp.loc[i,'1900':'2018'])

#print(list_life_exp_col)
print(len(list_life_exp_col))
print(187*119)

22253
22253


In [21]:
df_gdp_cap.reset_index(inplace= True)

In [22]:
list_gdp_col = []

for i in range(df_life_exp.shape[0]):
    list_gdp_col += list(df_gdp_cap.loc[i,'1900':'2018'])

print(len(list_gdp_col))
print(187*119)

22253
22253


In [23]:
df_total_pop.reset_index(inplace=True)

In [24]:
list_total_pop_col = []

for i in range(df_life_exp.shape[0]):
    list_total_pop_col += list(df_total_pop.loc[i,'1900':'2018'])

print(len(list_total_pop_col))
print(187*119)

22253
22253


In [25]:
list_total_pop_mil_col = []
for i,x in enumerate(list_total_pop_col):
    list_total_pop_mil_col.append(x/1000000)

### making new DataFrame

In [26]:
data = {'country': list_countries_col ,'life_exp': list_life_exp_col, 'gdp' :  list_gdp_col, 'pop_mill': list_total_pop_mil_col, 'year': list_year_col}
newDF = pd.DataFrame(data)
newDF

Unnamed: 0,country,life_exp,gdp,pop_mill,year
0,Afghanistan,29.2,793,5.02,1900
1,Afghanistan,29.3,796,5.05,1901
2,Afghanistan,29.3,798,5.09,1902
3,Afghanistan,29.4,801,5.12,1903
4,Afghanistan,29.4,804,5.15,1904
...,...,...,...,...,...
22248,Zimbabwe,57.0,1910,15.40,2014
22249,Zimbabwe,58.3,1890,15.80,2015
22250,Zimbabwe,59.3,1860,16.20,2016
22251,Zimbabwe,59.8,1910,16.50,2017


## Defining function to save the plots and make Gifs

In [27]:
def fig2img ( fig ):
    """
    @brief Convert a Matplotlib figure to a PIL Image in RGBA format and return it
    @param fig a matplotlib figure
    @return a Python Imaging Library ( PIL ) image
    """
    # put the figure pixmap into a numpy array
    buf = fig2data ( fig )
    w, h, d = buf.shape
    im=PIL.Image.fromstring( "RGBA", ( w ,h ), buf.tostring())
    return im.convert(mode="RGB")


In [28]:
VALID_EXTENSIONS = ('png', 'jpg')
def create_gif(filenames, duration):
    images = []
    for filename in filenames:
        images.append(imageio.imread(filename))
    output_file = 'Gif-%s.gif' % datetime.datetime.now().strftime('%Y-%M-%d-%H-%M-%S')
    imageio.mimsave(output_file, images, duration=duration)


# Selecting Axes Limits

In [29]:
newDF.describe()

Unnamed: 0,life_exp,gdp,pop_mill,year
count,22037.0,22253.0,22253.0,22253.0
mean,52.479093,7416.622298,19.493662,1959.0
std,16.677604,13108.623558,83.701224,34.3519
min,1.09,247.0,0.00439,1900.0
25%,35.7,1320.0,0.76,1929.0
50%,53.5,2790.0,3.35,1959.0
75%,67.8,7630.0,10.2,1989.0
max,84.2,178000.0,1420.0,2018.0


In [30]:
newDF.describe()['life_exp']['max']

84.2

### Selecting Axes Limits: X-Axis

In [31]:
newDF.describe()['gdp']['min'],newDF.describe()['gdp']['max']

(247.0, 178000.0)

### Selecting Axes Limits: Y-Axis

In [32]:
newDF.describe()['life_exp']['min'],newDF.describe()['life_exp']['max']

(1.09, 84.2)

## Adding Column for Region

In [33]:
data_region.head()

Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [34]:
data_region[data_region['Country'] == 'Pakistan'].reset_index()['region'][0]

'South Asia'

In [35]:
regions= {}
for i,x in enumerate(set(data_region['Country'])):
    regions[x] =  data_region[data_region['Country'] == x].reset_index()['region'][0]

In [36]:
regions['Australia']

'East Asia & Pacific'

In [37]:
list(set(data_region['region']))


['America',
 'Europe & Central Asia',
 'Middle East & North Africa',
 'Sub-Saharan Africa',
 'East Asia & Pacific',
 'South Asia']

In [38]:
newDF['region']= " "
newDF.head()

Unnamed: 0,country,life_exp,gdp,pop_mill,year,region
0,Afghanistan,29.2,793,5.02,1900,
1,Afghanistan,29.3,796,5.05,1901,
2,Afghanistan,29.3,798,5.09,1902,
3,Afghanistan,29.4,801,5.12,1903,
4,Afghanistan,29.4,804,5.15,1904,


In [39]:
for i,x in enumerate(newDF['country']):
    try:
        newDF.region[i] = regions[x]
    except:
        newDF.region[i] = np.nan
        
newDF.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,country,life_exp,gdp,pop_mill,year,region
0,Afghanistan,29.2,793,5.02,1900,South Asia
1,Afghanistan,29.3,796,5.05,1901,South Asia
2,Afghanistan,29.3,798,5.09,1902,South Asia
3,Afghanistan,29.4,801,5.12,1903,South Asia
4,Afghanistan,29.4,804,5.15,1904,South Asia


In [40]:
newDF[newDF['region'].isnull()]

Unnamed: 0,country,life_exp,gdp,pop_mill,year,region
357,Andorra,,3200,0.00439,1900,
358,Andorra,,3230,0.00442,1901,
359,Andorra,,3270,0.00445,1902,
360,Andorra,,3310,0.00447,1903,
361,Andorra,,3350,0.00450,1904,
...,...,...,...,...,...,...
22010,Yemen,68.4,3770,26.20000,2014,
22011,Yemen,67.2,2640,26.90000,2015,
22012,Yemen,66.7,2330,27.60000,2016,
22013,Yemen,66.9,2380,28.30000,2017,


### To find 'region' for some countries still having unknown region

In [41]:
#we will be using this
import pycountry_convert as pc  #pip install pycountry-convert

country_code = pc.country_name_to_country_alpha2("Andorra", cn_name_format="default")
print(country_code)
continent_name = pc.country_alpha2_to_continent_code(country_code)
print(continent_name)

AD
EU


In [42]:
unknown_region_country= newDF[newDF['region'].isnull()]['country']
unknown_region_country

357      Andorra
358      Andorra
359      Andorra
360      Andorra
361      Andorra
          ...   
22010      Yemen
22011      Yemen
22012      Yemen
22013      Yemen
22014      Yemen
Name: country, Length: 1666, dtype: object

In [43]:
unknown_region_country_index = newDF[newDF['region'].isnull()].reset_index()['index']
unknown_region_country_index

0         357
1         358
2         359
3         360
4         361
        ...  
1661    22010
1662    22011
1663    22012
1664    22013
1665    22014
Name: index, Length: 1666, dtype: int64

In [44]:
newDF.loc[unknown_region_country_index[0], 'country']

'Andorra'

In [45]:
def Cont_Name(name):
    if name == 'OC':
        return 'East Asia & Pacific'
    if name == 'EU':
        return 'Europe & Central Asia'
    if name == 'NA':
        return 'America'
Cont_Name('OC')

'East Asia & Pacific'

In [46]:
#Using pycountry
for i,x in enumerate(unknown_region_country):
    try:
        country_code = pc.country_name_to_country_alpha2(x, cn_name_format="default")
        continent_name = pc.country_alpha2_to_continent_code(country_code)
        newDF.loc[unknown_region_country_index[i], 'region']=Cont_Name(continent_name)

    except:
        pass
 

### Some countries still not have region assigned

In [47]:
#countries still left
set(newDF[newDF['region'].isnull()]['country'])

{'Central African Republic',
 'Kyrgyz Republic',
 'Lao',
 'North Korea',
 'Palestine',
 'South Korea',
 'St. Vincent and the Grenadines',
 'Yemen'}

In [48]:
stillunknown_region_country= newDF[newDF['region'].isnull()]['country']
stillunknown_region_country

3808     Central African Republic
3809     Central African Republic
3810     Central African Republic
3811     Central African Republic
3812     Central African Republic
                   ...           
22010                       Yemen
22011                       Yemen
22012                       Yemen
22013                       Yemen
22014                       Yemen
Name: country, Length: 952, dtype: object

In [49]:
stillunknown_region_country_index = newDF[newDF['region'].isnull()].reset_index()['index']
stillunknown_region_country_index

0       3808
1       3809
2       3810
3       3811
4       3812
       ...  
947    22010
948    22011
949    22012
950    22013
951    22014
Name: index, Length: 952, dtype: int64

In [50]:
def Country_Manual_Assign(name):
    if name == 'Central African Republic':
        return 'Sub-Saharan Africa'
    if name == 'Kyrgyz Republic':
        return 'Europe & Central Asia'
    if name == 'Lao':
        return 'East Asia & Pacific'
    if name == 'North Korea':
        return 'East Asia & Pacific'
    if name == 'Palestine':
        return 'Middle East & North Africa'
    if name == 'South Korea':
         return 'East Asia & Pacific'
    if name == 'St. Vincent and the Grenadines':
        return 'America'
    if name == 'Yemen':
        return 'Middle East & North Africa'

In [51]:
for i,x in enumerate(stillunknown_region_country):
    newDF.loc[stillunknown_region_country_index[i], 'region']=Country_Manual_Assign(x)
    
#countries still left
set(newDF[newDF['region'].isnull()]['country'])

set()

## Color of the Regions

In [52]:
col = {'South Asia':'red','Europe & Central Asia':'green','Middle East & North Africa':'blue','Sub-Saharan Africa':'yellow',
       'America':'orange','East Asia & Pacific':'black'}

In [53]:
list(col)

['South Asia',
 'Europe & Central Asia',
 'Middle East & North Africa',
 'Sub-Saharan Africa',
 'America',
 'East Asia & Pacific']

In [54]:
newDF['color']=newDF['region'].map(col)
newDF.head()

Unnamed: 0,country,life_exp,gdp,pop_mill,year,region,color
0,Afghanistan,29.2,793,5.02,1900,South Asia,red
1,Afghanistan,29.3,796,5.05,1901,South Asia,red
2,Afghanistan,29.3,798,5.09,1902,South Asia,red
3,Afghanistan,29.4,801,5.12,1903,South Asia,red
4,Afghanistan,29.4,804,5.15,1904,South Asia,red


# Scatter Plot

In [55]:
filenames = []
fig = plt.figure(figsize=(15,10))

Title_font = {'family': 'Times New Roman', 'color':  'black', 'weight': 'normal', 'size': 40 }
for i in range(1900,2019):
    
    # Plotting 
    df_now = newDF[newDF['year']==i]
    plt.scatter(df_now['gdp'], df_now['life_exp'],s= 4* df_now['pop_mill'], c= df_now['color'], alpha =0.7)
    
    # Setting axes and labels
    plt.xscale('log')
    plt.xlabel('GDP per Capita [in USD]', fontsize=20)
    plt.ylabel('Life Expectancy [in years]', fontsize=20)
    plt.title('World Development In Year ' + str(i) + " ", fontsize= 35)
    tick_val = [1000,10000,100000]
    tick_lab = ['1k','10k','100k']
    plt.xticks(tick_val,tick_lab)
    plt.xlim(newDF.describe()['gdp']['min'],newDF.describe()['gdp']['max'])
    plt.ylim(newDF.describe()['life_exp']['min'],100)
    #plt.grid(b=None)
    
    
    #for legend 
    r_circle = Line2D([0], [0], marker='o', color='w', label='South Asia', markerfacecolor='red', markersize=16, linestyle= "", markeredgecolor= 'none')
    g_circle = Line2D([0], [0], marker='o', color='w', label='Europe & Central Asia', markerfacecolor='green', markersize=16, linestyle= "", markeredgecolor= 'none')
    b_circle = Line2D([0], [0], marker='o', color='w', label='Middle East & North Africa', markerfacecolor='blue', markersize=16, linestyle= "", markeredgecolor= 'none')
    y_circle = Line2D([0], [0], marker='o', color='w', label='Sub-Saharan Africa', markerfacecolor='yellow', markersize=16, linestyle= "", markeredgecolor= 'none')
    o_circle = Line2D([0], [0], marker='o', color='w', label='America', markerfacecolor='orange', markersize=16, linestyle= "", markeredgecolor= 'none')
    bk_circle = Line2D([0], [0], marker='o', color='w', label='East Asia & Pacific', markerfacecolor='black', markersize=16, linestyle= "", markeredgecolor= 'none')
    plt.legend(handles=[r_circle, g_circle, b_circle, y_circle, o_circle, bk_circle],loc="lower right",fontsize=16)
    
    #Year as a text
    font_year = {'family': 'Lucida Console', 'color':  'green', 'weight': 'normal', 'size': 150 }
    plt.text(300, 75, str(i), fontdict = font_year, alpha =0.3)
    font_name = {'family': 'Arial', 'color':  'black', 'weight': 'normal', 'size': 15 }
    plt.text(300, 2, 'Made by: Muhammad Raza Khalid Saleemi', fontdict = font_name)
    font_pop= {'family': 'Arial', 'color':  'green', 'weight': 'normal', 'size': 25 }
    plt.text(400, 70, 'Bubble Size : Population ', fontdict = font_pop, alpha =0.5)
    
    filename=str(i)+'.png'
    filenames.append(filename) 
    plt.savefig(filename, dpi=96)
    plt.gca()
    fig.clf()
duration =0.2
for i in range(5):
    filenames.append("2018.png")
create_gif(filenames,duration)

<Figure size 1080x720 with 0 Axes>

# Acknowledgement:  Dice Analytics (https://diceanalytics.pk/)