In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In this notebook we merge the country name, religion, area, population, GDP per capita and `gov_type_num` features into a single dataframe. We use the ISO alpha-2 country code (ISO2) as the merge key.

In [2]:
# Mapping from the dataset's percentage-column codes to readable religion names
new_names = {
    'CHGENPCT': 'Christianism',
    'JDGENPCT': 'Judaism',
    'ISGENPCT': 'Islam',
    'BUGENPCT': 'Buddhism',
    'ZOGENPCT': 'Zoroastrian',
    'HIGENPCT': 'Hindu',
    'SIGENPCT': 'Sikh',
    'SHGENPCT': 'Shinto',
    'BAGENPCT': "Baha'i",
    'TAGENPCT': 'Taoism',
    'JAGENPCT': 'Jain',
    'COGENPCT': 'Confucianism',
    'SYGENPCT': 'Syncretic religions',
    'ANGENPCT': 'Animist religions',
    'NORELPCT': 'Non-religious',
    'OTGENPCT': 'Other religions',
}
religion_cols = list(new_names.values())

# Columns kept for the rest of the notebook: country code + one per religion
cols = ['ISO3'] + religion_cols

# Load the spreadsheet, keep the 2010 rows (the most recent data), apply the
# readable names, drop every other column and renumber the rows from 0
rel_df = pd.read_excel('World Religion Dataset - National Religion Dataset.xlsx')
rel_df = (
    rel_df[rel_df['YEAR'] == '2010']
    .rename(columns=new_names)
    .loc[:, cols]
    .reset_index(drop=True)
)

rel_df.head()

Unnamed: 0,ISO3,Christianism,Judaism,Islam,Buddhism,Zoroastrian,Hindu,Sikh,Shinto,Baha'i,Taoism,Jain,Confucianism,Syncretic religions,Animist religions,Non-religious,Other religions
0,USA,0.7454,0.018999,0.008999,0.010899,0.005299,0.005699,0.001299,0.0005,0.0015,0.0,0.0003,0.0003,0.002599,0.005699,0.19,0.0025
1,CAN,0.7661,0.009899,0.019399,0.019399,0.0002,0.007999,0.007999,0.0,0.0005,9.9e-05,9.9e-05,9.9e-05,0.0008,0.0021,0.1643,0.001
2,BHS,0.966,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0003,0.0,0.0,0.0,0.0032,0.028999,0.0005
3,CUB,0.6589,9.9e-05,0.0007,0.0,0.0,0.0022,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.1315,0.0
4,HTI,0.82,0.0,0.0002,0.0,0.0,0.0,0.0,0.0,0.0009,0.0,0.0,0.0,0.45,0.0,0.1,0.0


As we said, we will merge on ISO2, but we have ISO3 in rel_df. In order to map ISO3 to ISO2 codes we will use the data.pickle dataframe which contains both ISO codes.

In [3]:
# Load the country lookup table (it carries both ISO2 and ISO3 codes) and turn
# its index into a regular column so that it survives the merge below
data = pd.read_pickle('data.pickle').reset_index()

# Join the religion shares onto the lookup table via ISO3, keep the ISO2 code,
# the country name and the religion columns, then index the result by ISO2
final_rel_df = (
    data.merge(rel_df, on='ISO3')
    .loc[:, ['ISO2', 'name'] + religion_cols]
    .set_index('ISO2')
)
final_rel_df.head(2)

Unnamed: 0_level_0,name,Christianism,Judaism,Islam,Buddhism,Zoroastrian,Hindu,Sikh,Shinto,Baha'i,Taoism,Jain,Confucianism,Syncretic religions,Animist religions,Non-religious,Other religions
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AF,Afghanistan,0.0003,0.0,0.9956,9.9e-05,9.9e-05,0.0003,9.9e-05,0.0,9.9e-05,0.0,0.0,0.0,0.0,9.9e-05,0.002,0.0014
AO,Angola,0.8912,0.0,0.010399,9.9e-05,0.0,0.0,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,0.075899,0.017899,0.0044


Now we want to reformat the column names of the dataframe so that all religion columns end up grouped under a single top-level column name: `religion`.

In [4]:
# Creating the (top-level, sub-level) tuples for the religion columns.
# Every column after the first one ('name') is a religion share and is grouped
# under the top-level label 'religion'.
# get_level_values(-1) reads the innermost labels, so this cell gives the same
# result whether the columns are still flat (first run) or already hierarchical
# (re-run). Iterating over `columns[1:]` directly would nest the tuples on a
# second execution.
religion_array = [
    ('religion', rel)
    for rel in final_rel_df.columns.get_level_values(-1)[1:]
]

# Assigning the hierarchical column names ('name' gets an empty sub-level)
final_rel_df.columns = pd.MultiIndex.from_tuples([('name', '')] + religion_array)

In [5]:
# Preview the first rows to check the two-level column header
final_rel_df.head()

Unnamed: 0_level_0,name,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion
Unnamed: 0_level_1,Unnamed: 1_level_1,Christianism,Judaism,Islam,Buddhism,Zoroastrian,Hindu,Sikh,Shinto,Baha'i,Taoism,Jain,Confucianism,Syncretic religions,Animist religions,Non-religious,Other religions
ISO2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
AF,Afghanistan,0.0003,0.0,0.9956,9.9e-05,9.9e-05,0.0003,9.9e-05,0.0,9.9e-05,0.0,0.0,0.0,0.0,9.9e-05,0.002,0.0014
AO,Angola,0.8912,0.0,0.010399,9.9e-05,0.0,0.0,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,0.075899,0.017899,0.0044
AL,Albania,0.2144,0.0,0.63,0.0,0.0,0.0,0.0,0.0,0.0022,0.0,0.0,0.0,0.0,0.0,0.1507,0.0027
AD,Andorra,0.907,0.0,0.008999,0.0,0.0,0.0035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0796,0.0009
AE,United Arab Emirates,0.0714,0.0,0.6748,0.0035,0.0,0.2225,0.0,0.0,0.009999,0.0,0.0,0.0,0.0,0.0,0.013599,0.0041


In [6]:
# Index the lookup table by its ISO2 code so it lines up with final_rel_df
data = data.set_index('ISO2')

In [7]:
# Keep only the area, population and 2016 GDP-per-capita columns
area_POP_gdp_df = data.loc[:, ['area', 'POP', '2016_gdp_capita']]
area_POP_gdp_df.head()

Unnamed: 0_level_0,area,POP,2016_gdp_capita
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AW,180.0,,
AF,652230.0,27000000.0,561.778746
AO,1246700.0,19114176.0,3110.808183
AI,91.0,,
AX,1580.0,,


In [8]:
# Single-column dataframe holding the numeric government-type code
gov_type_df = data.loc[:, ['gov_type_num']]
gov_type_df.head()

Unnamed: 0_level_0,gov_type_num
ISO2,Unnamed: 1_level_1
AW,-1.0
AF,1.0
AO,1.0
AI,-1.0
AX,0.0


Finally, we merge all the dataframes by the index ISO2.

In [9]:
# Column-wise join of the three dataframes, aligned on their ISO2 index:
# religion + area/population/GDP first, then the government type on top
df = pd.concat(
    [pd.concat([final_rel_df, area_POP_gdp_df], axis=1), gov_type_df],
    axis=1,
)

# Give every column a two-level label; the non-religion features get an
# empty second level
single_level_cols = [
    (label, '') for label in ('area', 'POP', '2016_gdp_capita', 'gov_type_num')
]
df.columns = pd.MultiIndex.from_tuples(
    [('name', '')] + religion_array + single_level_cols
)
df.head()

Unnamed: 0_level_0,name,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,area,POP,2016_gdp_capita,gov_type_num
Unnamed: 0_level_1,Unnamed: 1_level_1,Christianism,Judaism,Islam,Buddhism,Zoroastrian,Hindu,Sikh,Shinto,Baha'i,...,Jain,Confucianism,Syncretic religions,Animist religions,Non-religious,Other religions,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,Andorra,0.907,0.0,0.008999,0.0,0.0,0.0035,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0796,0.0009,468.0,85500.0,,-1.0
AE,United Arab Emirates,0.0714,0.0,0.6748,0.0035,0.0,0.2225,0.0,0.0,0.009999,...,0.0,0.0,0.0,0.0,0.013599,0.0041,83600.0,6236650.0,37622.207458,-1.0
AF,Afghanistan,0.0003,0.0,0.9956,9.9e-05,9.9e-05,0.0003,9.9e-05,0.0,9.9e-05,...,0.0,0.0,0.0,9.9e-05,0.002,0.0014,652230.0,27000000.0,561.778746,1.0
AG,,,,,,,,,,,...,,,,,,,442.0,,14353.378814,-1.0
AI,,,,,,,,,,,...,,,,,,,91.0,,,-1.0


After merging, we get quite a few NaN values because the dataframes don't contain exactly the same countries, so we want to check roughly how many NaN values there are.

In [80]:
# Rows containing at least one NaN value
rows_with_nan = df[df.isnull().any(axis=1)]

# Histogram of every numeric feature, laid out on a 20 x 3 grid of axes
# (the trailing ';' suppresses the array-of-axes repr)
df.hist(figsize=(20, 100), layout=(20, 3));
plt.tight_layout()

# Last expression of the cell, so the incomplete rows are rendered as a table
rows_with_nan

Unnamed: 0_level_0,name,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,area,POP,2016_gdp_capita,gov_type_num
Unnamed: 0_level_1,Unnamed: 1_level_1,Christianism,Judaism,Islam,Buddhism,Zoroastrian,Hindu,Sikh,Shinto,Baha'i,...,Jain,Confucianism,Syncretic religions,Animist religions,Non-religious,Other religions,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,Andorra,0.907000,0.000000,0.008999,0.0000,0.0,0.0035,0.0,0.0,0.000000,...,0.0,0.0000,0.0000,0.000000,0.079600,0.000900,468.00,85500.0,,-1.0
AG,,,,,,,,,,,...,,,,,,,442.00,,14353.378814,-1.0
AI,,,,,,,,,,,...,,,,,,,91.00,,,-1.0
AQ,,,,,,,,,,,...,,,,,,,14000000.00,,,0.0
AS,,,,,,,,,,,...,,,,,,,199.00,,,1.0
AW,,,,,,,,,,,...,,,,,,,180.00,,,-1.0
AX,,,,,,,,,,,...,,,,,,,1580.00,,,0.0
BL,,,,,,,,,,,...,,,,,,,21.00,,,0.0
BM,,,,,,,,,,,...,,,,,,,54.00,,,-1.0
BV,,,,,,,,,,,...,,,,,,,49.00,,,0.0


In [86]:
# Persist the merged feature dataframe to disk as a pickle file
df.to_pickle('6_feature_df.pickle')

# Delete all following cells

In [97]:
# Countries without a 2016 GDP-per-capita value, shown next to their population
gdp_missing = df['2016_gdp_capita'].isnull()
df.loc[gdp_missing, [('2016_gdp_capita', ''), ('POP', '')]]

Unnamed: 0,2016_gdp_capita,POP
,,
AD,,85500.0
AI,,
AQ,,
AS,,
AW,,
AX,,
BL,,
BM,,
BV,,


In [99]:
# Reload the lookup table from disk and look at the textual government type
data = pd.read_pickle('data.pickle')
data['gov_type']

name
Aruba                                    parliamentary democracy
Afghanistan                                presidential republic
Angola                                     presidential republic
Anguilla                                 parliamentary democracy
Åland Islands                                            unknown
Albania                                   parliamentary republic
Andorra                                  parliamentary democracy
United Arab Emirates                    federation of monarchies
Argentina                                  presidential republic
Armenia                                    presidential republic
American Samoa                            presidential democracy
Antarctica                                               unknown
French Southern and Antarctic Lands                      unknown
Antigua and Barbuda                      parliamentary democracy
Australia                                parliamentary democracy
Austria             