In [110]:
import pandas as pd
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
import collections
import re

## Dataset 1: https://github.com/mledoze/countries

In [111]:
def get_language(df):
    """Takes the dataframe and, from the original column dict_of_languages, returns a dictionary where items are like 
       {country: list_of_language_names}"""
    languages = dict()
    for country, language_dict in zip(df.name.values, df.dict_of_languages.values):
        languages.update({country: list(language_dict.values())})
    
    return languages

            
def get_language_codes(df):
    """Takes the dataframe and, from the original column dict_of_languages, returns a dictionary where items are like 
       {country: list_of_language_codes}"""
    
    language_codes = dict()
    for country, language_dict in zip(df.name.values, df.dict_of_languages.values):
        if len(language_dict.keys()) > 0:
            language_codes.update({country: list(language_dict.keys())})
        else:
            language_codes.update({country: ['unknown']})
    
    return language_codes

def get_country_name_native(df):
    """Takes the dataframe and, from the original column dict_of_names, returns a dictionary where items are like 
       {country: list_of_native_country_names}. 
       The dict_of_names column conains the name of the country in various ways. This function only takes the common
       name of the country in the native language (i.e. we could also take the official name in the native language)"""
    
    native_names_dict = dict()
    for country, country_name_dict in zip(df.name, df.dict_of_names.values):
        native_country_names = []
        for native_lang in country_name_dict['native']:
            native_country_names.append(country_name_dict['native'][native_lang]['common'])
        native_names_dict.update({country: native_country_names})
    
    return native_names_dict
            

def add_columns(df):
    """Adds the following columns to the dataframe: name, name_native, languages and language_codes"""
    
    # Adding name & name_native columns
    df['name'] = [name['common'] for name in df.dict_of_names.values]
    country_native_names = get_country_name_native(df)
    df['name_native'] = list(country_native_names.values())
    
    # Getting the dictionaries needed to add the language columns
    languages = get_language(df)
    language_codes = get_language_codes(df)
    
    # Adding languages & language codes columns
    df['languages'] = list(languages.values())
    df['language_codes'] = list(language_codes.values())

    return df

In [112]:
# Read csv and renaming columns
countries = pd.read_json('countries.json')
countries.rename(columns={'name': 'dict_of_names', 'languages': 'dict_of_languages'}, inplace=True)

# Adding columns
countries = add_columns(countries)
# Select useful columns & renaming them
cols = ['area', 'cca2', 'cca3', 'ccn3', 'borders', 'name', 'language_codes', 'latlng', 'languages', 'name_native']
countries = countries[cols]
countries.rename(columns={'cca2': 'ISO2', 'cca3': 'ISO3', 'ccn3': 'ISO_num'}, inplace=True)

countries.head()

Unnamed: 0,area,ISO2,ISO3,ISO_num,borders,name,language_codes,latlng,languages,name_native
0,180.0,AW,ABW,533,[],Aruba,"[nld, pap]","[12.5, -69.96666666]","[Dutch, Papiamento]","[Aruba, Aruba]"
1,652230.0,AF,AFG,4,"[IRN, PAK, TKM, UZB, TJK, CHN]",Afghanistan,"[prs, pus, tuk]","[33, 65]","[Dari, Pashto, Turkmen]","[افغانستان, افغانستان, Owganystan]"
2,1246700.0,AO,AGO,24,"[COG, COD, ZMB, NAM]",Angola,[por],"[-12.5, 18.5]",[Portuguese],[Angola]
3,91.0,AI,AIA,660,[],Anguilla,[eng],"[18.25, -63.16666666]",[English],[Anguilla]
4,1580.0,AX,ALA,248,[],Åland Islands,[swe],"[60.116667, 19.9]",[Swedish],[Åland]


## Dataset 2 : https://github.com/opendatajson/factbook.json

In this case we didn't have all the data in one file so the reading required more work. We created an empty dataframe and we run through all the folders and all the files inside them and appended the data to the dataframe, ignoring the index.

The dataset contains GEC code for the country names, so we will have to use another source to make the mapping of the GEC codes with the ISO codes.
From this dataset we will take the following features:
- gov_type
- GDP
- GDP per capita
- Population
- Interntet users
- Population in poverty
- Unemployment rate
- Religions

In [151]:
# Names of the folders
region_folders = ['africa', 'australia-oceania', 'central-america-n-caribbean', 'central-asia', 'east-n-southeast-asia',
          'europe', 'middle-east', 'north-america', 'south-america', 'south-asia']

# We use a temporaty df1 to load the data for a particular country
df1 = pd.DataFrame()

for region in region_folders:
    for country_file in os.listdir(r'factbook.json/' + region):
        df = pd.read_json(r'factbook.json/' + region + '/' + country_file)
        # Government type
        try:
            gov_type = df.loc['Government type', 'Government']['text']
        except:
            gov_type = 'unknown'
            
        # GDP
        try:
            gdp0 = df.loc['GDP (official exchange rate)', 'Economy']['text'].split()
            if gdp0[0] == '$NA':
                gdp = np.nan
            else:
                gdp = float(gdp0[0].replace('$', ''))
                if gdp0[1] == 'million':
                    gdp*=10**6
                elif gdp0[1] == 'billion':
                    gdp*=10**9
                elif gdp0[1] == 'trillion':
                    gdp*=10**12
        except:
            gdp = np.nan
            
        # GDP per capita
        try:
            gdp_capita = float(df.loc['GDP - per capita (PPP)', 'Economy']['text'].split(sep=' ')[0][1:].replace(',', ''))
        except:
            gdp_capita = np.nan
            
        # Population
        try:
            population = float(df.loc['Population', 'People and Society']['text'].split(sep=' ')[0].replace(',', ''))
        except:
            population = np.nan
           
        # Internet users
        try:    
            users0 = df.loc['Internet users', 'Communications']['total']['text'].split()
            users = float(users0[0].replace(',', ''))
            if len(users0) > 1:
                users = float(users)*10**6
        except:
            users = np.nan
            
        # Population in poverty
        try:
            pop_pov0 = df.loc['Population below poverty line', 'Economy']['text'].split()
            if pop_pov0[0] == 'NA%':
                pop_pov == np.nan
            else:
                pop_pov = float(pop_pov0[0].replace('%', ''))/100.
        except:
            pop_pov = np.nan
            
        # Unemployment rate
        try:
            UR0 = df.loc['Unemployment rate', 'Economy']['text'].split()
            if UR0[0] == 'NA%':
                unemployment_rate = np.nan
            else:
                unemployment_rate = float(UR0[0].replace('%', ''))/100.
        except:
            unemployment_rate = np.nan
            

        dictionary = {'gov_type': gov_type, 'gdp': gdp, 'gdp_capita': gdp_capita, 'POP': population, 
                      'Internet users': users, 'pop_pov': pop_pov, 'unemployment': unemployment_rate, 
                      'GEC_code': country_file[:2]}
        
        df1 = df1.append(dictionary, ignore_index=True)

In [152]:
df1.head()

Unnamed: 0,GEC_code,Internet users,POP,gdp,gdp_capita,gov_type,pop_pov,unemployment
0,ag,15105000.0,40263711.0,168300000000.0,15000.0,presidential republic,0.23,0.124
1,ao,2434000.0,20172332.0,91940000000.0,6800.0,presidential republic,0.405,
2,bc,600000.0,2209208.0,10950000000.0,16900.0,parliamentary republic,0.303,0.2
3,bn,709000.0,10741458.0,8930000000.0,2200.0,presidential republic,0.374,
4,by,523000.0,11099298.0,2742000000.0,800.0,presidential republic,0.68,


Now we have to extract religions. We first start by creating a list with all the unique religion names. The cell below generates that list

In [153]:
# array needed to generate the unique religions array
all_religions = []

for region in region_folders:
    for country_file in os.listdir(r'factbook.json/' + region):
        df = pd.read_json(r'factbook.json/' + region + '/' + country_file)
        
        try:
            rel0 = df.loc['Religions', 'People and Society']['text']
            religions0 = re.sub(r'\([^)]*\)', '', rel0,).split(',')
            
            pct_array = []            
            
            if '%' not in rel0:
                pass # prevents from adding mistaken religion names
            
            else:
                if country_file[:2] in ['mo', 'wa', 'ps', 'tv', 'cu', 'rs', 'cs', 'fr', 'it', 'mt', 'ba', 'gz', 'sa', 'sy', 'we']:
                    pass # prevents from adding mistaken religion names
                    
                else:
                    for rel_string in religions0:
                        rel_string= rel_string.replace('%','')
                        rel = ' '.join(rel_string.split()[:-1])
                        pct = float(rel_string.split()[-1].replace('<',''))/100.
                        all_religions.append(rel)

        except:
            pass #There is no data in the json file

# Generate unique religions array
unique_religions = np.unique(all_religions)

In [154]:
unique_religions[:20]

array(['Adventist', 'Animist', 'Armenian Apostolic', 'Assembly of God',
       'Awakening Churches/Christian Revival', 'Badimo', "Baha'i",
       'Baptist', 'Bektashi', 'Buddhism', 'Buddhist', 'Bukot nan Jesus',
       'Calvinist', 'Cao Dai', 'Catholic', 'Christian', 'Christianity',
       'Church of England', 'Church of Ireland', 'Church of Norway'], 
      dtype='<U40')

Now that we have all possible religions in one array, we will create the dataframe (rel_df), with all religions as columns, and the country as the index. The cell values will represent the percentage of the religion in the country.

In [155]:
# Creating the dataframe
rel_df = pd.DataFrame()

for region in region_folders:
    for country_file in os.listdir(r'factbook.json/' + region):
        df = pd.read_json(r'factbook.json/' + region + '/' + country_file)
        
        # Setting the dictionary that will contain every religion in each country
        religions = dict()
        try:
            rel0 = df.loc['Religions', 'People and Society']['text']
            religions0 = re.sub(r'\([^)]*\)', '', rel0,).split(',')         
            
            # If we don't know the percentages, we set it to 0
            if '%' not in rel0:
                religions.update({'unknown': 0.})

            else:
                # The format of these countries made unable to extract the religion. We set it to unknown: 0
                if country_file[:2] in ['mo', 'wa', 'ps', 'tv', 'cu', 'rs', 'cs', 'fr', 'it', 'mt', 'ba', 'gz', 'sa', 'sy', 'we']:
                    religions.update({'unknown': 0.})
                else:
                    for rel_string in religions0:
                        rel_string= rel_string.replace('%','')
                        rel = ' '.join(rel_string.split()[:-1])
                        pct = float(rel_string.split()[-1].replace('<',''))/100.
                        religions.update({rel: pct})
            
        except:
            religions.update({'unknown': 0.}) # If there is no data in the json file set it to 0
            
        # We have the dictionary with all religions in a country. Now we create the pct_array. For one country, loops through 
        # unique_religions and returns the array containing the pct of all the religions in that country. Then we append it to
        # the dataframe
        pct_array = []
        for unique_rel in unique_religions:
                if unique_rel in religions.keys():
                    pct_array.append(religions[unique_rel])
                else:
                    pct_array.append(0.)
                    

        dictionary = {'GEC_code': country_file[:2]}
        dictionary.update(dict(zip(unique_religions, pct_array)))

        rel_df = rel_df.append(dictionary, ignore_index=True)

In [156]:
rel_df.head()

Unnamed: 0,Adventist,Animist,Armenian Apostolic,Assembly of God,Awakening Churches/Christian Revival,Badimo,Baha'i,Baptist,Bektashi,Buddhism,...,unaffiliated,unaffiliated or other,unafilliated,undeclared or unknown,undeclared/no answer,unknown,unspecifed,unspecified,unspecified/no answer,voodoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.041,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079,0.0,0.0


In [157]:
# Dropping religions that don't have 0.02 in any country
print('Initial columns:', len(rel_df.columns))
for col in rel_df.columns:
    if col != 'GEC_code':
        if max(rel_df[col]) < 0.1:
            #print(col, len(rel_df.columns), max(rel_df[col]))
            rel_df.drop(col, axis=1, inplace=True)
            #print('')
            
print('After dropping:',len(rel_df.columns))

Initial columns: 152
After dropping: 66


In [158]:
# Merging the two dataframes on GEC_code
df2 = pd.merge(df1, rel_df, on='GEC_code')
df2.head()

Unnamed: 0,GEC_code,Internet users,POP,gdp,gdp_capita,gov_type,pop_pov,unemployment,Armenian Apostolic,Assembly of God,...,none,none or other,other,other Christian,other and unspecified,other or none,other or unspecified,unaffiliated,unaffiliated or other,unspecified
0,ag,15105000.0,40263711.0,168300000000.0,15000.0,presidential republic,0.23,0.124,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ao,2434000.0,20172332.0,91940000000.0,6800.0,presidential republic,0.405,,0.0,0.0,...,0.123,0.0,0.086,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bc,600000.0,2209208.0,10950000000.0,16900.0,parliamentary republic,0.303,0.2,0.0,0.0,...,0.152,0.0,0.014,0.0,0.0,0.0,0.0,0.0,0.0,0.003
3,bn,709000.0,10741458.0,8930000000.0,2200.0,presidential republic,0.374,,0.0,0.0,...,0.058,0.0,0.026,0.095,0.0,0.0,0.0,0.0,0.0,0.0
4,by,523000.0,11099298.0,2742000000.0,800.0,presidential republic,0.68,,0.0,0.0,...,0.0,0.0,0.036,0.0,0.0,0.0,0.0,0.0,0.0,0.079


As said, we need ISO codes instead of GEC codes so we scrape them from http://www.statoids.com/wab.html

In [159]:
r = requests.get('http://www.statoids.com/wab.html')
soup = BeautifulSoup(r.text, 'html.parser')

GEC_codes_df = pd.DataFrame()
for row in soup.find('table').find_all('tr')[1:-1]:
    col = row.find_all('td')
    GEC_codes_df = GEC_codes_df.append({'ISO2': col[1].text, 'ISO3': col[2].text, 'ISO_num': col[3].text, \
                                        'GEC_code': col[5].text.lower()}, ignore_index=True)

GEC_codes_df.head()

Unnamed: 0,GEC_code,ISO2,ISO3,ISO_num
0,af,AF,AFG,4
1,,AX,ALA,248
2,al,AL,ALB,8
3,ag,DZ,DZA,12
4,aq,AS,ASM,16


In [160]:
# Finally we merge on GEC codes
dataset_2_df = pd.merge(df2, GEC_codes_df, on='GEC_code', how='outer')
dataset_2_df.head()

Unnamed: 0,GEC_code,Internet users,POP,gdp,gdp_capita,gov_type,pop_pov,unemployment,Armenian Apostolic,Assembly of God,...,other Christian,other and unspecified,other or none,other or unspecified,unaffiliated,unaffiliated or other,unspecified,ISO2,ISO3,ISO_num
0,ag,15105000.0,40263711.0,168300000000.0,15000.0,presidential republic,0.23,0.124,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DZ,DZA,12
1,ao,2434000.0,20172332.0,91940000000.0,6800.0,presidential republic,0.405,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AO,AGO,24
2,bc,600000.0,2209208.0,10950000000.0,16900.0,parliamentary republic,0.303,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003,BW,BWA,72
3,bn,709000.0,10741458.0,8930000000.0,2200.0,presidential republic,0.374,,0.0,0.0,...,0.095,0.0,0.0,0.0,0.0,0.0,0.0,BJ,BEN,204
4,by,523000.0,11099298.0,2742000000.0,800.0,presidential republic,0.68,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.079,BI,BDI,108


### Generating pickle files

In [161]:
countries.to_pickle('countries_df.pickle')
dataset_2_df.to_pickle('dataset_2_df.pickle')

# Merging in one dataframe

In [162]:
# First, we drop the columns of each dataframe that we don't want to use
dataset_2_df.drop('ISO2', axis=1, inplace=True)

In [163]:
#rel_df.drop('GEC_code', axis=1, inplace=True)

In [164]:
dataset_2_df.loc[dataset_2_df.GEC_code == 'nu']

Unnamed: 0,GEC_code,Internet users,POP,gdp,gdp_capita,gov_type,pop_pov,unemployment,Armenian Apostolic,Assembly of God,...,other,other Christian,other and unspecified,other or none,other or unspecified,unaffiliated,unaffiliated or other,unspecified,ISO3,ISO_num
106,nu,1164000.0,5966798.0,13410000000.0,5300.0,presidential republic,0.296,0.06,0.0,0.0,...,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NIC,558


In [166]:
# We merge all the dataframes on IS0 Alpha-3 (ISO3) code
data = pd.merge(countries, dataset_2_df, on='ISO3', how='outer')

# Convert country names to string & select all rows where we actually have the country name
data.name = data.name.apply(str)
data = data.loc[~(data.name == 'nan')]
data.set_index('name', inplace=True)

# Selecting only useful columns
cols = ['area', 'ISO2', 'ISO3', 'languages', 'borders', 'latlng', 'language_codes', 'Internet users', 'POP', 
        'gdp', 'gdp_capita', 'gov_type', 'pop_pov', 'unemployment']
rel_cols = list(rel_df.columns)

data = data[cols + rel_cols]
data.head()

Unnamed: 0_level_0,area,ISO2,ISO3,languages,borders,latlng,language_codes,Internet users,POP,gdp,...,none,none or other,other,other Christian,other and unspecified,other or none,other or unspecified,unaffiliated,unaffiliated or other,unspecified
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,180.0,AW,ABW,"[Dutch, Papiamento]",[],"[12.5, -69.96666666]","[nld, pap]",99000.0,113648.0,2516000000.0,...,0.055,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.005
Afghanistan,652230.0,AF,AFG,"[Dari, Pashto, Turkmen]","[IRN, PAK, TKM, UZB, TJK, CHN]","[33, 65]","[prs, pus, tuk]",2690000.0,33332025.0,18400000000.0,...,0.0,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Angola,1246700.0,AO,AGO,[Portuguese],"[COG, COD, ZMB, NAM]","[-12.5, 18.5]",[por],2434000.0,20172332.0,91940000000.0,...,0.123,0.0,0.086,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Anguilla,91.0,AI,AIA,[English],[],"[18.25, -63.16666666]",[eng],12000.0,16752.0,175400000.0,...,0.045,0.0,0.032,0.109,0.0,0.0,0.0,0.0,0.0,0.003
Åland Islands,1580.0,AX,ALA,[Swedish],[],"[60.116667, 19.9]",[swe],,,,...,,,,,,,,,,


### Categorizing government type column

The last step is to make the government type column categorical. As we can see in the cell below, the column has a certain common format for each class of government, but it's still not useful. It needs some cleaning.  

In [168]:
data.gov_type.fillna('unknown', inplace=True)
for i in data.gov_type:
    if 'monarchy' in i:
        print(i)

parliamentary democracy (Parliament) under a constitutional monarchy; a Commonwealth realm
parliamentary democracy (Federal Parliament) under a constitutional monarchy; a Commonwealth realm
federal parliamentary democracy under a constitutional monarchy
constitutional monarchy
parliamentary democracy (Parliament) under a constitutional monarchy; a Commonwealth realm
parliamentary democracy (National Assembly) under a constitutional monarchy; a Commonwealth realm
parliamentary democracy (Parliament) under a constitutional monarchy; a Commonwealth realm
absolute monarchy or sultanate (locally known as Malay Islamic Monarchy)
constitutional monarchy
federal parliamentary democracy (Parliament of Canada) under a constitutional monarchy; a Commonwealth realm
parliamentary constitutional monarchy
parliamentary constitutional monarchy
parliamentary constitutional monarchy; a Commonwealth realm
parliamentary democracy (Parliament) under a constitutional monarchy; a Commonwealth realm
parliamen

### Methodology
We are going to run a function to all the rows of the gov_type column that will return the most common sequences of words, so that we can then manually check which are the main type of government.

The types of government considered where:
- parliamentary democracy
- parliamentary republic
- presidential republic
- semi-presidential republic
- presidential democracy    
- absolute monarchy
- parliamentary constitutional monarchy
- constitutional monarchy
- federal republic
- communist state
- monarchy
- others: federation of monarchies, non-self-governing overseas territory, theocratic republic, in transition, semi-presidential federation

Missing data is under the category 'unknown'.

In [169]:
def phrases(string):
    """Splits the input string on whitespace and returns all possible substrings of any length"""
    words = string.split()
    result = []
    for number in range(len(words)):
        for start in range(len(words)-number):
             result.append(" ".join(words[start:start+number+1]))
    return result

# Example
phrases('Hi my name is Jacob')

['Hi',
 'my',
 'name',
 'is',
 'Jacob',
 'Hi my',
 'my name',
 'name is',
 'is Jacob',
 'Hi my name',
 'my name is',
 'name is Jacob',
 'Hi my name is',
 'my name is Jacob',
 'Hi my name is Jacob']

In [170]:
all_strings = list(data.gov_type)

# Counts all ocurrences of a substring 
all_phrases = collections.Counter(phrase for subject in all_strings for phrase in phrases(subject))

# Printing the most common substrings and the number of occurences
ocurrences = [(phrase, count) for phrase, count in all_phrases.items() if count > 1]
ocurrences[:10]

[('parliamentary', 99),
 ('democracy', 41),
 ('part', 4),
 ('of', 59),
 ('the', 45),
 ('Kingdom', 4),
 ('Netherlands', 2),
 ('parliamentary democracy', 40),
 ('part of', 4),
 ('of the', 28)]

When looking at the government types, we noticed that most of them had could be expressed in two or three words so we filtered down to the substrings of these kind, to make the manual checking simpler.

In [171]:
# Filtering down to sequences of 2 or 3 words
filtered_ocurrences = [ocurrences[i][0] for i in range(len(ocurrences)) if 2 <= len(ocurrences[i][0].split()) <= 3]
filtered_ocurrences[:10]

['parliamentary democracy',
 'part of',
 'of the',
 'the Kingdom',
 'Kingdom of',
 'the Netherlands',
 'part of the',
 'of the Kingdom',
 'the Kingdom of',
 'Kingdom of the']

In [172]:
# Manually changing some countries data
data.loc['Tokelau', 'gov_type'] = 'parliamentary democracy'
data.loc['Vatican City', 'gov_type'] = 'monarchy'
data.loc['Hong Kong', 'gov_type'] = 'presidential democracy'
data.loc['Macau', 'gov_type'] = 'presidential democracy'

In [173]:
# Types of government
governments = ['parliamentary democracy', 'parliamentary republic', 'presidential democracy', 'presidential republic', 
               'semi-presidential republic', 'absolute monarchy', 'federal republic', 'communist state', 'parliamentary constitutional monarchy', 
               'constitutional monarchy', 'monarchy', 'unknown']

In [174]:
def categorize_government_types(df):
    for gov in governments:
        for country in df.index:
            if gov in df.loc[country, 'gov_type']:
                if (gov == 'presidential republic') & ('semi-presidential republic' in df.loc[country, 'gov_type']):
                    pass
                df.loc[country, 'gov_type'] = gov
    
categorize_government_types(data)

data = data.reset_index()
idx = [idx for idx, i in enumerate(data.gov_type) if i not in governments]
data.loc[idx, ['name', 'gov_type']]

Unnamed: 0,name,gov_type
1,Afghanistan,presidential Islamic republic
7,United Arab Emirates,federation of monarchies
39,Cocos (Keeling) Islands,non-self-governing overseas territory of Austr...
54,Christmas Island,non-self-governing overseas territory of Austr...
67,Western Sahara,legal status of territory and issue of soverei...
106,Iran,theocratic republic
127,Libya,in transition
189,Russia,semi-presidential federation
244,Yemen,in transition


In [175]:
# Manually changing Afghanistan (index 1) and Western Sahara (index 67)
data.loc[1, 'gov_type'] = 'presidential republic'
data.loc[67,'gov_type'] = 'unknown'

### Convert gov_type column to numeric

Reading information about all these types of government we grouped the different government types per similarity, and we computed a three value government type axis. To each group we assigned the values of -1, 0 and 1. The more positive the number is, the more power 

Here is the government type scale axis:

- Group 1:
    - 'parliamentary democracy'
    - 'parliamentary republic'
    - 'federal republic'
    - 'federation of monarchies'
    - 'semi-presidential republic'
    - 'semi-presidential federation'
- Group 2: 
    - 'non-self-governing overseas territory'
    - 'in transition'
    - 'unknown'
- Group 3: 
    - 'presidential republic'
    - 'presidential democracy'
    - 'monarchy'
    - 'theocratic republic'
    - 'communist state'
    - 'absolute monarchy'
 
As our criteria to group the government types, we used the variable of the power that the leaders of the government have. Coutries where the leader has a lot of power go in group 3, the others in group 1. For the non well defined government types, we placed them at the center of the scale.

In [176]:
gov_type_array = [['parliamentary democracy', 'parliamentary republic', 'federal republic', 'federation of monarchies', \
                  'semi-presidential republic', 'semi-presidential federation'], 
                  ['non-self-governing overseas territory', 'in transition', 'unknown'], 
                  ['presidential republic', 'presidential democracy', 'monarchy', 'theocratic republic', 'communist state',\
                  'absolute monarchy']]
mapping_gov_type=dict()
for i, gov_group in enumerate(gov_type_array):
    for gov in gov_group:
        if i==0:
            mapping_gov_type.update({gov: -1})
        elif i==1:
            mapping_gov_type.update({gov: 0})
        else:
            mapping_gov_type.update({gov: 1})

mapping_gov_type

{'absolute monarchy': 1,
 'communist state': 1,
 'federal republic': -1,
 'federation of monarchies': -1,
 'in transition': 0,
 'monarchy': 1,
 'non-self-governing overseas territory': 0,
 'parliamentary democracy': -1,
 'parliamentary republic': -1,
 'presidential democracy': 1,
 'presidential republic': 1,
 'semi-presidential federation': -1,
 'semi-presidential republic': -1,
 'theocratic republic': 1,
 'unknown': 0}

In [177]:
# Mapping gov_type values to their numerical value (1, 0, -1)
data['gov_type_num'] = data.gov_type.map(mapping_gov_type)
data[['gov_type', 'gov_type_num']].head()

Unnamed: 0,gov_type,gov_type_num
0,parliamentary democracy,-1.0
1,presidential republic,1.0
2,presidential republic,1.0
3,parliamentary democracy,-1.0
4,unknown,0.0


In [178]:
data.set_index('name', inplace=True)

### Grouping religions under broader categories

We grouped the religions in broader categories by adding the percentages.

In [179]:
# Here are the broader categories that we came up with
protestants = ['Calvinist', 'Church of Norway', 'Congregational Christian Church', 'Ekalesia Niue', 'Evangelical', 
               'Evangelical Lutheran', 'Evangelical Lutheran Church of Iceland', 'Evangelical or Protestant', 'Lutheran',
               'Protestant', 'Protestant and other', 'Seventh-Day Adventist', 'non-Catholic Christians', 'Armenian Apostolic', 
               'Assembly of God', 'Christian', 'Kimbanguist', 'Mormon', 'Zionist Christian', 'nondenominational', 
               'Awakening Churches/Christian Revival']
catholics = ['Catholic', 'Roman Catholic', 'nominally Roman Catholic']
ortodox = ['Eastern Orthodox', 'Ethiopian Orthodox', 'Greek Orthodox', 'Macedonian Orthodox', 'Orthodox', 'Orthodox Christian',
           'Russian Orthodox', 'Serbian Orthodox']
buddhism = ['Buddhism', 'Buddhist', 'Lamaistic Buddhist']
hindu = ['Hindu', 'Indian- and Nepalese-influenced Hinduism']
jewish = ['Jewish', 'Zionist', ]
muslim = ['Muslim', 'Sunni Muslim']
oriental = ['Shintoism', 'Taoist', 'mixture of Buddhist and Taoist']
other = ['Vodoun', 'eclectic mixture of local religions', 'folk religion', 'indigenous beliefs']
animist = ['animist', 'animist or no religion']
atheist = ['atheist or agnostic', 'no religion', 'non-believer/agnostic', 'non-believers']
unaffiliated = ['unaffiliated', 'unaffiliated or other']

# Those religions will be dropped
dropped_cols = ['Kempsville Presbyterian Church', 'none', 'none or other', 'other',
                'other Christian', 'other and unspecified', 'other or none', 'other or unspecified', 'unspecified']

# Arrays needed to run next cell
final_categories = [protestants, catholics, ortodox, buddhism, hindu, jewish, muslim, oriental, other, animist,
                   atheist, unaffiliated]
final_categories_names = ['protestants', 'catholics', 'ortodox', 'buddhism', 'hindu', 'jewish', 'muslim', 
                           'oriental', 'other', 'animist', 'atheist', 'unaffiliated']

In [180]:
def generate_categories_df(dataframe):
    df = pd.DataFrame()
    for category, category_name in zip(final_categories, final_categories_names):
        df[category_name] = dataframe[category].sum(axis=1)
    return df

categorized_df = generate_categories_df(data)
categorized_df.head()

Unnamed: 0_level_0,protestants,catholics,ortodox,buddhism,hindu,jewish,muslim,oriental,other,animist,atheist,unaffiliated
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aruba,0.049,0.753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.997,0.0,0.0,0.0,0.0,0.0
Angola,0.381,0.411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Anguilla,0.732,0.068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Åland Islands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [181]:
# We want to have hierarchical index in the columns and we need to generate a list of tuples with the names
religion_array = list()
for rel in categorized_df.columns:
    religion_array.append(('religion', rel))

# Assigning the hierarchical column names
categorized_df.columns = pd.MultiIndex.from_tuples(religion_array)

In [182]:
categorized_df.head(2)

Unnamed: 0_level_0,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion
Unnamed: 0_level_1,protestants,catholics,ortodox,buddhism,hindu,jewish,muslim,oriental,other,animist,atheist,unaffiliated
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Aruba,0.049,0.753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.997,0.0,0.0,0.0,0.0,0.0


Now the final step is to dropp the religion columns of the original dataframe and then add the new categorized dataframe.

In [183]:
def drop_uncategorized_religions(dataframe):
    df = dataframe.copy()
    # Dropping columns
    for cols in final_categories:
        df.drop(cols, axis=1, inplace=True)
    df.drop(dropped_cols, axis=1, inplace=True)
    return df


def make_hierarchical_index(dataframe):
    df = dataframe.copy()
    hierarchical_cols = list()
    for col_name in df.columns:
        hierarchical_cols.append((str(col_name), ''))
        
    # Assigning the hierarchical column names
    df.columns = pd.MultiIndex.from_tuples(hierarchical_cols)
    return df

data = make_hierarchical_index(drop_uncategorized_religions(data))
data.head()

Unnamed: 0_level_0,area,ISO2,ISO3,languages,borders,latlng,language_codes,Internet users,POP,gdp,gdp_capita,gov_type,pop_pov,unemployment,GEC_code,gov_type_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Aruba,180.0,AW,ABW,"[Dutch, Papiamento]",[],"[12.5, -69.96666666]","[nld, pap]",99000.0,113648.0,2516000000.0,25300.0,parliamentary democracy,,0.069,aa,-1.0
Afghanistan,652230.0,AF,AFG,"[Dari, Pashto, Turkmen]","[IRN, PAK, TKM, UZB, TJK, CHN]","[33, 65]","[prs, pus, tuk]",2690000.0,33332025.0,18400000000.0,2000.0,presidential republic,0.358,0.35,af,1.0
Angola,1246700.0,AO,AGO,[Portuguese],"[COG, COD, ZMB, NAM]","[-12.5, 18.5]",[por],2434000.0,20172332.0,91940000000.0,6800.0,presidential republic,0.405,,ao,1.0
Anguilla,91.0,AI,AIA,[English],[],"[18.25, -63.16666666]",[eng],12000.0,16752.0,175400000.0,12200.0,parliamentary democracy,0.23,0.08,av,-1.0
Åland Islands,1580.0,AX,ALA,[Swedish],[],"[60.116667, 19.9]",[swe],,,,,unknown,,,,0.0


In [184]:
def adding_categorized_religions(dataframe):
    df = dataframe.copy()
    df = pd.concat([df, categorized_df], axis=1)
    return df

data = adding_categorized_religions(data)

In [185]:
data.to_pickle('data.pickle')

In [186]:
cols = []
number = 0

for i, col in enumerate(data.columns):
    if i == number:
        print(i, ':', col)
    cols.append(col)

data.loc[:, [cols[number]]][data[cols[number]] > 1.]

0 : ('area', '')


Unnamed: 0_level_0,area
name,Unnamed: 1_level_1
Aruba,180.0
Afghanistan,652230.0
Angola,1246700.0
Anguilla,91.0
Åland Islands,1580.0
Albania,28748.0
Andorra,468.0
United Arab Emirates,83600.0
Argentina,2780400.0
Armenia,29743.0
