In [151]:
import pandas as pd
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
import collections

## Dataset 1: https://github.com/mledoze/countries

In [152]:
def get_language(df):
    """Takes the dataframe and, from the original column dict_of_languages, returns a dictionary where items are like 
       {country: list_of_language_names}"""
    languages = dict()
    for country, language_dict in zip(df.name.values, df.dict_of_languages.values):
        languages.update({country: list(language_dict.values())})
    
    return languages

            
def get_language_codes(df):
    """Takes the dataframe and, from the original column dict_of_languages, returns a dictionary where items are like 
       {country: list_of_language_codes}"""
    
    language_codes = dict()
    for country, language_dict in zip(df.name.values, df.dict_of_languages.values):
        if len(language_dict.keys()) > 0:
            language_codes.update({country: list(language_dict.keys())})
        else:
            language_codes.update({country: ['unknown']})
    
    return language_codes

def get_country_name_native(df):
    """Takes the dataframe and, from the original column dict_of_names, returns a dictionary where items are like 
       {country: list_of_native_country_names}. 
       The dict_of_names column conains the name of the country in various ways. This function only takes the common
       name of the country in the native language (i.e. we could also take the official name in the native language)"""
    
    native_names_dict = dict()
    for country, country_name_dict in zip(df.name, df.dict_of_names.values):
        native_country_names = []
        for native_lang in country_name_dict['native']:
            native_country_names.append(country_name_dict['native'][native_lang]['common'])
        native_names_dict.update({country: native_country_names})
    
    return native_names_dict
            

def add_columns(df):
    """Adds the following columns to the dataframe: name, name_native, languages and language_codes"""
    
    # Adding name & name_native columns
    df['name'] = [name['common'] for name in df.dict_of_names.values]
    country_native_names = get_country_name_native(df)
    df['name_native'] = list(country_native_names.values())
    
    # Getting the dictionaries needed to add the language columns
    languages = get_language(df)
    language_codes = get_language_codes(df)
    
    # Adding languages & language codes columns
    df['languages'] = list(languages.values())
    df['language_codes'] = list(language_codes.values())

    return df

In [153]:
# Read csv and renaming columns
countries = pd.read_json('countries.json')
countries.rename(columns={'name': 'dict_of_names', 'languages': 'dict_of_languages'}, inplace=True)

# Adding columns
countries = add_columns(countries)
# Select useful columns & renaming them
cols = ['area', 'cca2', 'cca3', 'ccn3', 'borders', 'name', 'language_codes', 'latlng', 'languages', 'name_native']
countries = countries[cols]
countries.rename(columns={'cca2': 'ISO2', 'cca3': 'ISO3', 'ccn3': 'ISO_num'}, inplace=True)

countries.head()

Unnamed: 0,area,ISO2,ISO3,ISO_num,borders,name,language_codes,latlng,languages,name_native
0,180.0,AW,ABW,533,[],Aruba,"[nld, pap]","[12.5, -69.96666666]","[Dutch, Papiamento]","[Aruba, Aruba]"
1,652230.0,AF,AFG,4,"[IRN, PAK, TKM, UZB, TJK, CHN]",Afghanistan,"[prs, pus, tuk]","[33, 65]","[Dari, Pashto, Turkmen]","[افغانستان, افغانستان, Owganystan]"
2,1246700.0,AO,AGO,24,"[COG, COD, ZMB, NAM]",Angola,[por],"[-12.5, 18.5]",[Portuguese],[Angola]
3,91.0,AI,AIA,660,[],Anguilla,[eng],"[18.25, -63.16666666]",[English],[Anguilla]
4,1580.0,AX,ALA,248,[],Åland Islands,[swe],"[60.116667, 19.9]",[Swedish],[Åland]


## Dataset 2 : http://www.thearda.com/Archive/Files/Downloads/WRDNATL_DL2.asp

The file contains all religions as columns, both totals and percentages. We only consider that a country has a certain religion if that religion has more than a treshold of 10% of adherents out of all the population. For that, we are only interested in the columns with percentages, which have 'PCT' in the name.

More cleaning of the columns will be done after merging.

In [154]:
# Reading excel and selecting most recent data
pop_rel_df = pd.read_excel('World Religion Dataset - National Religion Dataset.xlsx')
pop_rel_df = pop_rel_df.loc[pop_rel_df['YEAR'] == '2010']

# Selecing useful columns
cols = ['ISO3', 'COUNTRY', 'POP', 'DUALREL'] + \
        [col for col in pop_rel_df.columns if 'PCT' in col]
pop_rel_df = pop_rel_df[cols]

# Renaming and reseting index
pop_rel_df.rename(columns={'NUMISO': 'ISO_num'}, inplace=True)
pop_rel_df.reset_index(drop=True, inplace=True)

pop_rel_df.head()

Unnamed: 0,ISO3,COUNTRY,POP,DUALREL,CHPRTPCT,CHCATPCT,CHORTPCT,CHANGPCT,CHOTHPCT,CHGENPCT,...,SHGENPCT,BAGENPCT,TAGENPCT,JAGENPCT,COGENPCT,SYGENPCT,ANGENPCT,NORELPCT,OTGENPCT,SUMPCT
0,USA,United States of America,312750000,0,0.3829,0.2507,0.022499,0.015499,0.0738,0.7454,...,0.0005,0.0015,0.0,0.0003,0.0003,0.002599,0.005699,0.19,0.0025,0.9975
1,CAN,Canada,34500000,0,0.2298,0.4202,0.022799,0.078899,0.014399,0.7661,...,0.0,0.0005,9.9e-05,9.9e-05,9.9e-05,0.0008,0.0021,0.1643,0.001,0.999
2,BHS,Bahamas,313312,0,0.676,0.14,0.0,0.15,0.0,0.966,...,0.0,0.0,0.0003,0.0,0.0,0.0,0.0032,0.028999,0.0005,0.9995
3,CUB,Cuba,11241161,1,0.048899,0.6,0.0,0.0,0.009999,0.6589,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.1315,0.0,1.2935
4,HTI,Haiti,9760832,1,0.1,0.72,0.0,0.0,0.0,0.82,...,0.0,0.0009,0.0,0.0,0.0,0.45,0.0,0.1,0.0,1.3711


## Dataset 3
Total: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD

Per capita: https://data.worldbank.org/indicator/NY.GDP.PCAP.CD

We start by reading the two files into two different datagrames and then we merge them.

In [155]:
gdp_total = pd.read_csv('gdp_total.csv', skiprows=3)[['Country Name', 'Country Code', '2016']]
gdp_total.rename(columns={'2016': '2016_gdp_total', 'Country Code': 'ISO3'}, inplace=True)

In [156]:
gdp_capita = pd.read_csv('gdp_per_capita.csv', skiprows=4)[['Country Name', 'Country Code', '2016']]
gdp_capita.rename(columns={'2016': '2016_gdp_capita', 'Country Code': 'ISO3'}, inplace=True)

In [157]:
gdp_df = pd.merge(gdp_total, gdp_capita, on=['ISO3', 'Country Name'])
gdp_df.head()

Unnamed: 0,Country Name,ISO3,2016_gdp_total,2016_gdp_capita
0,Aruba,ABW,,
1,Afghanistan,AFG,19469020000.0,561.778746
2,Angola,AGO,89633160000.0,3110.808183
3,Albania,ALB,11926890000.0,4146.89625
4,Andorra,AND,,


## Dataset 4 : https://github.com/opendatajson/factbook.json

In this case we didn't have all the data in one file so the reading required more work. We created an empty dataframe and we run through all the folders and all the files inside them and appended the data to the dataframe, ignoring the index.

The dataset contains GEC code for the country names, so we will have to use another source to make the mapping of the GEC codes with the ISO codes.

In [158]:
# Names of the folders
region_folders = ['africa', 'australia-oceania', 'central-america-n-caribbean', 'central-asia', 'east-n-southeast-asia',
          'europe', 'middle-east', 'north-america', 'south-america', 'south-asia']

# We use a temporaty df to load the data for a particular country and we append it to the main GEC_gov_type_df
GEC_gov_type_df = pd.DataFrame()
for region in region_folders:
    for country_file in os.listdir(r'factbook.json/' + region):
        df = pd.read_json(r'factbook.json/' + region + '/' + country_file)
        try:
            gov_type = df.loc['Government type', 'Government']['text']
        except:
            gov_type = 'unknown'
        
        GEC_gov_type_df = GEC_gov_type_df.append({'GEC_code': country_file[:2], 'gov_type': gov_type}, ignore_index=True)
    
GEC_gov_type_df.head()

Unnamed: 0,GEC_code,gov_type
0,ag,presidential republic
1,ao,presidential republic
2,bc,parliamentary republic
3,bn,presidential republic
4,by,presidential republic


As said, we need ISO codes instead of GEC codes so we scrape them from http://www.statoids.com/wab.html

In [159]:
r = requests.get('http://www.statoids.com/wab.html')
soup = BeautifulSoup(r.text, 'html.parser')

GEC_codes_df = pd.DataFrame()
for row in soup.find('table').find_all('tr')[1:-1]:
    col = row.find_all('td')
    GEC_codes_df = GEC_codes_df.append({'ISO2': col[1].text, 'ISO3': col[2].text, 'ISO_num': col[3].text, \
                                        'GEC_code': col[5].text.lower()}, ignore_index=True)

GEC_codes_df.head()

Unnamed: 0,GEC_code,ISO2,ISO3,ISO_num
0,af,AF,AFG,4
1,,AX,ALA,248
2,al,AL,ALB,8
3,ag,DZ,DZA,12
4,aq,AS,ASM,16


In [160]:
# Finally we merge on GEC codes
gov_type_df = pd.merge(GEC_gov_type_df, GEC_codes_df, on='GEC_code', how='outer')
gov_type_df.fillna('unknown', inplace=True)
gov_type_df.head()

Unnamed: 0,GEC_code,gov_type,ISO2,ISO3,ISO_num
0,ag,presidential republic,DZ,DZA,12
1,ao,presidential republic,AO,AGO,24
2,bc,parliamentary republic,BW,BWA,72
3,bn,presidential republic,BJ,BEN,204
4,by,presidential republic,BI,BDI,108


### Generating pickle files

In [161]:
countries.to_pickle('countries_df.pickle')
pop_rel_df.to_pickle('pop_rel_df.pickle')
gdp_df.to_pickle('gdp_df.pickle')
gov_type_df.to_pickle('gov_type_df.pickle')

# Merging in one dataframe

In [162]:
# First, we drop the columns of each dataframe that we don't want to use
pop_rel_df.drop('COUNTRY', axis=1, inplace=True)
gdp_df.drop('Country Name', axis=1, inplace=True)#
gov_type_df.drop(['ISO2', 'GEC_code'], axis=1, inplace=True)

In [163]:
# We merge all the dataframes on IS0 Alpha-3 (ISO3) code, one by one.
dataframes = [countries, pop_rel_df, gdp_df, gov_type_df]
data = countries.copy()
for df in dataframes[1:]:
    data = pd.merge(data, df, on='ISO3', how='outer')

# Convert country names to string & select all rows where we actually have the country name
data.name = data.name.apply(str)
data = data.loc[~(data.name == 'nan')]
data.set_index('name', inplace=True)
data.head()

Unnamed: 0_level_0,area,ISO2,ISO3,ISO_num_x,borders,language_codes,latlng,languages,name_native,POP,...,COGENPCT,SYGENPCT,ANGENPCT,NORELPCT,OTGENPCT,SUMPCT,2016_gdp_total,2016_gdp_capita,gov_type,ISO_num_y
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,180.0,AW,ABW,533,[],"[nld, pap]","[12.5, -69.96666666]","[Dutch, Papiamento]","[Aruba, Aruba]",,...,,,,,,,,,parliamentary democracy (Legislature); part of...,533
Afghanistan,652230.0,AF,AFG,4,"[IRN, PAK, TKM, UZB, TJK, CHN]","[prs, pus, tuk]","[33, 65]","[Dari, Pashto, Turkmen]","[افغانستان, افغانستان, Owganystan]",27000000.0,...,0.0,0.0,9.9e-05,0.002,0.0014,0.9986,19469020000.0,561.778746,presidential Islamic republic,4
Angola,1246700.0,AO,AGO,24,"[COG, COD, ZMB, NAM]",[por],"[-12.5, 18.5]",[Portuguese],[Angola],19114176.0,...,0.0,0.0,0.075899,0.017899,0.0044,0.9956,89633160000.0,3110.808183,presidential republic,24
Anguilla,91.0,AI,AIA,660,[],[eng],"[18.25, -63.16666666]",[English],[Anguilla],,...,,,,,,,,,parliamentary democracy (House of Assembly); s...,660
Åland Islands,1580.0,AX,ALA,248,[],[swe],"[60.116667, 19.9]",[Swedish],[Åland],,...,,,,,,,,,unknown,248


### Formatting religion column

As we can see, in the dataframe we still have the annoying religion columns. We would like to compress all the information of the religion in one single column.

We are only interested in the main religions, so we set a treshold of 10% of adherents. To consider that a country has a certain religion, the percentage of adherents has to be superior to 10%. Obviously, one country can have more than one religion. 

Empty dictionary values in the religion column mean that we didn't have any data on the religion percentages.

In [164]:
def get_main_religions(df):
    """This function takes all religion columns and returns the dictionary needed to create the single religion column.
    The items of the dictionary are like the following {country: {religion1: percentage1, ... , religionN: percentageN}}"""
    treshold = 0.1
    religions_dict = dict()
    for country in data.index:
        religions = {}
        for religion in religion_cols:
            pct = data.loc[country, religion]
            if pct >= treshold:
                religions.update({religion: pct})
        religions_dict.update({country: religions})

    return religions_dict

In [165]:
# Renaming religion columns
new_names = {'CHGENPCT': 'Christianism', 'JDGENPCT': 'Judaism', 'ISGENPCT': 'Islam', 'BUGENPCT': 'Buddhism',
             'ZOGENPCT': 'Zoroastrian', 'HIGENPCT': 'Hindu', 'SIGENPCT': 'Sikh', 'SHGENPCT': 'Shinto', 
             'BAGENPCT': "Baha'i", 'TAGENPCT': 'Taoism', 'JAGENPCT': 'Jain', 'COGENPCT': 'Confucianism', 
             'SYGENPCT': 'Syncretic religions', 'ANGENPCT': 'Animist religions', 'NORELPCT': 'Non-religious', 
             'OTGENPCT': 'Other religions'}

religion_cols = new_names.values()
data.rename(columns=new_names, inplace=True)

# Adding the column to the dataframe
data['religion'] = get_main_religions(data).values()

# Selecting only useful columns
cols = ['area', 'ISO2', 'ISO3', 'languages', 'borders', 'latlng', 'language_codes',  'POP', 'religion', '2016_gdp_total', '2016_gdp_capita', 'gov_type']
data = data[cols]
data.head()

Unnamed: 0_level_0,area,ISO2,ISO3,languages,borders,latlng,language_codes,POP,religion,2016_gdp_total,2016_gdp_capita,gov_type
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aruba,180.0,AW,ABW,"[Dutch, Papiamento]",[],"[12.5, -69.96666666]","[nld, pap]",,{},,,parliamentary democracy (Legislature); part of...
Afghanistan,652230.0,AF,AFG,"[Dari, Pashto, Turkmen]","[IRN, PAK, TKM, UZB, TJK, CHN]","[33, 65]","[prs, pus, tuk]",27000000.0,{'Islam': 0.9956},19469020000.0,561.778746,presidential Islamic republic
Angola,1246700.0,AO,AGO,[Portuguese],"[COG, COD, ZMB, NAM]","[-12.5, 18.5]",[por],19114176.0,{'Christianism': 0.8912},89633160000.0,3110.808183,presidential republic
Anguilla,91.0,AI,AIA,[English],[],"[18.25, -63.16666666]",[eng],,{},,,parliamentary democracy (House of Assembly); s...
Åland Islands,1580.0,AX,ALA,[Swedish],[],"[60.116667, 19.9]",[swe],,{},,,unknown


### Categorizing government type column

The last step is to make the government type column categorical. As we can see in the cell below, the column has a certain common format for each class of government, but it's still not useful. It needs some cleaning.  

In [166]:
data.gov_type.fillna('unknown', inplace=True)
data.gov_type

name
Aruba                                   parliamentary democracy (Legislature); part of...
Afghanistan                                                 presidential Islamic republic
Angola                                                              presidential republic
Anguilla                                parliamentary democracy (House of Assembly); s...
Åland Islands                                                                     unknown
Albania                                                            parliamentary republic
Andorra                                 parliamentary democracy (since March 1993) tha...
United Arab Emirates                                             federation of monarchies
Argentina                                                           presidential republic
Armenia                                                        semi-presidential republic
American Samoa                          presidential democracy; a self-governing terri...
Antar

### Methodology
We are going to run a function to all the rows of the gov_type column that will return the most common sequences of words, so that we can then manually check which are the main type of government.

The types of government considered where:
- parliamentary democracy
- parliamentary republic
- presidential republic
- semi-presidential republic
- presidential democracy    
- absolute monarchy
- federal republic
- communist state
- monarchy
- others

Missing data is under the category 'unknown'.

In [167]:
def phrases(string):
    """Splits the input string on whitespace and returns all possible substrings of any length"""
    words = string.split()
    result = []
    for number in range(len(words)):
        for start in range(len(words)-number):
             result.append(" ".join(words[start:start+number+1]))
    return result

# Example
phrases('Hi my name is Jacob')

['Hi',
 'my',
 'name',
 'is',
 'Jacob',
 'Hi my',
 'my name',
 'name is',
 'is Jacob',
 'Hi my name',
 'my name is',
 'name is Jacob',
 'Hi my name is',
 'my name is Jacob',
 'Hi my name is Jacob']

In [168]:
all_strings = list(data.gov_type)

# Counts all ocurrences of a substring 
all_phrases = collections.Counter(phrase for subject in all_strings for phrase in phrases(subject))

# Printing the most common substrings and the number of occurences
ocurrences = [(phrase, count) for phrase, count in all_phrases.items() if count > 1]
ocurrences[:10]

[('parliamentary', 99),
 ('democracy', 41),
 ('part', 4),
 ('of', 59),
 ('the', 45),
 ('Kingdom', 4),
 ('Netherlands', 2),
 ('parliamentary democracy', 40),
 ('part of', 4),
 ('of the', 28)]

When looking at the government types, we noticed that most of them had could be expressed in two or three words so we filtered down to the substrings of these kind, to make the manual checking simpler.

In [169]:
# Filtering down to sequences of 2 or 3 words
filtered_ocurrences = [ocurrences[i][0] for i in range(len(ocurrences)) if 2 <= len(ocurrences[i][0].split()) <= 3]
filtered_ocurrences[:10]

['parliamentary democracy',
 'part of',
 'of the',
 'the Kingdom',
 'Kingdom of',
 'the Netherlands',
 'part of the',
 'of the Kingdom',
 'the Kingdom of',
 'Kingdom of the']

In [170]:
# Manually changing some countries data
data.loc['Tokelau', 'gov_type'] = 'parliamentary democracy'
data.loc['Vatican City', 'gov_type'] = 'monarchy'
data.loc['Hong Kong', 'gov_type'] = 'presidential democracy'
data.loc['Macau', 'gov_type'] = 'presidential democracy'

In [171]:
# Types of government
governments = ['parliamentary democracy', 'parliamentary republic', 'presidential democracy', 'presidential republic', 
               'semi-presidential republic', 'absolute monarchy', 'federal republic', 'communist state', 
               'monarchy', 'unknown']

In [172]:
def categorize_government_types():
    for gov in governments:
        for country in data.index:
            if gov in data.loc[country, 'gov_type']:
                if (gov == 'presidential republic') & ('semi-presidential republic' in data.loc[country, 'gov_type']):
                    pass
                data.loc[country, 'gov_type'] = gov
    
categorize_government_types()

idx = [idx for idx, i in enumerate(data.gov_type) if i not in governments]
data.reset_index(inplace=True)
data.loc[idx, 'gov_type'] = 'other'
data.set_index('name', inplace=True)

In [173]:
data['Active tweeter users'] = 'TBD'

In [174]:
def replace_commas_in_integer(df):
    df.POP = df.POP.apply(str)
    df.POP = df.POP.apply(str.replace, args=(',', ''))
    df.POP = df.POP.apply(float)
    return df

def convert_to_numeric(df):
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except:
            pass
    return df

data = replace_commas_in_integer(data)
data = convert_to_numeric(data)

In [175]:
data.to_pickle('data.pickle')

In [184]:
data

Unnamed: 0_level_0,area,ISO2,ISO3,languages,borders,latlng,language_codes,POP,religion,2016_gdp_total,2016_gdp_capita,gov_type,Active tweeter users
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Aruba,180.00,AW,ABW,"[Dutch, Papiamento]",[],"[12.5, -69.96666666]","[nld, pap]",,{},,,parliamentary democracy,TBD
Afghanistan,652230.00,AF,AFG,"[Dari, Pashto, Turkmen]","[IRN, PAK, TKM, UZB, TJK, CHN]","[33, 65]","[prs, pus, tuk]",27000000.0,{'Islam': 0.9956},1.946902e+10,561.778746,other,TBD
Angola,1246700.00,AO,AGO,[Portuguese],"[COG, COD, ZMB, NAM]","[-12.5, 18.5]",[por],19114176.0,{'Christianism': 0.8912},8.963316e+10,3110.808183,presidential republic,TBD
Anguilla,91.00,AI,AIA,[English],[],"[18.25, -63.16666666]",[eng],,{},,,parliamentary democracy,TBD
Åland Islands,1580.00,AX,ALA,[Swedish],[],"[60.116667, 19.9]",[swe],,{},,,unknown,TBD
Albania,28748.00,AL,ALB,[Albanian],"[MNE, GRC, MKD, UNK]","[41, 20]",[sqi],3195525.0,"{'Christianism': 0.2144, 'Islam': 0.63, 'Non-r...",1.192689e+10,4146.896250,parliamentary republic,TBD
Andorra,468.00,AD,AND,[Catalan],"[FRA, ESP]","[42.5, 1.5]",[cat],85500.0,{'Christianism': 0.907},,,parliamentary democracy,TBD
United Arab Emirates,83600.00,AE,ARE,[Arabic],"[OMN, SAU]","[24, 54]",[ara],6236650.0,"{'Islam': 0.6748, 'Hindu': 0.2225}",3.487433e+11,37622.207458,other,TBD
Argentina,2780400.00,AR,ARG,"[Guaraní, Spanish]","[BOL, BRA, CHL, PRY, URY]","[-34, -64]","[grn, spa]",40399992.0,"{'Christianism': 0.8515, 'Non-religious': 0.12}",5.458662e+11,12449.216852,presidential republic,TBD
Armenia,29743.00,AM,ARM,"[Armenian, Russian]","[AZE, GEO, IRN, TUR]","[40, 45]","[hye, rus]",3245781.0,{'Christianism': 0.951},1.054733e+10,3606.152057,presidential republic,TBD
