In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import scipy.sparse.linalg
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In this notebook we are going to compute the dataframes containing the distances between countries regarding both religion and government type.

For the religion distances we opted for computing the dot product of each pair of countries with an euclidean metric, while for the government we used the numeric government type and computed the absolute distance between the different government types of each pair of countries.

###  Religion distances

In [2]:
# Reading the dataframe and selecting religion columns
data = pd.read_pickle(os.path.join(os.getcwd(), os.pardir,'DataEnriching','data.pickle'))
data.set_index('ISO2', inplace=True)
cols = data.columns.tolist()[16:]
rel_data = data[cols]
rel_data.head()

Unnamed: 0_level_0,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion
Unnamed: 0_level_1,christianity,buddhism,hindu,jewish,muslim,oriental,other,animist,atheist,unaffiliated
ISO2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AW,0.802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AF,0.0,0.0,0.0,0.0,0.997,0.0,0.0,0.0,0.0,0.0
AO,0.792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AI,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AX,,,,,,,,,,


When computing the euclidean distance, the distance between countries with no data will be 0, meaning that for the visualization they will seem to be similar regarding religion. This makes no sense because we cannot assume that all the countries with missing data will be similar in terms of religion. Instead, it makes more sense to set them as separate from the rest of the countries. In order to get that these countries will be distant to every other country, we will set the percentages to infinite, so that the euclidean distance to every other country will be infinite.

In [3]:
# Select all null rows and replace values with infinite
rel_data.loc[(rel_data == 0).all(axis=1)] = rel_data.loc[(rel_data == 0).all(axis=1)].replace(0.0, np.inf)
rel_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,religion,religion,religion,religion,religion,religion,religion,religion,religion,religion
Unnamed: 0_level_1,christianity,buddhism,hindu,jewish,muslim,oriental,other,animist,atheist,unaffiliated
ISO2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AW,0.802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AF,0.0,0.0,0.0,0.0,0.997,0.0,0.0,0.0,0.0,0.0
AO,0.792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AI,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AX,,,,,,,,,,


In [4]:
# Computing the religion distance between each pair of countries
rel_distances = spatial.distance.squareform(spatial.distance.pdist(rel_data,'euclidean'))

In [5]:
# Converting into a dataframe & displaying distances
rel_distance_df = pd.DataFrame(rel_distances, index=rel_data.index.tolist(), columns=rel_data.index.tolist())
rel_distance_df.fillna(np.inf, inplace=True)
rel_distance_df.head()

Unnamed: 0,AW,AF,AO,AI,AX,AL,AD,AE,AR,AM,...,VG,VI,VN,VU,WF,WS,YE,ZA,ZM,ZW
AW,0.0,1.279536,0.01,0.002,inf,0.850556,inf,1.041414,0.139442,0.134,...,0.021954,0.128,0.73128,0.022,0.188,0.118,1.274867,0.365308,0.153,0.041
AF,1.279536,0.0,1.273292,1.278284,inf,0.461654,inf,0.253513,1.370405,1.367518,...,1.272812,1.363418,1.001939,1.293439,1.40503,1.356617,0.006,1.074846,1.380592,1.305626
AO,0.01,1.273292,0.0,0.008,inf,0.843128,inf,1.034603,0.149345,0.144,...,0.019026,0.138,0.72134,0.032,0.198,0.128,1.2686,0.355317,0.163,0.051
AI,0.002,1.278284,0.008,0.0,inf,0.849066,inf,1.040048,0.141421,0.136,...,0.021024,0.13,0.729292,0.024,0.19,0.12,1.273609,0.36331,0.155,0.043
AX,inf,inf,inf,inf,0.0,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf


We can see that the distance between one country and itself is zero, and that the countries that didn't have data will have an infinite distance to every other country.

### Government type distances

In [6]:
# Loading dataframe
data = pd.read_pickle(os.path.join(os.getcwd(), os.pardir,'DataEnriching','data.pickle'))
data.reset_index(inplace=True)
data.set_index('ISO2', inplace=True)

In [7]:
# Keeping the only important column (numeric government type)
gov_type_df = data[['gov_type_num']]

# Creating the dataframe that will contain the distances
gov_distance_df = pd.DataFrame(columns=gov_type_df.index.tolist())

# Computing the distances for each country and appending the country row to the dataframe above
for country1, value1 in zip(gov_type_df.index.tolist(), gov_type_df['gov_type_num']):
    row = []
    for country1, value2 in zip(gov_type_df.index.tolist(), gov_type_df['gov_type_num']):
        row.append(abs(value1-value2))
    
    
    dictionary = dict(zip(data.index.tolist(), row))
    gov_distance_df = gov_distance_df.append(dictionary, ignore_index=True)
    

# Setting index and display result
gov_distance_df.index = gov_type_df.index
gov_distance_df.head(10)

Unnamed: 0_level_0,AW,AF,AO,AI,AX,AL,AD,AE,AR,AM,...,VG,VI,VN,VU,WF,WS,YE,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AW,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
AF,2.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,...,2.0,0.0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0
AO,2.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,...,2.0,0.0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0
AI,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
AX,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
AL,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
AD,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
AE,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
AR,2.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,...,2.0,0.0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0
AM,2.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,...,2.0,0.0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0


Again, the government distance between the same country is zero as expected. In this case we didn't have any problems because we didn't have missing data.

In [8]:
# Pickling the resulting dataframes
rel_distance_df.to_pickle('rel_distance_df.pickle')
gov_distance_df.to_pickle('gov_distance_df.pickle')