# My Capstone Project
## Visual correlation between religious distribution and cultural venues

### First load all libraries and grab the data for religious distribution from wikipedia


In [14]:
import pandas as pd
import numpy as np
import textdistance
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import requests
import folium
from folium.features import DivIcon
import matplotlib.cm as cm
import matplotlib.colors as colors
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Read all tables from the wikipedia page
df = pd.read_html('https://en.wikipedia.org/wiki/Religions_by_country')

# Select all the tables containing countries
list_countryTables = []
for i in range(len(df)):
    if df[i].columns[0] == ('Country[1]', 'Country[1]'):
        list_countryTables.append(i)

# Select only the columns containing %
full = pd.concat(np.array(df)[np.array(list_countryTables)], ignore_index=True)
cols = full.columns.get_level_values(1)
cols_perc = cols.str.contains("%")
cols_perc[0] = True
cols_perc[1] = True

full = full.loc[:, cols_perc]

# Remove countries and change names for better comparison with capitals
drop_entries = ["Total", "Asia", "Latin America and the Caribbean", "Sub-Saharan Africa", "Western Africa", 
                "Southern Africa", "North Africa", "Central Africa", "Middle East and North Africa", "Eastern Africa", 
                "Mayotte", "Réunion", "Eswatini", "Ivory Coast", "Guadeloupe", "Martinique", "Netherlands Antilles",
                "French Guiana", "Brunei", "Micronesia", "Europe"]
drop_rows =  [i in drop_entries for i in full.iloc[:, 0]]
full = full.drop(full.index[drop_rows]).reset_index(drop=True)

full.replace(None, "China", inplace=True, regex="Mainland China")

full.columns = ["CountryName", "Population", "Christian", "Muslim", "Irreligion", "Hindu", 
                "Buddhist", "Folk", "Other", "Jewish"]

print(full.head())

                         CountryName Population Christian Muslim Irreligion  \
0                             Angola   19080000      90.5    0.2        5.1   
1                           Cameroon   19600000      70.3   18.3        5.3   
2           Central African Republic    4400000      89.5    8.5          1   
3                               Chad   11230000      40.6   55.3        2.5   
4  Congo, Democratic Republic of the   65970000      95.8    1.5        1.8   

  Hindu Buddhist Folk Other Jewish  
0     0        0  4.2     0      0  
1     0        0  3.3   2.7      0  
2     0        0    1     0      0  
3     0        0  1.4   0.1      0  
4  0.05        0  0.7   0.1      0  


### Then load the csv file with capitals and their locations for each country

In [2]:
# Some entries are empty or contain invalid signs
full_valid = full.dropna().reset_index(drop=True)
full_valid.replace(None, "", inplace=True, regex="\[.*\]")
full_valid.replace(None, "", inplace=True, regex=",")
full_valid.replace(None, "", inplace=True, regex="%")
full_valid.replace(None, "", inplace=True, regex="< *")

# Read in the capitals of each country and the according coordinates
capitals = pd.read_csv("country-capitals.csv", sep=",")

# Match the country names from wiki with the ones from capital table
for i in range(len(full_valid)):
    dist_min = 999999
    for j in range(len(capitals)):
        dist = textdistance.levenshtein(full_valid.iloc[i, 0], capitals.iloc[j, 0])
        if dist < dist_min:
            dist_min = dist
            pos = j
    if dist_min>0:
        capitals.iloc[pos, 0] = full_valid.iloc[i, 0]

# Merge both tables
full_valid = pd.merge(full_valid, capitals, on='CountryName')
full_valid = full_valid.dropna().reset_index(drop=True)

print(full_valid.head())


                CountryName Population Christian Muslim Irreligion Hindu  \
0                    Angola   19080000      90.5    0.2        5.1     0   
1                  Cameroon   19600000      70.3   18.3        5.3     0   
2  Central African Republic    4400000      89.5    8.5          1     0   
3                      Chad   11230000      40.6   55.3        2.5     0   
4     Congo Republic of the    4040000      85.9    1.2          9     0   

  Buddhist Folk Other Jewish CapitalName  CapitalLatitude  CapitalLongitude  \
0        0  4.2     0      0      Luanda        -8.833333         13.216667   
1        0  3.3   2.7      0     Yaounde         3.866667         11.516667   
2        0    1     0      0      Bangui         4.366667         18.583333   
3        0  1.4   0.1      0   N'Djamena        12.100000         15.033333   
4        0  2.8   1.1      0    Kinshasa        -4.316667         15.300000   

  CountryCode ContinentName  
0          AO        Africa  
1       

### Draw folium maps for each major religion

In [37]:
# Calculate maximum population for determining circle marker sizes
max_pop = full_valid.iloc[:,1].astype("int").max()

# Draw a map for a selected column with a selected color
def drawMap(map_to_draw, column, colo):
    for i in range(len(full_valid)):
        label = '{}'.format(full_valid.iloc[i, 0])
        label = folium.Popup(label, parse_html=True)
        value = full_valid.loc[i, column]
        if float(value)>0:
            folium.CircleMarker(
                [float(full_valid.loc[i, "CapitalLatitude"]), float(full_valid.loc[i, "CapitalLongitude"])],
                radius=np.sqrt(float(full_valid.loc[i, 'Population'])*float(full_valid.loc[i, column])/100)/np.sqrt(max_pop)*25.+0.001,
                popup=label,
                color=colo,
                fill=True,
                fill_color=colo,
                opacity=0.9,
                fill_opacity=0.8,
                weight=0,
            ).add_to(map_to_draw)
    folium.map.Marker(
    [70, -70],
    icon=DivIcon(
        icon_size=(150,36),
        icon_anchor=(0,0),
        html=f'<div style="font-size: 24pt">{column}</div>',
        )
    ).add_to(map_to_draw)

    
map_christianity = folium.Map(location=[15, 25], zoom_start=2)
drawMap(map_christianity, "Christian", colors.rgb2hex([1., 0., 0., 1.]))
map_christianity

In [75]:
map_muslim = folium.Map(location=[15, 25], zoom_start=2)
drawMap(map_muslim, "Muslim", colors.rgb2hex([0, 1, 0., 1.]))
map_muslim

In [39]:
map_atheist = folium.Map(location=[15, 25], zoom_start=2)
drawMap(map_atheist, "Irreligion", colors.rgb2hex([0, 0, 1, 1.]))
map_atheist

In [40]:
map_hindu = folium.Map(location=[15, 25], zoom_start=2)
drawMap(map_hindu, "Hindu", colors.rgb2hex([0, 1., 1, 1.]))
map_hindu

In [41]:
map_buddhist = folium.Map(location=[15, 25], zoom_start=2)
drawMap(map_buddhist, "Buddhist", colors.rgb2hex([1., 0., 1, 1.]))
map_buddhist

### Make a call to the foursquare API for each capital and get the most important venues

In [12]:
#CLIENT_ID = '2PEB55DZZ0P3QX45ROLHSAVQEQYTR03FC5A1RTSN5Y2G41IH' # your Foursquare ID
#CLIENT_SECRET = 'CP3XXKJGO0DXCBDDUQVCOA4WRHW335NVQKITTBSZ2KHYKYXK' # your Foursquare Secret
CLIENT_ID = '3JNKXJIDQQXS102P1O4252VATYCRHRMNAMFT1MYTDEB0QOFL' # your Foursquare ID
CLIENT_SECRET = 'ATFKERP1AWJWOB30LG0520C1MA11WUOLVVPPQPLMIDLRFK0B' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 1000

def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()
        
        try:
            results = results["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except:
            print("No venues found for ", name)

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['CountryName', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

venues = getNearbyVenues(names=capitals['CountryName'],
                                   latitudes=capitals['CapitalLatitude'],
                                   longitudes=capitals['CapitalLongitude']
                                  )

# Create a one hot encoding for the venue category
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
# add CountryName column back to dataframe
onehot['CountryName'] = venues['CountryName'] 

# move CountryName column to the first column
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

# Calculate a mean distribution of venue categories for each country
grouped = onehot.groupby('CountryName').mean().reset_index()


No venues found for  Antarctica
No venues found for  Hong Kong
No venues found for  Heard Island and McDonald Islands
No venues found for  Macau


### Cluster all capitals depending on venue category distribution

In [71]:
# set number of clusters
kclusters = 7

grouped_clustering = grouped.drop('CountryName', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=133, n_jobs=-1).fit(grouped_clustering)

# Create a dataframe with predictions
prediction = pd.DataFrame(columns=('CountryName', 'Cluster Labels'))
prediction['CountryName'] = grouped['CountryName']
prediction['Cluster Labels'] = kmeans.labels_

# Sort the clusters by number of elements in them
labels_sorted = prediction.groupby('Cluster Labels').count().sort_values(by='CountryName', ascending=False).index.values
labels_sorted_idx = [np.where(labels_sorted == i)[0][0] for i in range(kclusters)]

# join the predictions with the big dataframe
capitalsWithLabels = full_valid.join(prediction.set_index('CountryName'), on='CountryName')
capitalsWithLabels['Cluster Labels'] = capitalsWithLabels['Cluster Labels'].fillna(-1).astype("int32")


### Show a map for all clusters

In [74]:
# create map
map_clusters = folium.Map(location=[15, 25], zoom_start=2)

# set color scheme for the clusters
colors_array = cm.rainbow([0, 1, 0.5, 0.25, 0.75, 0.125, 0.875, 0.375, 0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])
colors_array = [[1,0,0,1],[0,1,0,1],[0,0,1,1],[1,0,1,1],[0,1,1,1],[1,.5,0,1],[.5,0,1,1]]
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for i in capitalsWithLabels.index:
    label = folium.Popup(str(capitalsWithLabels.loc[i, 'CountryName']) + ' Cluster ' + str(capitalsWithLabels.loc[i, 'Cluster Labels']), parse_html=True)
    folium.CircleMarker(
        [capitalsWithLabels.loc[i, 'CapitalLatitude'], capitalsWithLabels.loc[i, 'CapitalLongitude']],
        radius=np.sqrt(float(capitalsWithLabels.loc[i, 'Population']))/np.sqrt(max_pop)*25.,
        popup=label,
        color=rainbow[labels_sorted_idx[capitalsWithLabels.loc[i, 'Cluster Labels']]],
        fill=True,
        weight=0,
        fill_color=rainbow[labels_sorted_idx[capitalsWithLabels.loc[i, 'Cluster Labels']]],
        fill_opacity=0.8).add_to(map_clusters)
       
        
# Complicated function to create a simple legend in a folium map
from branca.element import Template, MacroElement

template = """
{% macro html(this, kwargs) %}

<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>jQuery UI Draggable - Default functionality</title>
  <link rel="stylesheet" href="//code.jquery.com/ui/1.12.1/themes/base/jquery-ui.css">

  <script src="https://code.jquery.com/jquery-1.12.4.js"></script>
  <script src="https://code.jquery.com/ui/1.12.1/jquery-ui.js"></script>
  
  <script>
  $( function() {
    $( "#maplegend" ).draggable({
                    start: function (event, ui) {
                        $(this).css({
                            right: "auto",
                            top: "auto",
                            bottom: "auto"
                        });
                    }
                });
});

  </script>
</head>
<body>

 
<div id='maplegend' class='maplegend' 
    style='position: absolute; z-index:9999; border:2px solid grey; background-color:rgba(255, 255, 255, 0.8);
     border-radius:6px; padding: 10px; font-size:14px; right: 20px; bottom: 20px;'>
     
<div class='legend-title'>Legend</div>
<div class='legend-scale'>
  <ul class='legend-labels'>
    <li><span style='background:rgb(255,0,0);opacity:0.7;'></span>Cluster 1</li>
    <li><span style='background:rgb(0,255,0);opacity:0.7;'></span>Cluster 2</li>
    <li><span style='background:rgb(0,0,255);opacity:0.7;'></span>Cluster 3</li>
    <li><span style='background:rgb(255,0,255);opacity:0.7;'></span>Cluster 4</li>
    <li><span style='background:rgb(0,255,255);opacity:0.7;'></span>Cluster 5</li>
    <li><span style='background:rgb(255,127,0);opacity:0.7;'></span>Cluster 6</li>
    <li><span style='background:rgb(127,0,255);opacity:0.7;'></span>Cluster 7</li>

  </ul>
</div>
</div>
 
</body>
</html>

<style type='text/css'>
  .maplegend .legend-title {
    text-align: left;
    margin-bottom: 5px;
    font-weight: bold;
    font-size: 90%;
    }
  .maplegend .legend-scale ul {
    margin: 0;
    margin-bottom: 5px;
    padding: 0;
    float: left;
    list-style: none;
    }
  .maplegend .legend-scale ul li {
    font-size: 80%;
    list-style: none;
    margin-left: 0;
    line-height: 18px;
    margin-bottom: 2px;
    }
  .maplegend ul.legend-labels li span {
    display: block;
    float: left;
    height: 16px;
    width: 30px;
    margin-right: 5px;
    margin-left: 0;
    border: 1px solid #999;
    }
  .maplegend .legend-source {
    font-size: 80%;
    color: #777;
    clear: both;
    }
  .maplegend a {
    color: #777;
    }
</style>
{% endmacro %}"""

macro = MacroElement()
macro._template = Template(template)

map_clusters.get_root().add_child(macro)
map_clusters