In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
%matplotlib inline

In [2]:
url = 'https://en.wikipedia.org/wiki/'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the article.
r = requests.get(url + 'Afghanistan', timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html.parser')
# Narrow `soup` down to the infobox table. find() returns None when no table
# carries this class string, so fail with a clear message here instead of an
# AttributeError on the find_all() call below.
soup = soup.find('table', {'class': 'infobox geography vcard'})
if soup is None:
    raise ValueError("No 'infobox geography vcard' table found on the page")
# All rows of the infobox table.
tr_tags = soup.find_all('tr')

## Gathering language, country codes and area of each country
The dataset is taken from https://github.com/mledoze/countries

In [145]:
def get_language(df):
    """Map each country name to the list of its language names.

    Parameters
    ----------
    df : DataFrame
        Must expose a ``name`` column and a ``dict_of_languages`` column
        whose entries are dicts; the dict *values* are collected.

    Returns
    -------
    dict
        ``{country name: [language name, ...]}``. A country whose language
        dict is empty maps to an empty list. If a name occurs more than
        once, the last occurrence wins.
    """
    names = df.name.values
    per_country = df.dict_of_languages.values
    return {
        country: list(mapping.values())
        for country, mapping in zip(names, per_country)
    }

            
def get_language_codes(df):
    """Map each country name to the list of its language codes.

    Parameters
    ----------
    df : DataFrame
        Must expose a ``name`` column and a ``dict_of_languages`` column
        whose entries are dicts; the dict *keys* are collected.

    Returns
    -------
    dict
        ``{country name: [language code, ...]}``. A country whose language
        dict is empty maps to ``['unknown']``. If a name occurs more than
        once, the last occurrence wins.
    """
    codes_by_country = {}

    for country, mapping in zip(df.name.values, df.dict_of_languages.values):
        codes = list(mapping.keys())
        # Fall back to a placeholder when the country lists no languages.
        codes_by_country[country] = codes if codes else ['unknown']

    return codes_by_country

def add_columns(df):
    """Add ``name``, ``languages`` and ``language_codes`` columns to ``df``.

    The new columns are built row by row, in row order, so they stay aligned
    with the frame even when two rows share the same common name. (Going
    through a dict keyed by country name would collapse such rows and make
    the column assignment fail on a length mismatch.)

    Parameters
    ----------
    df : DataFrame
        Must have a ``dict_of_names`` column of dicts with a ``'common'``
        key and a ``dict_of_languages`` column of dicts mapping language
        code -> language name.

    Returns
    -------
    DataFrame
        The same ``df`` object, modified in place, with:

        * ``name`` - the ``'common'`` entry of ``dict_of_names``;
        * ``languages`` - list of language names (empty list if none);
        * ``language_codes`` - list of language codes, or ``['unknown']``
          when the language dict is empty.
    """
    # Adding name column
    df['name'] = [name['common'] for name in df.dict_of_names.values]

    language_dicts = df.dict_of_languages.values

    # Adding languages & language codes columns, one entry per row
    df['languages'] = [list(mapping.values()) for mapping in language_dicts]
    df['language_codes'] = [
        list(mapping.keys()) or ['unknown'] for mapping in language_dicts
    ]

    return df

In [156]:
# Columns kept for the rest of the analysis ('name' becomes the index below)
cols = ['area', 'cca2', 'cca3', 'ccn3', 'name', 'language_codes', 'languages', 'dict_of_languages']

# Load the raw data; the original 'name' / 'languages' columns hold dicts, so
# rename them to free those names for the flattened columns, which
# add_columns() then derives (country name, languages, language codes).
countries = add_columns(
    pd.read_json('countries.json').rename(
        columns={'name': 'dict_of_names', 'languages': 'dict_of_languages'}
    )
)

# Restrict to the useful columns and index the frame by country name
countries = countries[cols].set_index('name')

countries.head()

Unnamed: 0_level_0,area,cca2,cca3,ccn3,language_codes,languages,dict_of_languages
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aruba,180.0,AW,ABW,533,"[nld, pap]","[Dutch, Papiamento]","{'nld': 'Dutch', 'pap': 'Papiamento'}"
Afghanistan,652230.0,AF,AFG,4,"[prs, pus, tuk]","[Dari, Pashto, Turkmen]","{'prs': 'Dari', 'pus': 'Pashto', 'tuk': 'Turkm..."
Angola,1246700.0,AO,AGO,24,[por],[Portuguese],{'por': 'Portuguese'}
Anguilla,91.0,AI,AIA,660,[eng],[English],{'eng': 'English'}
Åland Islands,1580.0,AX,ALA,248,[swe],[Swedish],{'swe': 'Swedish'}


## Gathering population
For the population data we used the World Religion Dataset (National Religion Dataset), available at http://www.thearda.com/Archive/Files/Downloads/WRDNATL_DL2.asp

In [170]:
population_df = pd.read_excel('World Religion Dataset - National Religion Dataset.xlsx')

# Rows for the year 2010 (YEAR is compared against the string '2010')
year_mask = population_df['YEAR'] == '2010'

# Identifier columns plus every percentage ('PCT') column except the last one
pct_cols = [col for col in population_df.columns if 'PCT' in col]
cols = ['ISO3', 'NUMISO', 'COUNTRY', 'POP'] + pct_cols[:-1]

# Select the rows and columns in one step
population_df = population_df.loc[year_mask, cols]
population_df.head()

Unnamed: 0,ISO3,NUMISO,COUNTRY,POP,CHPRTPCT,CHCATPCT,CHORTPCT,CHANGPCT,CHOTHPCT,CHGENPCT,...,SIGENPCT,SHGENPCT,BAGENPCT,TAGENPCT,JAGENPCT,COGENPCT,SYGENPCT,ANGENPCT,NORELPCT,OTGENPCT
13,USA,840.0,United States of America,312750000,0.3829,0.2507,0.022499,0.015499,0.0738,0.7454,...,0.001299,0.0005,0.0015,0.0,0.0003,0.0003,0.002599,0.005699,0.19,0.0025
27,CAN,124.0,Canada,34500000,0.2298,0.4202,0.022799,0.078899,0.014399,0.7661,...,0.007999,0.0,0.0005,9.9e-05,9.9e-05,9.9e-05,0.0008,0.0021,0.1643,0.001
35,BHS,44.0,Bahamas,313312,0.676,0.14,0.0,0.15,0.0,0.966,...,0.0,0.0,0.0,0.0003,0.0,0.0,0.0,0.0032,0.028999,0.0005
49,CUB,192.0,Cuba,11241161,0.048899,0.6,0.0,0.0,0.009999,0.6589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.1315,0.0
63,HTI,332.0,Haiti,9760832,0.1,0.72,0.0,0.0,0.0,0.82,...,0.0,0.0,0.0009,0.0,0.0,0.0,0.45,0.0,0.1,0.0
