In [1]:
## Data Collection

In [2]:
# import libraries
import pandas as pd
import numpy as np
import re # for regular expressions
from bs4 import BeautifulSoup # for web scraping
import requests # for url requests 
from itertools import islice # for cleaning tables on CIA website 

In [3]:
# url of the CIA World Factbook Country Comparisons page
url = 'https://www.cia.gov/the-world-factbook/references/guide-to-country-comparisons/'
# make website request for information; the timeout keeps a stalled
# connection from hanging the kernel (requests waits indefinitely by default)
r = requests.get(url, timeout=30)
# stop here on an HTTP error so later cells never parse an error page
r.raise_for_status()
# display the status code as a quick check that the website is responding
r.status_code

200

In [4]:
# collect all html from Country Comparisons
# the parser is named explicitly so the parse does not depend on which
# optional parsers happen to be installed (bs4 otherwise guesses and warns)
soup = BeautifulSoup(r.text, "html.parser")

In [13]:
# collect all the country comparison chart links from url
anchors = soup.find_all('a', {"class":"link-button bold"})
# .get() tolerates an anchor without an href instead of raising KeyError
hrefs = [anchor.get('href', '') for anchor in anchors]
links = [href for href in hrefs if "/country-comparison" in href]
# number of comparison chart links found
len(links)

52

In [6]:
# chart pages of interest, identified by the slug that appears in each
# page's link (matched as a substring of the href)
pages_links = [
    "maternal-mortality-ratio",
    "infant-mortality-rate",
    "education-expenditures",
    "unemployment-rate",
    "gini-index-coefficient-distribution-of-family-income",
    "internet-users",
    "broadband-fixed-subscriptions",
]

In [7]:
# grab the country comparison chart links that contain one of the slugs in
# pages_links; any() keeps a link at most once even if several slugs match it
tables = [link for link in links if any(slug in link for slug in pages_links)]
# "unemployment-rate" is also a substring of the youth unemployment page's
# link, so remove the youth-unemployment-rate page explicitly
ele = '/the-world-factbook/field/youth-unemployment-rate-ages-15-24/country-comparison'
table_links = [link for link in tables if link != ele]

9

In [8]:
# scrape results, stored as three parallel lists with one entry per page
title = {"page":[], "headers":[], "cells":[]}

# Grab chart information from each page in table_links
for path in table_links:
    # table_links holds site-relative paths, so prefix the host
    page_url = "https://www.cia.gov" + path
    # the timeout keeps a stalled connection from hanging the kernel
    resp = requests.get(page_url, timeout=30)
    # stop on an HTTP error rather than parsing an error page
    resp.raise_for_status()
    bs = BeautifulSoup(resp.text, "html.parser")
    # grab page name for reference
    page = bs.title.text
    table = bs.find(class_="content-table table-auto")
    if table is None:
        # fail with the offending URL instead of an opaque AttributeError below
        raise ValueError("no comparison table found at {}".format(page_url))
    # grab table headers
    list_head = [header.text.strip() for header in table.find_all("th")]
    # grab table cell contents one <tr> at a time, so rows stay aligned
    # whatever the number of columns (no fixed "4 cells per row" assumption)
    sliced_content = []
    for table_row in table.find_all("tr"):
        row_cells = [cell.text for cell in table_row.find_all("td")]
        # rows without <td> cells (e.g. a header row made of <th>) are skipped
        if row_cells:
            sliced_content.append(row_cells)
    # append all three together so the parallel lists never get out of step
    title["page"].append(page)
    title["headers"].append(list_head)
    title["cells"].append(sliced_content)

In [9]:
## Save Data to csv files

In [10]:
# transform tables into dataframes to clean
# one dataframe per scraped table, keyed by its position in the scrape order;
# iterating over what was actually collected avoids a hard-coded table count
cia_dict = {}
for i, (cells, headers) in enumerate(zip(title["cells"], title["headers"])):
    cia_dict[i] = pd.DataFrame(cells, columns=headers)

In [11]:
# separate and name each data frame
# cia_dict is keyed by position in table_links, and that order follows the
# order of the links on the website, so each frame is looked up by the slug
# in its link rather than by an assumed fixed position
def _frame_for(slug):
    """Return the dataframe scraped from the page whose link contains /<slug>/.

    Raises LookupError unless exactly one entry of table_links matches.
    """
    # the surrounding slashes keep "unemployment-rate" from matching the
    # "youth-unemployment-rate-ages-15-24" page
    needle = "/{}/".format(slug)
    positions = [i for i, link in enumerate(table_links) if needle in link]
    if len(positions) != 1:
        raise LookupError(
            "expected exactly one page for {!r}, found {}".format(slug, len(positions))
        )
    return cia_dict[positions[0]]

maternal_mortality = _frame_for("maternal-mortality-ratio")
infant_mortality = _frame_for("infant-mortality-rate")
education_expenditures = _frame_for("education-expenditures")
unemployment = _frame_for("unemployment-rate")
gini_index = _frame_for("gini-index-coefficient-distribution-of-family-income")
internet_users = _frame_for("internet-users")
broadband = _frame_for("broadband-fixed-subscriptions")

In [12]:
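# Save the dataframes to .csv files
# The lines below are left commented out to avoid saving new copies on every run; uncomment them to write the files.
# Note: the files are written tab-separated (sep='\t') despite the .csv extension.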
#maternal_mortality.to_csv('maternal_mortality.csv', index=False, sep ='\t')
#infant_mortality.to_csv('infant_mortality.csv', index=False, sep ='\t')
#education_expenditures.to_csv('education_expenditures.csv', index=False, sep ='\t')
#unemployment.to_csv('unemployment.csv', index=False, sep ='\t')
#gini_index.to_csv('gini_index.csv', index=False, sep ='\t')
#internet_users.to_csv('internet_users.csv', index=False, sep ='\t')
#broadband.to_csv('broadband.csv', index=False, sep ='\t')