In [1]:
# Import dependencies
import pandas as pd
import json
import re
import concurrent.futures
import time
import pymongo
from config import password

# Retrieve the geoJSON data which contains the list of country names
with open("../static/data/countries.geojson") as f:
    data = json.load(f)
    
    f.close()
    
# Use list comprehension to retreive the names from the json
names = [i["properties"]["ADMIN"] for i in data["features"]]

# Building a function which can be used in a loop
def scrape_wiki(country):
    # Building the query URL
    url = "https://en.wikipedia.org/wiki/"
    queryURL = url + country.replace(" ", "_")

    # Using pandas to scrape the URL
    wiki = pd.read_html(queryURL)[0]
    wiki.columns = [1, 2]
    
    # Dropping any N/A values
    wiki.dropna(how = "any", inplace = True)
    wiki.reset_index(drop = True, inplace = True)
    
    # Cleaning the data
    wiki[1] = wiki[1].apply(lambda x: x.replace("•", ""))
    
    # Defining target values & creating an empty list
    target = ["Capital", ".+language", "Population", "GDP\xa0(PPP)", "Currency"]
    values = []

    # Looping through the target values
    for i in target:
        # Check if the target is not in the list
        if i not in list(wiki[1]):
            # Checking for the regex expression
            if i == ".+language":
                for x in range(0, len(wiki)):
                    # If the regex expression is found, take the first value and end the loop
                    if re.search(i, wiki[1][x]):
                        values.append(wiki[2][x])
                        break
            else:
                # If nothing is found, append N/A
                values.append("N/A")
        else:
            # Looping through the dataframe to find data
            for x in range(0, len(wiki)):

                if wiki[1][x] == i:
                    if wiki[1][x] == "Population":
                        values.append(wiki[2][x + 1])
                        if wiki[1][x + 3] == "GDP\xa0(PPP)":
                            values.append(wiki[2][x + 2])
                        else:
                            values.append(wiki[2][x + 3])

                    elif wiki[1][x] == "GDP\xa0(PPP)":
                        values.append(wiki[2][x + 1])
                        values.append(wiki[2][x + 2])

                    else:
                        values.append(wiki[2][x])
                        
    # If a capital city is found, clean the value
    match = re.match(r"([a-z]+)([0-9]+)", values[0], re.I)
    if match:
        items = match.groups()
        values[0] = items[0]

    vals = []

    # Removing [ values from the data
    for x in values:
        vals.append(x.split("[")[0])

    vals[3] = vals[3].split(" ")[0]

    # Adding the country name
    vals.insert(0, country)
    
    # Creating the titles for the table
    titles = ["Country Name: ", "Capital City: ", "Official Language: ", 
              "Population: ", "Population Density: ", "GDP Total: ", "GDP Per Capita: ", "Currency: "]

    # Creating a dataframe to be stored
    df = pd.DataFrame(vals, titles)
    
    df.columns = ["data"]

    return df

In [2]:
# Creating an empty dictionary to hold all of the country data
d = {}

In [3]:
missed = []

# Creating a function to run the scrape wiki function for each country
def scrape(i):
    try:
        a = scrape_wiki(i).to_dict() # Added to.dict() so that the result only contains dictionaries
        d[i] = a
    except:
        missed.append(i)
        
# Creating a function that asynchronously runs the scrape function
def download_results(names):
    threads = 30
    
    with concurrent.futures.ThreadPoolExecutor(max_workers = threads) as executor:
        executor.map(scrape, names)

# Creating a main function to call the previous functions
def main(names):
    t0 = time.time()
    download_results(names)
    t1 = time.time()
    
    print(f"{t1-t0} seconds to download {len(names) - len(missed)} countries.\n{len(missed)} could not be retreived")

In [4]:
# Calling the main function
main(names)

24.277078866958618 seconds to download 202 countries.
53 could not be retreived


In [5]:
# Creating a connection to the MongoDB database
conn = f"mongodb+srv://admin:{password}@cluster0.c0z5f.mongodb.net/countries_info_db?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)

# Connecting to the correct database and collection
db = client.countries_info_db
countries_info = db.countries_info

# Updating the data in the collection with the scrape results
countries_info.update({}, d, upsert = True)

# Closing the connection
client.close()

  countries_info.update({}, d, upsert = True)
