In this notebook we scrape the World Bank blog post web pages.

## NOTE
Remember to have your `VPN` active.

In [None]:
import time, random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common import exceptions as e
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import InvalidArgumentException, StaleElementReferenceException

In [None]:
from tqdm import tqdm
from IPython.display import Markdown, display

In [None]:
# Start a Chrome WebDriver session; the cells below drive the browser through this object.
driver = webdriver.Chrome()

In [None]:
# Let element lookups wait up to 10 seconds for a match before failing.
driver.implicitly_wait(10)

In [None]:
# Blog search results page. The URL-encoded query string applies two filters:
# f[0]=countries:143 and f[1]=language:en; the keyword is left empty.
url = 'https://blogs.worldbank.org/search?keyword=&f%5B0%5D=countries%3A143&f%5B1%5D=language%3Aen'
driver.get(url)

### Blog Post URLs

In [None]:
# Walk the paginated search results and collect the link of every blog post.
# `all_urls` ends up as a list with one inner list of hrefs per results page.
from selenium.common.exceptions import NoSuchElementException

all_urls = []

while True:
    # Randomised pause before reading each page, so requests are not fired back to back.
    time.sleep(random.uniform(3, 4))
    blog_posts = driver.find_elements(By.XPATH, "//div[@class='blog_teaser']")
    try:
        page_urls = [
            blog.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a').get_attribute('href')
            for blog in blog_posts
        ]
    except StaleElementReferenceException:
        # The teaser elements were detached while being read: reload and retry this page.
        driver.refresh()
        continue

    # Store the links BEFORE navigating, so the final page's links are kept as well.
    all_urls.append(page_urls)

    try:
        next_page = driver.find_element(By.XPATH, "//a[@title='Go to next page']")
    except NoSuchElementException:
        # No "next page" link is taken to mean the last results page was reached
        # (assumption about the site's pager - confirm if the markup changes).
        break
    next_page.click()

In [None]:
# End the WebDriver session and close the browser now that the URLs are collected.
driver.quit()

### Postprocessing

In [None]:
def flatten_list(nested_list):
    """Return a new flat list with the items of *nested_list* in order.

    Any item that is a ``list`` is expanded recursively, to any depth.
    Items of every other type (including tuples) are kept as they are.
    """
    flat = []
    for item in nested_list:
        # Sub-lists contribute their own flattened content; anything else contributes itself.
        flat += flatten_list(item) if isinstance(item, list) else [item]
    return flat

In [None]:
# Collapse the nested URL lists collected during scraping into one flat list.
all_urls_flattened = flatten_list(all_urls)
print(len(all_urls_flattened))
# Preview the first five entries as the cell's output.
all_urls_flattened[:5]

### Save and Read

In [None]:
# Persist the collected URLs, one per line.
# The trailing "\n" is required: write()/writelines() add no line separators,
# and the loading cell reads this file back line by line.
with open("data/world_bank_document_urls.txt", "w", encoding='utf-8') as f:
    f.writelines(url + "\n" for url in all_urls_flattened)

In [4]:
# Scratch check of writelines(): it writes the strings back to back without adding
# line separators, so data/sample.txt ends up containing "abc".
with open('data/sample.txt', "w") as f:
    f.writelines(['a', 'b', 'c'])

## Articles

In [1]:
from async_get import acreate_coroutines

### Loading

In [2]:
# Read the saved URL file back into a list: one entry per line, newline characters removed.
with open("data/world_bank_document_urls.txt", encoding='utf-8') as f:
    article_urls = [line.replace('\n', '') for line in f]
# Show how many URLs were loaded.
len(article_urls)

1120

In [None]:
# Remove duplicates while keeping first-seen order. dict keys are unique and preserve
# insertion order, so the result is the same on every run (a set gives an arbitrary
# order, which would make positional indices into this list non-reproducible).
article_urls = list(dict.fromkeys(article_urls))

In [3]:
# Trial run on the first five URLs only, using the helper from the local `async_get` module.
# Top-level `await` relies on the notebook's running event loop.
# NOTE(review): judging by the captured output, the helper returns each page's raw HTML
# as a string - confirm against async_get.
articles = await acreate_coroutines(article_urls[:5])
articles

['\n<!DOCTYPE HTML>\n<html lang="en">\n    <head>\r\n\r\n    <meta content="text/html; charset=UTF-8" http-equiv="content-type"/>\r\n    \r\n\r\n    <link href="/content/dam/wbr-redesign/logos/wbg-favicon.png" rel="shortcut icon" type="image/png"/>\r\n\r\n    \r\n    <title>Putting Africa on the path to prosperity </title>\r\n    <meta name="keywords" content="Equitable Growth, Finance and Institutions,Africa,Voices"/>\r\n    \r\n    \r\n    <meta content="blog-details-page" name="template"/>\r\n    <meta content="/content/worldbankgroup/blogs/en/blogs/voices/putting-africa-on-the-path-to-prosperity" name="pagepath"/>\r\n    <meta content="width=device-width, initial-scale=1" name="viewport"/>\r\n    \r\n\r\n    \r\n\r\n    \r\n        <link crossorigin="" href="https://fonts.googleapis.com" rel="preconnect"/>\n<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>\n<link crossorigin="" href="https://assets.adobedtm.com" rel="preconnect"/>\n<link crossorigin="" href="

In [None]:
# Start a new Chrome WebDriver session for the article scraping below.
driver = webdriver.Chrome()

In [None]:
# Parse html and extract text


### Helper Functions

In [None]:
def get_article_text():
    """Return the text of the first ``<section class="mainsection">`` on the current page.

    Works on the module-level ``driver``: focus is first switched back to the
    top-level document, then the matching sections are looked up.

    Raises:
        IndexError: if the page has no matching section.
    """
    driver.switch_to.default_content()
    return driver.find_elements(By.XPATH, "//section[@class='mainsection']")[0].text

### Webscraping Mechanism

In [None]:
# Visit every article URL and collect the visible text of its <body>.
# `articles` holds the texts of the pages that loaded; `skipped` holds the positions
# (indices into `article_urls`) of the pages that failed, so they can be retried later.
articles = []
skipped = []
for article_no, article_url in enumerate(tqdm(article_urls)):
    try:
        driver.get(article_url)
        # Whole-page text; get_article_text() is the narrower alternative that
        # returns only the first "mainsection" element.
        article = driver.find_element(By.TAG_NAME, 'body').text
    except Exception as exc:
        # Best-effort scrape: one failing page must not abort the whole run.
        # Bound as `exc`, not `e`: this file imports `selenium.common.exceptions as e`,
        # and `except ... as e` would rebind that alias and unbind it after the handler.
        skipped.append(article_no)
        print(f"The following occurred on article number {article_no}: {exc} \nLoading next article...")
    else:
        articles.append(article)

    # Throttle after every request, failed ones included, so an error never leads
    # to an immediate follow-up hit on the site.
    time.sleep(random.uniform(1.5, 2.5))

### Save

In [None]:
# Dump all scraped article texts into one file. Each article is preceded by a blank
# line and followed by a blank line plus a 150-character dashed divider.
divider = "-" * 150
with open("data/world_bank_articles.txt", "w", encoding='utf-8') as f:
    for article in articles:
        f.write("\n\n" + article + "\n\n" + divider)