Within this notebook we scrape the world bank blog post webpages.

## NOTE
Remember to have your `VPN` active.

In [7]:
import time, random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common import exceptions as e
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    InvalidArgumentException, StaleElementReferenceException, ElementClickInterceptedException
    )

In [2]:
from tqdm import tqdm
from IPython.display import Markdown, display

In [3]:
driver = webdriver.Chrome()

In [4]:
driver.implicitly_wait(10)

In [5]:
url = 'https://blogs.worldbank.org/search?keyword=&f%5B0%5D=countries%3A143&f%5B1%5D=language%3Aen'
driver.get(url)

### Blog Post URLs

In [6]:
all_urls = []

while True:
    time.sleep(random.uniform(3, 4))
    blog_posts = driver.find_elements(By.XPATH, "//div[@class='blog_teaser']")
    try:
        url = [blog.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a').get_attribute('href') for blog in blog_posts]
    except StaleElementReferenceException:
        driver.refresh()
        continue
    
    # Click next page with retries
    for _ in range(3):
        try:
            next_page = driver.find_element(By.XPATH, "//a[@title='Go to next page']")
            next_page.click()
            break
        except ElementClickInterceptedException:
            driver.refresh()

    all_urls.append(url)

ElementClickInterceptedException: Message: element click intercepted: Element <a _ngcontent-c0="" class="page-link" rel="next" title="Go to next page">...</a> is not clickable at point (953, 584). Other element would receive the click: <div _ngcontent-c0="" class="ajax-div ng-tns-c0-0">...</div>
  (Session info: chrome=126.0.6478.62)
Stacktrace:
	GetHandleVerifier [0x00007FF73AF23E32+31618]
	(No symbol) [0x00007FF73AE9B099]
	(No symbol) [0x00007FF73AD5888A]
	(No symbol) [0x00007FF73ADB022E]
	(No symbol) [0x00007FF73ADADBE2]
	(No symbol) [0x00007FF73ADAB07B]
	(No symbol) [0x00007FF73ADAA246]
	(No symbol) [0x00007FF73AD9C281]
	(No symbol) [0x00007FF73ADCD10A]
	(No symbol) [0x00007FF73AD9BBA6]
	(No symbol) [0x00007FF73ADCD320]
	(No symbol) [0x00007FF73ADECA80]
	(No symbol) [0x00007FF73ADCCEB3]
	(No symbol) [0x00007FF73AD9A46B]
	(No symbol) [0x00007FF73AD9B001]
	GetHandleVerifier [0x00007FF73B229FFD+3202381]
	GetHandleVerifier [0x00007FF73B276A1D+3516269]
	GetHandleVerifier [0x00007FF73B26C490+3473888]
	GetHandleVerifier [0x00007FF73AFD5D36+760454]
	(No symbol) [0x00007FF73AEA6B3F]
	(No symbol) [0x00007FF73AEA1CD4]
	(No symbol) [0x00007FF73AEA1E62]
	(No symbol) [0x00007FF73AE9120F]
	BaseThreadInitThunk [0x00007FFD7EE87344+20]
	RtlUserThreadStart [0x00007FFD7F0DCC91+33]


In [None]:
driver.quit()

### Postprocessing

In [8]:
def flatten_list(nested_list):
    flattened_list = []
    for element in nested_list:
        if isinstance(element, list):
            flattened_list.extend(flatten_list(element))
        else:
            flattened_list.append(element)
    return flattened_list

In [18]:
all_urls_flattened = flatten_list(all_urls)
print(len(all_urls_flattened))
all_urls_flattened[:5]

1620


['https://blogs.worldbank.org/en/voices/financing-a-bright-future-for-south-asia',
 'https://blogs.worldbank.org/en/nasikiliza/enhancing-inclusive-growth-and-resilience-with-support-uganda-development-focused-refugee-approach-afe-0624',
 'https://blogs.worldbank.org/en/latinamerica/refugees-long-road-to-integration-in-latin-america',
 'https://blogs.worldbank.org/en/governance/integrity-and-transparency-of-spending-and-security-in-sub-sahar',
 'https://blogs.worldbank.org/en/climatechange/the-many-stories-of-adaptation-finance-']

### Save and Read

In [23]:
with open("data/world_bank_document_urls.txt", "w", encoding='utf-8') as f:
    for url in all_urls_flattened:
        f.write(f"{url}\n")

## Articles

In [24]:
from custom.async_get import acreate_coroutines

from bs4 import BeautifulSoup
from IPython.display import display, Markdown, HTML

### Loading

In [25]:
with open("data/world_bank_document_urls.txt", encoding='utf-8') as f:
    article_urls = f.readlines()
    article_urls = [url.replace('\n', '') for url in article_urls]
len(article_urls)

1620

In [26]:
article_urls = list(set(article_urls))          # remove duplicates

### HTTP Get

In [27]:
articles = await acreate_coroutines(article_urls)

### Extract text and useful metadata

In [28]:
article_texts = []
for article in tqdm(articles):
    soup = BeautifulSoup(article, 'html.parser')
    
    try:
        title = soup.find('h1', class_="blog_teaser__title").text
        authors = soup.find('div', class_="blog_teaser__link_container").text
        body = soup.find('div', class_="cmp-text").text

    except AttributeError:
        title = None
        authors = None
        body = soup.find('body').text


    article_text = f"""
        Title:      {title}
        
        Authors & Date Published : {authors}    

        Article Body:   {body}
    """
    article_texts.append(article_text)

100%|██████████| 1619/1619 [01:20<00:00, 20.13it/s]


### Save Article Text

In [29]:
with open("data/world_bank_articles.txt", "w", encoding='utf-8') as f:
    for article_text in article_texts:
        f.write("\n\n" + article_text)
        f.write("\n\n" + "-" * 150)

## Legacy
This is deprecated as it is costly both in terms of time and compute resources. The use of asynchronous get requests supercedes it. In fact, it leaves it in the dust.

In [17]:
# driver = webdriver.Chrome()
# def get_article_text():
#     driver.switch_to.default_content()
#     sections = driver.find_elements(By.XPATH, "//section[@class='mainsection']")
    
#     return sections[0].text
# articles = []
# skipped = []
# for article_no, article_url in enumerate(tqdm(article_urls)):
#     try:
#         driver.get(article_url)

#         article = driver.find_element(By.TAG_NAME, 'body').text
#         # article = get_article_text()
#         articles.append(article)
        
#     except Exception as e:
#         skipped.append(article_no)
#         print(f"The following occurred on article number {article_no}: {e} \nLoading next article...")
#         continue

#     time.sleep(random.uniform(1.5, 2.5))