Within this notebook we scrape the world bank blog post webpages.

## NOTE
Remember to have your `VPN` active.

In [1]:
import time 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common import exceptions as e
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [44]:
from tqdm import tqdm
from IPython.display import Markdown, display

In [2]:
driver = webdriver.Chrome()

In [3]:
driver.implicitly_wait(10)

In [30]:
url = 'https://blogs.worldbank.org/search?keyword=&f%5B0%5D=countries%3A143&f%5B1%5D=language%3Aen'
driver.get(url)

### Blog Post URLs

In [17]:
def get_urls():
    url_webelements = [webelement.find_element(By.TAG_NAME, 'a') 
                    for webelement in driver.find_elements(By.XPATH, "//h3[@class='field-content']")]
    urls = [webelement.get_attribute('href') for webelement in url_webelements]
    return urls

### Paginator
This function will *flip the page*, so to speak; thus encapsulating the slightly complex logic.

In [53]:
def paginate() -> list : 
    # Element found each iteration to avoid StaleElement exceptions
    next_page = driver.find_elements(By.XPATH, "//a[@title='Go to next page']")

    next_page_url = next_page[0].get_attribute('href')
    driver.get(next_page_url)

    return next_page

### Navigation

In [None]:
all_urls = []

next_page = driver.find_elements(By.XPATH, "//a[@title='Go to next page']")

try:
    # Redundant condition, meant to show that we have reached the last page. 
    # Replace with while True
    while len(next_page) != 0:
        res = get_urls()
        all_urls.append(res)

        next_page = paginate()
        time.sleep(2.5)
except IndexError:
    print("Succesfully reached the last page")

In [32]:
len(all_urls)

46

### Postprocessing

In [33]:
def flatten_list(nested_list):
    flattened_list = []
    for element in nested_list:
        if isinstance(element, list):
            flattened_list.extend(flatten_list(element))
        else:
            flattened_list.append(element)
    return flattened_list

In [41]:
all_urls_flattened = flatten_list(all_urls)
print(len(all_urls_flattened))
all_urls_flattened[:5]

452


['https://blogs.worldbank.org/education/tale-two-early-grade-reading-programs',
 'https://blogs.worldbank.org/dev4peace/many-refugee-shocks-can-be-predicted-and-we-can-prepare-them',
 'https://blogs.worldbank.org/health/training-more-nurses-bring-quality-health-care-hard-reach-communities-kenya',
 'https://blogs.worldbank.org/dev4peace/beyond-humanitarian-assistance-enabling-refugees-economic-empowerment-through',
 'https://blogs.worldbank.org/youth-transforming-africa/nurturing-minds-fueling-futures-kenya-conversation-wawira-njiru']

In [45]:
article_urls = all_urls_flattened

### Save and Read

In [35]:
with open("data/world_bank_document_urls.txt", "w", encoding='utf-8') as f:
    for url in all_urls_flattened:
        f.writelines(url + '\n')

## Articles

### Loading

In [49]:
with open("data/world_bank_document_urls.txt", encoding='utf-8') as f:
    article_urls = f.readlines()
len(article_urls)

452

In [50]:
driver = webdriver. Chrome()

### Helper Functions

In [43]:
def get_article_text():
    driver.switch_to.default_content()
    sections = driver.find_elements(By.XPATH, "//section[@class='mainsection']")
    
    return sections[0].text

### Webscraping Mechanism

In [51]:
articles = []
skipped = []
for article_no, article_url in enumerate(tqdm(article_urls)):
    try:
        driver.get(article_url)

        article = get_article_text()
        articles.append(article)
    except Exception as e:
        skipped.append(article_no)
        print(f"The following occurred on article number {article_no}: {e} \nLoading next article...")
        continue

    time.sleep(1)

  0%|          | 1/452 [00:06<49:14,  6.55s/it]

The following occurred on article number 0: list index out of range 
Loading next article...


100%|██████████| 452/452 [41:48<00:00,  5.55s/it]


### Save

In [54]:
with open("data/world_bank_articles.txt", "w", encoding='utf-8') as f:
    for article in articles:
        f.write("\n\n" + article)
        f.write("\n\n" + "-" * 150)