This notebook collects documents of economic importance from the IMF website so that they can be analysed later.

### Notes
Depending on your location, you may need a VPN connection to access the IMF website.

In [None]:
import time
from tqdm import tqdm

In [None]:
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [None]:
# Start a Chrome browser session driven by Selenium; all scraping below goes through it.
driver = webdriver.Chrome()

In [None]:
# IMF country page (country code KEN) whose document listing is scraped below.
url = 'https://www.imf.org/en/Countries/KEN'
driver.get(url)

## Links

### Get Document URLs

In [None]:
# Every entry of the listing on the current page carries the CSS class "result-item".
results = driver.find_elements(By.CLASS_NAME, "result-item")

# Keep the href of the first <a> found inside each result item.
doc_urls = [element.find_element(By.TAG_NAME, "a").get_attribute('href') for element in results]

: 

### Multipage Scraping

#### Helper functions

In [None]:
def get_doc_urls(web_driver=None):
    """Collect the document links listed on the page currently open in the browser.

    Args:
        web_driver: Selenium WebDriver to read from. When omitted, the
            notebook-level ``driver`` is used, so existing ``get_doc_urls()``
            calls keep working unchanged.

    Returns:
        list: The ``href`` of the first ``<a>`` inside every element with
        class ``result-item``, in page order. Result items that contain no
        anchor, or whose anchor has no ``href``, are skipped.
    """
    browser = driver if web_driver is None else web_driver

    doc_urls = []
    for element in browser.find_elements(By.CLASS_NAME, "result-item"):
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises NoSuchElementException; one malformed result
        # item must not abort the scrape of the whole page.
        anchors = element.find_elements(By.TAG_NAME, "a")
        if not anchors:
            continue

        href = anchors[0].get_attribute('href')
        # get_attribute returns None for a missing attribute; a None entry
        # would break the later `url + '\n'` when the links are saved.
        if href:
            doc_urls.append(href)
    return doc_urls

#### Navigator Arrows

In [None]:
# Locate the "next page" pagination arrow(s) on the current page and display
# the link target of each one (the list is the cell's output).
next_page = driver.find_elements(By.XPATH, "//a[@class='pagination-arrow next']")
[page.get_attribute('href') for page in next_page]
# Manual check, kept disabled: click one of the arrows to move forward.
# next_page[1].click()

In [None]:
# Locate the "previous page" pagination arrow(s) on the current page and
# display the link target of each one (the list is the cell's output).
previous_page = driver.find_elements(By.XPATH, "//a[@class='pagination-arrow previous']")
[page.get_attribute('href') for page in previous_page]
# Manual check, kept disabled: click the first arrow to move back.
# previous_page[0].click()

#### Navigation

In [None]:
# One entry per visited page; each entry is the list of links found on that page.
all_doc_urls = []

try:
    while True:
        res = get_doc_urls()

        # If the page shows exactly the links of the previous page, the click
        # did not advance (e.g. a disabled arrow on the last page): stop
        # instead of looping forever and storing duplicates.
        if all_doc_urls and res == all_doc_urls[-1]:
            break
        all_doc_urls.append(res)

        # Re-locate the arrow on every page. A reference obtained before a
        # navigation belongs to the old DOM and raises
        # StaleElementReferenceException when clicked.
        next_page = driver.find_elements(By.XPATH, "//a[@class='pagination-arrow next']")
        if len(next_page) == 0:
            # No "next" arrow: the last page has already been collected above.
            break

        next_page[0].click()
        time.sleep(2.5)  # give the next page of results time to load
except exceptions.StaleElementReferenceException:
    # The page re-rendered between locating and clicking the arrow.
    print(f"Advanced upto page: {len(all_doc_urls)}")

In [None]:
# Close the browser and end this WebDriver session; link collection is done.
driver.quit()

#### Postprocessing

In [None]:
def flatten_list(nested_list):
    """Return a new flat list with the items of an arbitrarily nested list.

    Only ``list`` instances are descended into; every other value (strings,
    tuples, ...) is kept as a single item. Item order is preserved.
    """
    flat = []
    for item in nested_list:
        # Recurse into sub-lists, wrap anything else so it can be extended in.
        flat += flatten_list(item) if isinstance(item, list) else [item]
    return flat

In [None]:
# all_doc_urls holds one list per visited page; merge them into a single list of links.
flat_doc_urls = flatten_list(all_doc_urls)

#### Save

In [None]:
# Persist the collected links, one URL per line.
with open("../data/imf_document_links.txt", mode="w", encoding='utf-8') as links_file:
    links_file.writelines(link + '\n' for link in flat_doc_urls)

## Articles
Having obtained a list of all the links to the articles, we now visit each individually and scrape their contents.

### Note
We need to handle the `page not found` error: note which pages are missing and proceed with the next one.

In [None]:
# Open a new Chrome session for visiting the individual article pages.
driver = webdriver.Chrome()

In [None]:
# Accumulates the scraped text of each article; filled by the scraping loop.
articles = []

In [None]:

# Visit every collected link and store the text of its <article> element.
# Exactly one entry is appended per URL, so `articles` stays aligned with
# `flat_doc_urls` even when a page cannot be scraped.
for page_no, url in enumerate(flat_doc_urls):
    driver.get(url)

    # Start from an empty result on every iteration. Without this, a failure
    # below would re-append the previous page's text (or raise NameError on
    # the very first URL).
    article = ''

    try:
        article = driver.find_element(By.TAG_NAME, "article").text

    except exceptions.NoSuchElementException:
        # Selenium reports a missing element with NoSuchElementException, never
        # AttributeError, so this is the only "not found" branch that can run.
        # Pages without an <article> (e.g. "page not found") keep an empty entry.
        print(f"Article element not found on page {page_no}")

    except Exception as e:
        # Any other failure: report it, keep the empty entry and carry on.
        print(f"The following error occured: {e} \n\n")
        print(f"Advanced up to page number: {page_no}")

    articles.append(article)

    # Pause between requests to avoid hammering the site.
    time.sleep(1.5)


In [None]:
# Sanity check: how many article entries have been collected so far.
len(articles)

In [None]:
# Close the browser and end the WebDriver session used for article scraping.
driver.quit()

### Save

In [None]:
# Write all article texts to a single file. Each article is preceded by a
# blank line and followed by a blank line plus a 150-dash rule.
separator = "\n\n" + "-" * 150
with open("../data/imf_article_txt", mode="w", encoding='utf-8') as out_file:
    for article in articles:
        out_file.write("\n\n" + article + separator)