In [1]:
# Import necessary packages
import requests
from lxml import etree
import re

In [9]:
# Find and extract all hyperlinks in the url page
def find_hrefs(url, hrefs, headers, base_url, task):
    '''
    Request ``url`` and merge the hyperlinks found on it into ``hrefs``.

    url: request site
    hrefs: all previous hyperlinks (the list itself is not modified)
    headers: HTTP headers
    base_url: prefix every kept link is rebuilt on
    task: "UN" keeps links starting with "/en", "EU" keeps links containing
        "press-room"; links ending in ".xml" are dropped in both cases.
        Any other value keeps every href exactly as found on the page.

    Returns a new list: ``hrefs`` followed by the newly found links, with
    duplicates removed and first-occurrence order preserved.
    '''
    # Local import so this block does not depend on the notebook's import cell.
    from urllib.parse import urlsplit

    def drop_prefix(link, prefixes):
        # Remove the first matching prefix as a whole string.
        # str.lstrip() must not be used here: it strips a *set of characters*,
        # so "/en/news".lstrip("/en") returns "ws" instead of "/news".
        for prefix in prefixes:
            if prefix and link.startswith(prefix):
                return link[len(prefix):]
        return link

    # request the url
    response = requests.get(url = url, headers = headers)
    selector = etree.HTML(response.text)
    found = selector.xpath('//@href')

    if task == "UN":
        # extract only hyperlinks starting with "/en"
        prefixes = ("/en",)
        found = [base_url + "/" + drop_prefix(s, prefixes).lstrip("/")
                 for s in found
                 if s.startswith('/en') and not s.endswith('.xml')]
    elif task == "EU":
        # links may be absolute (start with base_url) or site-relative
        # (start with the path part of base_url); handle both forms
        prefixes = (base_url, urlsplit(base_url).path)
        found = [base_url + "/" + drop_prefix(s, prefixes).lstrip("/")
                 for s in found
                 if "press-room" in s and not s.endswith('.xml')]

    # every href in the output should be unique; dict keys keep insertion
    # order, so the first occurrence of each link decides its position
    return list(dict.fromkeys(hrefs + found))

# Check whether a page is of the wanted type and contains the keyword
def type_keyword_judge(url, headers, url_type, type_xpath, keyword, keyword_xpath, task, n_release):
    '''
    Request ``url`` and decide whether it is a page of the wanted type that
    mentions the keyword; a page that passes both checks is saved to disk.

    url: request site
    headers: HTTP headers
    url_type: text that must be among the results of ``type_xpath``
    type_xpath: type xpath
    keyword: word (or phrase) that must occur in the page text as whole
        words; the comparison ignores case and runs of whitespace
    keyword_xpath: keyword xpath selecting the text nodes to search
    task: UN or EU task (passed on to save_page)
    n_release: number of press release (passed on to save_page)

    Returns True when the page matched and was saved, otherwise False.
    '''
    response = requests.get(url = url, headers = headers)
    selector = etree.HTML(response.text)

    # type consistency
    if url_type not in selector.xpath(type_xpath):
        return False

    # keyword consistency
    page_text = " ".join(selector.xpath(keyword_xpath))
    # remove all punctuation marks
    page_text = re.sub(r'[^\w\s]', '', page_text)
    # lower-case and collapse every run of whitespace (newlines, tabs,
    # repeated spaces) into a single x20 space, on both sides of the test,
    # so that "Crisis" or a phrase split across text nodes still matches
    page_words = " ".join(page_text.lower().split())
    wanted = " ".join(keyword.lower().split())

    # padding with spaces makes the test match whole words only
    if (" " + wanted + " ") not in (" " + page_words + " "):
        return False

    save_page(selector, task, n_release)
    return True

# One crawling round: visit every URL in the current list that has not been requested yet
def single_crawler(output, base_url, headers, url_type, type_xpath, keyword, keyword_xpath, task, n_release):
    '''
    Run one crawling round over the not-yet-requested part of the url list.

    output is a dic containing three elements:
        hrefs: a list consisting of all urls to request
        index: index of the previous hrefs list denoting all pages satisfying certain conditions
        len: where to start requesting in the hrefs list
    n_release: number given to the next saved page; it is increased by one
        after every page that is saved during this round

    Returns a new dic with the same three keys: hrefs extended by the links
    discovered in this round, index extended by the positions of the pages
    saved in this round, and len set to the length of the hrefs list that
    was passed in. The lists inside ``output`` are left untouched.
    '''
    # work on copies so the caller's dic is not changed behind its back
    all_hrefs = output["hrefs"].copy()
    matched = list(output["index"])
    start = output["len"]

    # start requesting from len-th element in the href list; enumerate gives
    # the position directly (same as hrefs.index(href) for a list without
    # duplicates, which is what find_hrefs returns)
    for position, href in enumerate(output["hrefs"][start:], start):
        # extract and store hyperlinks
        # NOTE: find_hrefs and type_keyword_judge each request the page, so
        # every url is downloaded twice per round
        all_hrefs = find_hrefs(href, all_hrefs, headers, base_url, task)
        # if having found one, store its index in the hrefs list
        if type_keyword_judge(href, headers, url_type, type_xpath, keyword, keyword_xpath, task, n_release):
            # a reminder
            print(href, "saved as", task, "task", n_release, "th file")
            n_release += 1
            # url index
            matched.append(position)

    return {"hrefs": all_hrefs, "index": matched, "len": len(output["hrefs"])}

# Repeat the single-round crawl until enough matching pages have been found
def recursively_crawlers(base_url, headers, url_type, type_xpath, keyword, keyword_xpath, task, n_target = 10):
    '''
    Crawl outwards from ``base_url`` round by round until enough matching
    pages have been saved or there is nothing left to visit.

    base_url: start page; also the prefix used to rebuild links
    headers: HTTP headers
    url_type, type_xpath: page-type check handed to single_crawler
    keyword, keyword_xpath: keyword check handed to single_crawler
    task: UN or EU task
    n_target: crawling continues while the number of the next page to save
        is below this value (default 10). A round always finishes the whole
        batch of pending urls, so more pages than that may end up saved.

    Returns the dic produced by the last round (keys hrefs, index, len).
    Despite its name the function iterates; it does not call itself.
    '''
    n_release = 1
    output = {"hrefs": find_hrefs(base_url, [], headers, base_url, task),
              "index": [],
              "len": 0}
    # The second condition stops the loop once every known url has been
    # requested: with no pending urls a further round cannot change anything,
    # and looping on n_release alone would never terminate.
    while n_release < n_target and output["len"] < len(output["hrefs"]):
        output = single_crawler(output, base_url, headers, url_type, type_xpath, keyword, keyword_xpath, task, n_release)
        n_release = len(output["index"]) + 1
    return output

# Save html code as txt file
def save_page(selector, task, n_release):
    '''
    Write the parsed page to "<task number>_<n_release>.txt" in the current
    working directory.

    selector: parsed document, serialised with etree.tostring
    task: "UN" is written as 1 and "EU" as 2 in the file name; any other
        value is used as given
    n_release: number of press release, second part of the file name
    '''
    html_as_string = etree.tostring(selector, pretty_print = True, encoding = "utf-8").decode()
    if task == "UN":
        task_number = 1
    elif task == "EU":
        task_number = 2
    else:
        task_number = task
    file_name = str(task_number) + "_" + str(n_release) + ".txt"
    # The text was serialised as UTF-8, so write it back as UTF-8 instead of
    # the platform default codec, which may not be able to encode it.
    # The with-block closes the file; no explicit close() is needed.
    with open(file_name, 'w', encoding = "utf-8") as f:
        f.write(html_as_string)


In [10]:
# Scrape the UN press room
# Start page of the crawl; find_hrefs also rebuilds every kept link on it.
base_url = "https://press.un.org/en"

# HTTP headers sent with every request (a desktop Chrome user-agent string).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# A page is accepted when url_type is among the text nodes selected by type_xpath ...
url_type = "Press Release"
type_xpath = '//a[@href = "/en/press-release" and @hreflang = "en"]/text()'
# ... and keyword occurs as a whole word in the text selected by keyword_xpath
# (the page header plus the paragraphs of the body field).
keyword = "crisis"
keyword_xpath = '//h1[@class = "page-header"]/text() | //div[@class = "field field--name-body field--type-text-with-summary field--label-hidden field__item"]/p//text()'

# Runs the crawl; every accepted page is written to "1_<n>.txt" by save_page.
# output is the dic of the last crawling round (keys "hrefs", "index", "len").
output = recursively_crawlers(base_url, headers, url_type, type_xpath, keyword, keyword_xpath, task = "UN")

https://press.un.org/en/2023/sgsm21982.doc.htm saved as UN task 1 th file
https://press.un.org/en/2023/sgsm21980.doc.htm saved as UN task 2 th file
https://press.un.org/en/2023/sgsm21978.doc.htm saved as UN task 3 th file
https://press.un.org/en/2023/sgsm21947.doc.htm saved as UN task 4 th file
https://press.un.org/en/2023/dsgsm1874.doc.htm saved as UN task 5 th file
https://press.un.org/en/2023/sgsm21952.doc.htm saved as UN task 6 th file
https://press.un.org/en/2023/sgsm21876.doc.htm saved as UN task 7 th file
https://press.un.org/en/2023/sgsm21852.doc.htm saved as UN task 8 th file
https://press.un.org/en/2023/sgsm21806.doc.htm saved as UN task 9 th file
https://press.un.org/en/2023/dsgsm1848.doc.htm saved as UN task 10 th file
https://press.un.org/en/2023/sgsm21765.doc.htm saved as UN task 11 th file
https://press.un.org/en/2023/sgsm21767.doc.htm saved as UN task 12 th file
https://press.un.org/en/2023/sgsm21723.doc.htm saved as UN task 13 th file
https://press.un.org/en/2023/dsgsm

In [5]:
# Scrape the EU press room
# Prefix of the listing pages; find_hrefs also rebuilds every kept link on it.
# Note: this re-assigns base_url, headers, url_type, ... used by the UN cell.
base_url = "https://www.europarl.europa.eu/news/en/press-room"

# HTTP headers sent with every request (a desktop Chrome user-agent string).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# A page is accepted when url_type is among the text nodes selected by type_xpath ...
url_type = "Plenary session"
type_xpath = '//span[@class = "ep_name"]/text()'
# ... and keyword occurs as a whole word in the text selected by keyword_xpath.
keyword = "crisis"
keyword_xpath = '//*[@id="website-body"]/div[1]/div/div[2]/div/div/h1/div/span[1]/text() | //*[@id="website-body"]/div[2]/div/div[3]/div/div/div[1]/div/div/div/ul//span[@class = "ep_name"]/text() | //*[@id="website-body"]/div[2]/div/div[3]/div/div/div[2]/div/div/p/text() | //p[@class = "ep-wysiwig_paragraph"]/text()'

# The listing pages are requested explicitly (base_url/page/0, base_url/page/1, ...)
# instead of being discovered automatically by recursively_crawlers.
# n_articles is the number given to the next saved file ("2_<n>.txt").
n_articles = 1
# n_page is the listing page requested next.
n_page = 0
while n_articles < 10:
    # Seed one crawling round with the links found on the current listing page.
    output = {"hrefs": find_hrefs(base_url + "/page/" + str(n_page), [], headers, base_url, "EU"),
              "index": [],
              "len": 0}
    # Visit every seeded link once; output is overwritten on each iteration,
    # so after the loop it only describes the last listing page.
    output = single_crawler(output, base_url, headers, url_type, type_xpath, keyword, keyword_xpath, "EU", n_articles)
    # Advance the file counter by the number of pages saved in this round.
    n_articles += len(output["index"])
    n_page += 1
    # NOTE(review): the loop has no stop condition for running out of listing
    # pages; it relies on enough matching articles being found - confirm.

https://www.europarl.europa.eu/news/en/press-room/20230929IPR06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan saved as EU task 1 th file
https://www.europarl.europa.eu/news/en/press-room/20230929IPR06130/parliament-argues-for-a-top-up-to-multi-annual-budget-for-crisis-response saved as EU task 2 th file
https://www.europarl.europa.eu/news/en/press-room/20230911IPR04923/reduce-demand-and-protect-people-in-prostitution-say-meps saved as EU task 3 th file
https://www.europarl.europa.eu/news/en/press-room/20230911IPR04918/svietlana-tsikhanouskaya-to-meps-support-belarusians-european-aspirations saved as EU task 4 th file
https://www.europarl.europa.eu/news/en/press-room/20230911IPR04908/meps-vote-to-strengthen-eu-defence-industry-through-common-procurement saved as EU task 5 th file
https://www.europarl.europa.eu/news/en/press-room/20230707IPR02427/covid-19-parliament-adopts-roadmap-to-better-prepare-for-future-health-crises saved as EU task 6 th file
https://www.e