In [1]:
import time
import DoS_scraper
from DoS_scraper import get_allnews_urls
from DoS_scraper import parse_page

# Gather the statement urls for the requested year range, then report how
# many were collected.
DoS_urls = get_allnews_urls(
    begin_year=2018,
    end_year=2019,
    verbose=True,
)
print('%d urls for all news' % (len(DoS_urls),))


get briefing statement urls 2018 / 2019
1790 urls for all news


In [2]:
import re
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime


def now():
    """
    Format the current UTC time as a timestamp string.

    Returns
    -------
    Current time : str
        UTC time formatted as 'YYYY-mm-dd HH:MM:SS'
        eg: 2018-11-22 13:35:23
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # gmtime() with no argument gives the current time in UTC, not local time.
    current_utc = gmtime()
    return strftime(timestamp_format, current_utc)

def get_soup(url, headers=None, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree.

    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent
    timeout : float or tuple
        Seconds to wait for the server before requests gives up.
        Default is 30. Without a timeout a stalled connection would block
        the caller indefinitely.

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page, parsed with the 'lxml' parser
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # NOTE: the HTTP status code is not checked here; an error page is parsed
    # like any other response body.
    r = requests.get(url, headers=headers, timeout=timeout)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

# Raw strings: '\s' in a plain literal is an invalid escape sequence.
doublespace_pattern = re.compile(r'\s+')
lineseparator_pattern = re.compile(r'\n+')

def normalize_text(text):
    """
    Collapse redundant whitespace while keeping line breaks.

    Tabs and carriage returns are turned into spaces, the text is split on
    runs of newlines, whitespace runs inside each line are collapsed to a
    single space, and lines that end up empty are dropped.

    Arguments
    ---------
    text : str
        Raw text

    Returns
    -------
    text : str
        Normalized text whose lines are separated by a single \\n, with no
        leading or trailing whitespace
    """
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    # Normalize line by line. Applying doublespace_pattern to the whole text
    # would also replace the newlines (\s matches \n) and flatten everything
    # onto one line, discarding the line separators.
    lines = (
        doublespace_pattern.sub(' ', line).strip()
        for line in lineseparator_pattern.split(text)
    )
    return '\n'.join(line for line in lines if line)


In [7]:
import re
import time

def get_latest_allnews(last_date, sleep=1.0):
    """
    Placeholder; not implemented yet.

    Arguments
    ---------
    last_date : Date
    sleep : float
        Sleep time. Default 1.0 sec

    Raises
    ------
    NotImplementedError
        Always.
    """

    # NotImplemented is a comparison sentinel, not an exception; raising it
    # fails with TypeError. NotImplementedError is the exception class.
    raise NotImplementedError

# Raw string: '\w' in a plain literal is an invalid escape sequence.
# The compiled pattern text is unchanged.
patterns_transcript = [
    re.compile(r'https://www.state.gov/[\w]+')]
# Yearly index page; '{}' is filled with the year via str.format.
base_url = 'https://www.state.gov/r/pa/prs/ps/{}/index.htm'

def get_allnews_urls(begin_year=2018, end_year=2019, verbose=True):
    """
    Collect the urls linked from the yearly index pages.

    Arguments
    ---------
    begin_year : int
        First year to scrape. Default is 2018
    end_year : int
        Year to stop at, exclusive: the years in range(begin_year, end_year)
        are scraped. Default is 2019
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls. Every collected href is prefixed with
        'https://www.state.gov'
    """

    links_all = []
    for year in range(begin_year, end_year):
        url = base_url.format(year)
        soup = get_soup(url)
        sub_links = soup.find('div', class_='l-wrap')
        if sub_links is None:
            # The expected container is missing from this page; report it and
            # move on instead of failing with AttributeError on None.
            print('No link container found in {}'.format(url))
            continue
        for link in sub_links.find_all("a"):
            if 'href' in link.attrs:
                links_all.append(link.attrs['href'])
        if verbose:
            # Report the year that was just processed, not begin_year.
            print('get briefing statement urls {} / {}'.format(year, end_year))

    # presumably every href is site-relative (starts with '/'); verify
    # against the index page markup
    links_all = ['https://www.state.gov' + href for href in links_all]

    return links_all

def get_last_page_num():
    """
    Placeholder; not implemented yet.

    Returns
    -------
    page : int
        Last page number.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # NotImplemented is a comparison sentinel, not an exception; raising it
    # fails with TypeError. NotImplementedError is the exception class.
    raise NotImplementedError


In [8]:
import re
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime

def parse_page(url):
    """
    Fetch one page and pull out its title, date and body text.

    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict or None
        JSON format web page contents, or None when any exception is raised
        while fetching or extracting (the exception and the url are printed).
        It consists of
            title : text of the 'h2' element with class 'title left'
            time : text of the 'div' element with id 'date_long'
            content : text of the 'div' element with id 'centerblock',
                with line separator \\n
            url : web page url
            scrap_time : scraped time, taken from now()
    """

    try:
        page = get_soup(url)
        # Values are evaluated top to bottom, so now() runs only after all
        # three elements have been extracted.
        fields = {
            'title': page.find('h2', class_='title left').text,
            'time': page.find('div', id='date_long').text,
            'content': page.find('div', id='centerblock').text,
            'url': url,
            'scrap_time': now(),
        }
    except Exception as error:
        print(error)
        print('Parsing error from {}'.format(url))
        return None
    return fields


In [10]:
def pprint(json_object):
    """
    Print a short preview of every field in a parsed page.

    Each entry is printed as 'key : value ..', where the value is converted
    to str and cut to its first 100 characters. A blank separator is printed
    after the last entry.
    """
    preview_length = 100
    for field, value in json_object.items():
        preview = str(value)[:preview_length]
        print('{} : {} ..'.format(field, preview))
    print('\n')

SLEEP = 0.5  # seconds to pause between consecutive requests

for url in DoS_urls[:3]:
    json_object = parse_page(url)
    # parse_page returns None (after printing the error) when a page could
    # not be fetched or parsed; skip it instead of crashing in pprint.
    if json_object is not None:
        pprint(json_object)
    # Pause so the requests are not sent back to back.
    time.sleep(SLEEP)

title : Attack in Giza ..
time : December 28, 2018 ..
content : 
The United States strongly condemns the attack carried out today on a tourist bus in Giza. Our deep ..
url : https://www.state.gov/r/pa/prs/ps/2018/12/288285.htm ..
scrap_time : 2019-01-13 23:49:58 ..


title : Israel's Right to Self-Defense ..
time : December 28, 2018 ..
content : 
The United States fully supports Israel’s right to defend itself against Iranian regional actions t ..
url : https://www.state.gov/r/pa/prs/ps/2018/12/288282.htm ..
scrap_time : 2019-01-13 23:49:58 ..


title : Secretary Pompeo Travels to Brazil and Colombia To Strengthen Prosperity, Security, and Democracy ..
time : December 28, 2018 ..
content : 
Secretary Pompeo is leading a Presidential Delegation to Brazil for the inauguration of President-e ..
url : https://www.state.gov/r/pa/prs/ps/2018/12/288281.htm ..
scrap_time : 2019-01-13 23:49:58 ..


