# Selenium news scrape
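Let's try to collect news article data from an online database (ProQuest, accessed through the University of Michigan library proxy) by driving a browser with Selenium.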

In [1]:
import selenium

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Smoke test: confirm that Selenium can launch Firefox and load a page.
driver = webdriver.Firefox()
try:
    driver.get("http://www.python.org")
    assert "Python" in driver.title
finally:
    # Close this throwaway browser: `driver` is re-bound to a fresh session
    # later, and re-binding without quitting would leak a Firefox process.
    driver.quit()

In [4]:
# proquest_command_file = '/Users/istewart/Documents/tools/selenium/proquest_search.side'
import json
# Load the login credentials from a JSON file in the data directory.
user_cred_file = '../../data/umich_cred.json'
# A context manager closes the file handle as soon as the JSON is parsed.
with open(user_cred_file, 'r') as user_cred_handle:
    user_cred = json.load(user_cred_handle)

In [6]:
# Advanced-search page of the database, reached through the library proxy.
target_site = 'https://search-proquest-com.proxy.lib.umich.edu/advanced?accountid=14667'
# Start a new browser session and navigate to the search form.
driver = webdriver.Firefox()
driver.get(target_site)

In [10]:
# Search terms: the headline goes into the first query box and the
# publication name into the second.
article_title = "dr. fauci sees 'terribly painful' months ahead"
news_publisher_title = 'new york times'
query_inputs = (
    ('queryTermField', article_title),
    ('queryTermField_0', news_publisher_title),
)
for field_id, query_text in query_inputs:
    driver.find_element_by_id(field_id).send_keys(query_text)

In [15]:
## specify date
from selenium.webdriver.support.ui import Select


def _choose_option(element_id, visible_text):
    """Pick the option labelled `visible_text` in the <select> element with id `element_id`."""
    Select(driver.find_element_by_id(element_id)).select_by_visible_text(visible_text)


# date format: November 20 2020
date = 'November 20 2020'
date_month, date_day, date_year = date.split(' ')
# restrict the search to a single day, then fill in month / day / year
_choose_option('select_multiDateRange', 'On this date...')
_choose_option('month2', date_month)
_choose_option('day2', date_day)
driver.find_element_by_id('year2').send_keys(date_year)

In [16]:
## submit query!!
# Locate the search button and press it in one step.
driver.find_element_by_id('searchToResultPage').click()

In [26]:
## get first result with full text
from selenium.common.exceptions import NoSuchElementException

result_item_txt_link = None
for result_item in driver.find_elements_by_id('mlditem1'):
    try:
        result_item_txt_link = result_item.find_element_by_id('addFlashPageParameterformat_fulltext')
    except NoSuchElementException:
        # find_element_* raises (it never returns None) when nothing matches,
        # so a result without a full-text link has to be skipped explicitly.
        continue
    break
print(f'recovered link {result_item_txt_link}')
if result_item_txt_link is not None:
    # open the full-text view of the article
    result_item_txt_link.click()

recovered link <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="9a6eefe2-1037-cc49-b9a6-d29390d6f1fe", element="098580cc-dcd6-7f4a-8c73-90f6f2110224")>


In [56]:
## get page content
import re
import os
import pandas as pd

# article id: the path segment that follows "fulltext/" in the page URL
article_ID_matcher = re.compile('(?<=fulltext/)[0-9A-Za-z]+(?=/)')
article_ID_match = article_ID_matcher.search(driver.current_url)
if article_ID_match is None:
    # fail with a readable message instead of "'NoneType' has no attribute 'group'"
    raise ValueError(f'no article ID found in URL {driver.current_url}')
article_ID = article_ID_match.group(0)
# article title
result_title = driver.find_element_by_id('documentTitle')
result_title_txt = result_title.text
# article authors: one entry per link inside the author block
result_authors = driver.find_element_by_class_name('titleAuthorETC')
result_author_links = result_authors.find_elements_by_css_selector('a')
result_author_txt = [author_link.text for author_link in result_author_links]
# article text: all paragraphs of the full-text zone joined by single spaces
result_text_zone = driver.find_element_by_id('fullTextZone')
result_text_paragraphs = result_text_zone.find_elements_by_css_selector('p')
result_paragraph_text = ' '.join(paragraph.text for paragraph in result_text_paragraphs)
## combine, write to file
result_df = pd.DataFrame([article_ID, result_title_txt, result_author_txt, result_paragraph_text], index=['id', 'title', 'authors', 'text']).transpose()
out_dir = '../../data/NYT_scrape/'
# makedirs also creates missing parent directories and tolerates an existing one
os.makedirs(out_dir, exist_ok=True)
out_file = os.path.join(out_dir, f'{article_ID}_data.tsv')
result_df.to_csv(out_file, sep='\t', index=False)

### Run scraping on several articles
Now that we've gotten scraping "right", let's try to run it on some sample NYT articles.

In [49]:
from selenium.webdriver.support.ui import Select
import re
import pandas as pd
import os
import time
from datetime import datetime, timedelta
def _fill_query_field(driver, text_id, menu_id, text, field_code):
    """Type `text` into one query box and set its field dropdown to `field_code`."""
    text_field = driver.find_element_by_id(text_id)
    text_field.clear()
    text_field.send_keys(text)
    Select(driver.find_element_by_id(menu_id)).select_by_value(field_code)


def _fill_date(driver, id_suffix, date_value):
    """Enter `date_value` into the month/day/year controls whose ids end in `id_suffix`."""
    month_menu = Select(driver.find_element_by_id(f'month2{id_suffix}'))
    day_menu = Select(driver.find_element_by_id(f'day2{id_suffix}'))
    year_input = driver.find_element_by_id(f'year2{id_suffix}')
    month_menu.select_by_visible_text(date_value.strftime('%B'))
    # the day is selected without zero padding (e.g. '5', not '05')
    day_menu.select_by_visible_text(str(date_value.day))
    # clear first, like the text query boxes, so a leftover year is not appended to
    year_input.clear()
    year_input.send_keys(str(date_value.year))


def _find_fulltext_link(driver):
    """Return the full-text link of the first search result that has one, else None."""
    from selenium.common.exceptions import NoSuchElementException
    try:
        # the result list is absent when the search returned nothing
        driver.find_element_by_class_name('resultItems')
    except NoSuchElementException as e:
        print(f'error {e}')
        return None
    for result_item in driver.find_elements_by_id('mlditem1'):
        try:
            return result_item.find_element_by_id('addFlashPageParameterformat_fulltext')
        except NoSuchElementException:
            # this result has no full-text link; try the next one
            continue
    return None


def _extract_article_content(driver):
    """Return (id, title, authors, text) from the open article page, or None if any part is missing."""
    from selenium.common.exceptions import NoSuchElementException
    print(f'extracting ID from URL {driver.current_url}')
    # the article id is the path segment that follows "fulltext/" in the URL
    id_match = re.search('(?<=fulltext/)[0-9A-Za-z]+(?=/)', driver.current_url)
    if id_match is None:
        print(f'error: no article ID in URL {driver.current_url}')
        return None
    try:
        title_txt = driver.find_element_by_id('documentTitle').text
        author_block = driver.find_element_by_class_name('titleAuthorETC')
        author_txt = [link.text for link in author_block.find_elements_by_css_selector('a')]
        text_zone = driver.find_element_by_id('fullTextZone')
        paragraph_txt = ' '.join(p.text for p in text_zone.find_elements_by_css_selector('p'))
    except NoSuchElementException as e:
        print(f'error {e}')
        return None
    return id_match.group(0), title_txt, author_txt, paragraph_txt


def scrape_article(article_title, article_date, article_publisher,
                   target_site, driver,
                   RESULT_LOAD_TIME=15):
    """
    Search the target site for one article and scrape its full text.

    The search form is filled with the title (field code 'ti'), the
    publication (field code 'pub') and a date range that starts on the
    publication date and ends MAX_DATE_DAYS later. The first result that
    offers a full-text link is opened and scraped.

    :param article_title: headline to search for
    :param article_date: publication date formatted like "November 20 2020"
    :param article_publisher: publication name to search for
    :param target_site: URL of the advanced-search form
    :param driver: Selenium WebDriver used for all page interaction
    :param RESULT_LOAD_TIME: seconds to wait after submitting the search and
        after opening a result; the initial page load waits a random time
        between one third and two thirds of this value
    :returns: one-row DataFrame with columns id, title, authors, text; every
        value is None when no full-text result was found or the article page
        could not be parsed
    :raises ValueError: if article_date does not match the expected format
    """
    driver.get(target_site)
    # let site load; the random part keeps the timing from being perfectly regular
    site_load_time_const = RESULT_LOAD_TIME / 3.
    sleep(site_load_time_const + np.random.random() * site_load_time_const)
    ## specify title and publication
    _fill_query_field(driver, 'queryTermField', 'fieldsSelect', article_title, 'ti')
    _fill_query_field(driver, 'queryTermField_0', 'fieldsSelect_0', article_publisher, 'pub')
    ## specify date range: [publish date, publish date + MAX_DATE_DAYS]
    MAX_DATE_DAYS = 2
    start_date = datetime.strptime(article_date, '%B %d %Y')
    end_date = start_date + timedelta(days=MAX_DATE_DAYS)
    Select(driver.find_element_by_id('select_multiDateRange')).select_by_value('RANGE')
    _fill_date(driver, '', start_date)
    _fill_date(driver, '_0', end_date)
    ## submit query!!
    driver.find_element_by_id('searchToResultPage').click()
    sleep(RESULT_LOAD_TIME)
    ## get first result with full text; keep the all-None defaults on any failure
    article_content = (None, None, None, None)
    fulltext_link = _find_fulltext_link(driver)
    if fulltext_link is not None:
        fulltext_link.click()
        # wait to load
        sleep(RESULT_LOAD_TIME)
        article_content = _extract_article_content(driver) or article_content
    ## combine into a one-row frame
    result_df = pd.DataFrame(list(article_content), index=['id', 'title', 'authors', 'text']).transpose()
    return result_df
from time import sleep
def scrape_write_article(article_title, article_date, article_publisher,
                         original_article_id, target_site, driver, out_dir):
    """
    Scrape one article and write the result to a TSV file.

    :param article_title: headline to search for
    :param article_date: publication date string passed through to scrape_article
    :param article_publisher: publication name to search for
    :param original_article_id: ID used to name the output file
        (article_<original_article_id>.tsv)
    :param target_site: URL of the search form
    :param driver: Selenium WebDriver used for scraping
    :param out_dir: directory that receives the output file; created if missing
    """
    # create the directory before the slow scrape, so a missing folder
    # fails fast instead of discarding a finished result
    os.makedirs(out_dir, exist_ok=True)
    result_data = scrape_article(article_title, article_date, article_publisher, target_site, driver)
    out_file = os.path.join(out_dir, f'article_{original_article_id}.tsv')
    result_data.to_csv(out_file, sep='\t', index=False)
import numpy as np
def scrape_write_all_articles(article_data, article_publisher, target_site, driver, out_dir, SLEEP_TIME=15, verbose=True, LOGIN_TIME=30):
    """
    Scrape and write all articles to file. Sleep between scrapes.

    Articles whose output file (article_<articleID>.tsv in out_dir) already
    exists are skipped, so an interrupted run can be resumed.

    :param article_data: DataFrame with columns 'title', 'date', 'articleID'
    :param article_publisher: publication name used for every search
    :param target_site: URL of the search form
    :param driver: Selenium WebDriver used for scraping
    :param out_dir: directory that receives one TSV per article; created if missing
    :param SLEEP_TIME: base number of seconds to sleep after each scrape; a
        random extra of up to one third of this value is added
    :param verbose: print the ID of each article before it is scraped
    :param LOGIN_TIME: base number of seconds to wait after the first page
        load so the login can be completed; up to 10% extra is added at random
    """
    rand_sleep_time_scale = SLEEP_TIME / 3.
    os.makedirs(out_dir, exist_ok=True)
    # first thing: login
    driver.get(target_site)
    login_time_i = LOGIN_TIME + np.random.random() * (LOGIN_TIME / 10)
    sleep(login_time_i)
    for _, data_i in article_data.iterrows():
        article_title_i = data_i.loc['title']
        article_date_i = data_i.loc['date']
        article_id_i = data_i.loc['articleID']
        out_file = os.path.join(out_dir, f'article_{article_id_i}.tsv')
        # skip articles that were already mined in an earlier run
        if os.path.exists(out_file):
            continue
        if verbose:
            print(f'mining article {article_id_i}')
        scrape_write_article(article_title_i, article_date_i, article_publisher,
                             article_id_i, target_site, driver, out_dir)
        # randomized pause between consecutive searches
        sleep_time_i = SLEEP_TIME + np.random.random() * rand_sleep_time_scale
        sleep(sleep_time_i)

In [56]:
# load article metadata (ID, headline, publication time)
import pandas as pd
from datetime import datetime

date_fmt = '%Y-%m-%d %H:%M:%S'
clean_date_fmt = '%B %d %Y'

article_data = (
    pd.read_csv('../../data/nyt_comments/ArticlesApril2018.csv', sep=',', index_col=False, usecols=['articleID', 'headline', 'pubDate'])
    .rename(columns={'headline' : 'title'})
)
# drop articles without a usable headline
has_title = article_data.loc[:, 'title'] != 'Unknown'
article_data = article_data[has_title]
print('%d articles' % article_data.shape[0])
# simplify date: parse the timestamp, then format it as e.g. "April 24 2018"
parsed_dates = article_data.loc[:, 'pubDate'].apply(lambda x: datetime.strptime(x, date_fmt))
article_data = article_data.assign(date_time=parsed_dates)
formatted_dates = article_data.loc[:, 'date_time'].apply(lambda x: datetime.strftime(x, clean_date_fmt))
article_data = article_data.assign(date=formatted_dates)
# get sample to mine
sample_size = 1000
sample_article_data = article_data.head(sample_size)
display(sample_article_data.head())

1214 articles


Unnamed: 0,articleID,title,pubDate,date_time,date
0,5adf6684068401528a2aa69b,Former N.F.L. Cheerleaders’ Settlement Offer: ...,2018-04-24 17:16:49,2018-04-24 17:16:49,April 24 2018
1,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,2018-04-24 17:11:21,2018-04-24 17:11:21,April 24 2018
2,5adf4626068401528a2aa628,"The New Noma, Explained",2018-04-24 14:58:44,2018-04-24 14:58:44,April 24 2018
8,5adf2108068401528a2aa5b3,How a Bag of Texas Dirt Became a Times Tradition,2018-04-24 12:20:21,2018-04-24 12:20:21,April 24 2018
9,5adedaa8068401528a2aa4e6,Is School a Place for Self-Expression?,2018-04-24 11:21:04,2018-04-24 11:21:04,April 24 2018


In [None]:
# Run the scrape over the sampled articles in a dedicated browser session.
target_site = 'https://search-proquest-com.proxy.lib.umich.edu/advanced?accountid=14667'
article_publisher = 'new york times'
out_dir = '../../data/NYT_scrape/'
from selenium.webdriver import Firefox
driver = Firefox()
try:
    scrape_write_all_articles(sample_article_data, article_publisher, target_site, driver, out_dir)
finally:
    # always release the browser, whether the batch finished or failed part-way
    driver.quit()

How many of the sample articles were we able to recover? This will give us an (imperfect) estimate of the overall coverage.

In [55]:
import os
import pandas as pd
import numpy as np
out_dir = '../../data/NYT_scrape/'
# Only read the scraper's TSV output; os.listdir also returns stray files
# (e.g. OS metadata files) that are not article data. Sorting keeps the
# row order stable between runs.
article_text_files = sorted(
    os.path.join(out_dir, file_name)
    for file_name in os.listdir(out_dir)
    if file_name.endswith('.tsv')
)
article_text_data = pd.concat(
    [pd.read_csv(article_text_file, sep='\t', index_col=False) for article_text_file in article_text_files],
    axis=0,
)
# an article counts as recovered when the scraper stored an ID for it
valid_article_text_data = article_text_data[article_text_data.loc[:, 'id'].notna()]
print(f'{valid_article_text_data.shape[0]}/{article_text_data.shape[0]} valid articles')

437/504 valid articles


OK! So we get ~87% recall (437/504), which is impressive considering how often news headlines change.