# Web Scraper Tool for 5 US Media Outlets

In [48]:
import requests
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [49]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [50]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [51]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [52]:
# setup
number_of_articles = min(len(breitbart_tags), 30)

breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

In [53]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [54]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [55]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Breitbart,2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Police: Andrew Gillum Involved in Suspected Cr...,Failed Florida gubernatorial candidate Andrew ...
1,Breitbart,2020-03-13,https://www.breitbart.com/clips/2020/03/13/dr-...,Dr. Fauci on Coronavirus: ‘It Absolutely Came ...,"During a Friday interview on “Fox & Friends,” ..."
2,Breitbart,2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Job Creators Network Runs ‘Small Business Is T...,Job Creators Network (JCN) ran a full-page ad ...
3,Breitbart,2020-03-13,https://www.breitbart.com/economy/2020/03/13/c...,Coronavirus: Mnuchin Floats Pause on Student L...,"Appearing Friday on CNBC, Treasury Secretary S..."
4,Breitbart,2020-03-13,https://www.breitbart.com/latin-america/2020/0...,Update: Brazil’s Jair Bolsonaro Tests Negative...,Update: Brazilian President Jair Bolsonaro say...


In [56]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')
num_old = len(old_breitbart_data)

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)
num_now = len(breitbart_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 150
total number of entries in new data: 180


### 2. Fox - Conservative

In [57]:
# load the HTML content using requests and save into a variable
fox_requests = requests.get('https://www.foxnews.com/politics')
fox_homepage = fox_requests.content

In [58]:
# create a soup to allow BeautifulSoup to work
fox_soup = BeautifulSoup(fox_homepage, 'html.parser')

In [59]:
# locate article links
fox_tags = fox_soup.find_all('article')

In [60]:
# setup
fox_links = []
fox_text = []
fox_titles = []
fox_dates = []

In [61]:
number_of_articles = 30

# get homepage article links
for n in np.arange(0, number_of_articles):
    link = fox_tags[n].find('a')
    link = link.get('href')
    link = "https://foxnews.com" + link
    fox_links.append(link)
    fox_links = [x for x in fox_links if "/v/" not in x]

In [62]:
# prep for article content
for link in fox_links:
    fox_article_request = requests.get(link)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')
    
    # get article metadata
    fox_metadata = fox_article_soup.find_all('script')[2].get_text()
    fox_metadata = fox_metadata.split(",")
    
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            item = item.replace('\n',"")
            item = item.replace('headline', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_titles.append(item)
        
        # get article date
        elif 'datePublished' in item:
            item = item.replace('\n',"")
            item = item.replace('datePublished', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_dates.append(item)
    
    # get article text
    body = fox_article_soup.find_all('div')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        
        # removing copyright info and newsletter junk from the article
        final_article = " ".join(list_paragraphs)
        final_article = final_article.replace("This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("This material may not be published, broadcast, rewritten,", " ")
        final_article = final_article.replace("or redistributed. ©2020 FOX News Network, LLC. All rights reserved.", " ")
        final_article = final_article.replace("All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!", " ")
    fox_text.append(final_article)

In [63]:
# join fox data
fox_data = pd.DataFrame.from_dict({
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text 
})

fox_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Fox,2020-03-13T062329-0400,https://foxnews.com/politics/pelosi-telegraphs...,Pelosi telegraphs third coronavirus f...,Reaction and analysi...
1,Fox,2020-03-13T082906-0400,https://foxnews.com/politics/dem-rep-katie-por...,Dem Rep. Katie Porter,Rep. Michael C. Burg...
2,Fox,2020-03-13T083949-0400,https://foxnews.com/politics/gop-senator-subpo...,GOP senator plans to subpoena consult...,GOP senators ramp up...
3,Fox,2020-03-13T094443-0400,https://foxnews.com/politics/joe-bidens-corona...,Joe Biden's coronavirus plan What's i...,Democratic president...
4,Fox,2020-03-13T110950-0400,https://foxnews.com/politics/trump-to-brief-pr...,Trump to hold press conference on cor...,President Trump says...


In [64]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')
num_old = len(old_fox_data)

# append new data
fox_data = old_fox_data.append(fox_data).drop_duplicates()

# save new .csv
fox_data.to_csv("data/fox_data.csv", index = False)
num_now = len(fox_data)

In [65]:
# see number of articles
print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))
print("difference: {}".format(num_now-num_old))

number of entries in old data: 67
total number of entries in new data: 81
difference: 14


### 3. Associated Press - Neutral

In [66]:
# load the HTML content using requests and save into a variable
ap_requests = requests.get('https://apnews.com/apf-politics')
ap_homepage = ap_requests.content

In [67]:
# create a soup to allow BeautifulSoup to work
ap_soup = BeautifulSoup(ap_homepage, 'html.parser')

In [68]:
# locate articles
ap_tags = ap_soup.find_all('a', class_="Component-headline-0-2-105")

In [69]:
# setup
number_of_articles = min(len(ap_tags), 30)

ap_links = []
ap_text = []
ap_titles = []
ap_dates = []

In [70]:
# get homepage article links
for link in ap_tags:
    link = link.get('href')
    link = "https://apnews.com" + link
    ap_links.append(link)

In [71]:
# prep for article content
for link in ap_links:
    ap_article_request = requests.get(link)
    ap_article = ap_article_request.content
    ap_article_soup = BeautifulSoup(ap_article, 'html.parser')
    
    # article titles
    title = ap_article_soup.find_all('meta')[14]
    title = title['content']
    ap_titles.append(title)
    
    # article date
    date = ap_article_soup.find_all('meta')[24]
    date = date['content']
    ap_dates.append(date)
    
    # article content: <div class="Article" data-key=Article.
    body = ap_article_soup.find_all('div')
    x = body[0].find_all('p')

    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        paragraph = paragraph.replace('CHICAGO (AP) -',"")
        paragraph = paragraph.replace('DETROIT (AP) -',"")
        paragraph = paragraph.replace('WASHINGTON (AP) -',"")
        paragraph = paragraph.replace('___ Catch up on the 2020 election campaign with AP experts on our weekly politics podcast, “Ground Game.',"")
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
    ap_text.append(final_article)

In [72]:
# join ap data
ap_data = pd.DataFrame.from_dict({
    'publisher': 'AP',
    'date': ap_dates,
    'link': ap_links,
    'article_title': ap_titles,
    'article_text': ap_text 
})

ap_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,AP,2020-03-13T12:37:22Z,https://apnews.com/83b0c8e168548fd453b0c177dd1...,"Pelosi, White House near agreement on coronavi...",WASHINGTON (AP) — House Speaker Nancy Pelosi a...
1,AP,2020-03-13T15:22:24Z,https://apnews.com/663e745a80358c042786d2a5624...,Trump to hold news conference as he seeks to c...,WASHINGTON (AP) — President Donald Trump will ...
2,AP,2020-03-13T13:13:09Z,https://apnews.com/d7cb8610b50a2e79ce1a72929f2...,Trump administration: $1.3M for fast virus tes...,President Donald Trump’s administration announ...
3,AP,2020-03-13T12:00:17Z,https://apnews.com/0c26235718a02157e5831933f57...,Chairwoman of Congressional Black Caucus endor...,The chairwoman of the Congressional Black Cauc...
4,AP,2020-03-13T04:24:56Z,https://apnews.com/9d7d4614b0a33fb63e68ecb666b...,Florida could be knockout punch for Sanders’ 2...,MIAMI (AP) — Florida has never been known as a...


In [73]:
# read in old data
old_ap_data = pd.read_csv('data/ap_data.csv')
num_old = len(old_ap_data)

# append new data
ap_data = old_ap_data.append(ap_data).drop_duplicates()

# save new .csv
ap_data.to_csv("data/ap_data.csv", index = False)
num_now = len(ap_data)

In [74]:
# see number of articles
print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))
print("difference: {}".format(num_now-num_old))

number of entries in old data: 115
total number of entries in new data: 135
difference: 20


### 4. New York Times - Liberal

In [75]:
# load the HTML content using requests and save into a variable
nyt_request = requests.get('https://www.nytimes.com/section/politics')
nyt_homepage = nyt_request.content

In [76]:
# create soup 
nyt_soup = BeautifulSoup(nyt_homepage, 'html.parser')

In [77]:
# homepage URLs
nyt_tags_home = nyt_soup.find_all('h2', class_="css-l2vidh e4e4i5l1")

# archive URLs
nyt_tags_archive = nyt_soup.find_all('div', class_='css-1l4spti')

In [78]:
# setup 
nyt_links = []
nyt_titles = []
nyt_dates = []
nyt_contents = []

In [79]:
# homepage articles
for n in np.arange(0, len(nyt_tags_home)):

    # get article link
    link = nyt_tags_home[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_home[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', {'class':['css-53u6y8', 'css-1fanzo5']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [80]:
# archive articles
for n in np.arange(0, len(nyt_tags_archive)):

    # get article link
    link = nyt_tags_archive[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_archive[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
        
    # get article content
    body = soup_article.find_all('div', attrs = {'class':['css-53u6y8', 'css-1fanzo5 StoryBodyCompanionColumn']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [81]:
# assembling data
nyt_data = pd.DataFrame.from_dict({
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents 
})

In [82]:
# make sure it looks nice
nyt_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,new_york_times,2020-03-12,https://www.nytimes.com/2020/03/12/business/ec...,Congress Nears Stimulus Deal With White House ...,WASHINGTON — Financial markets plunged on Thur...
1,new_york_times,2020-03-12,https://www.nytimes.com/2020/03/12/us/politics...,The President as Bystander: Trump Struggles to...,WASHINGTON — As he confronts the most serious ...
2,new_york_times,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,How 4 Big States Are Preparing to Vote as the ...,Elections officials in the next four Democrati...
3,new_york_times,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,Afraid of Coronavirus? That Might Say Somethin...,"Welcome to Poll Watch, our weekly look at poll..."
4,new_york_times,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,Trump Administration Moves to Speed Coronaviru...,WASHINGTON — The Trump administration moved on...


In [83]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')
num_old = len(old_nyt_data)

# append new data
nyt_data = old_nyt_data.append(nyt_data).drop_duplicates()

# save new .csv
nyt_data.to_csv("data/nyt_data.csv", index = False)
num_now = len(nyt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 41
total number of entries in new data: 55


### 5. Buzzfeed - Very Liberal

In [84]:
# load the HTML content using requests and save into a variable
buzz_request = requests.get('https://www.buzzfeednews.com/section/politics')
buzz_homepage = buzz_request.content

In [85]:
# create soup 
buzz_soup = BeautifulSoup(buzz_homepage, 'html.parser')

In [86]:
# locate article URLs
buzz_tags = buzz_soup.find_all('h2')

In [87]:
# setup
number_of_articles = min(len(buzz_tags), 30)

# get article titles, content, and links
buzz_links = []
buzz_titles = []
buzz_dates = []
buzz_contents = []

In [88]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = buzz_tags[n].find('a')['href']
    buzz_links.append(link)
    
    # get article title
    title = buzz_tags[n].find('a').get_text()
    buzz_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.find_all('div', class_="news-article-header__timestamps")    
    date = " ".join([item.text for item in date]).replace('\n', '')
    buzz_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', attrs={'data-module':'subbuzz-text'})
    article = " ".join([item.text for item in body]).replace('\n', '')
    final_article = re.sub(r' {[^}]*}', '', article)
        
    buzz_contents.append(final_article)

In [89]:
# assembling data
buzz_data = pd.DataFrame.from_dict({
    'publisher': 'buzzfeed',
    'date': buzz_dates,
    'link': buzz_links,
    'article_title': buzz_titles,
    'article_text': buzz_contents 
})

In [90]:
buzz_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,buzzfeed,"Posted on March 12, 2020, at 6:21 ...",https://www.buzzfeednews.com/article/miriameld...,Coronavirus Is The Nightmare Situation People ...,Nobody knows what to do. Is it ethical to...
1,buzzfeed,"Posted on March 12, 2020, at 6:12 ...",https://www.buzzfeednews.com/article/mollyhens...,Katie Porter Got The CDC Director To Promise F...,California Rep. Katie Porter exacted a co...
2,buzzfeed,"Posted on March 12, 2020, at 5:00 ...",https://www.buzzfeednews.com/article/ryancbroo...,Bernie Sanders Pitched Medicare For All As The...,Bernie Sanders turned a press statement o...
3,buzzfeed,"Posted on March 12, 2020, at 4:46 ...",https://www.buzzfeednews.com/article/paulmcleo...,The Trump Administration Will Move Ahead With ...,WASHINGTON — The Trump administration is ...
4,buzzfeed,"Posted on March 12, 2020, at 2:51 ...",https://www.buzzfeednews.com/article/kadiagoba...,Members Of Congress Are Furious At The Lack Of...,WASHINGTON — A day after the World Health...


In [91]:
# read in old data
old_buzz_data = pd.read_csv('data/buzzfeed_data.csv')
num_old = len(old_buzz_data)

# append new data
buzz_data = old_buzz_data.append(buzz_data).drop_duplicates()

# save new .csv
buzz_data.to_csv("data/buzzfeed_data.csv", index = False)
num_now = len(buzz_data)

#print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

total number of entries in new data: 54
