# Web Scraper Tool for 5 US Media Outlets

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [252]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [253]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [254]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [255]:
# setup
number_of_articles = min(len(breitbart_tags), 30)

breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

In [256]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [257]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [258]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,A report from the Swedish Defence Research Ins...,Report: Sweden May Be Most ‘Incel’ Country in ...,2020-03-08,https://www.breitbart.com/europe/2020/03/08/re...,Breitbart
1,The American Conservative Union (ACU) announce...,ACU: CPAC Attendee Tested Positive for Coronav...,2020-03-07,https://www.breitbart.com/politics/2020/03/07/...,Breitbart
2,Former Illinois Gov. Rod Blagojevich ripped in...,Exclusive — Rod Blagojevich Explains Why He Is...,2020-03-07,https://www.breitbart.com/politics/2020/03/07/...,Breitbart
3,Politico continued the media’s effort to trash...,Pollak: Politico Continues Media’s Quest to Bl...,2020-03-07,https://www.breitbart.com/the-media/2020/03/07...,Breitbart
4,"On Friday’s “PBS NewsHour,” columnist Mark Shi...",Shields: Nobody Understands ‘What a Biden Pres...,2020-03-07,https://www.breitbart.com/clips/2020/03/07/shi...,Breitbart


In [259]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')
num_old = len(old_breitbart_data)

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)
num_now = len(breitbart_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 14
total number of entries in new data: 30


### 2. Fox - Conservative

In [454]:
# load the HTML content using requests and save into a variable
fox_requests = requests.get('https://www.foxnews.com/politics')
fox_homepage = fox_requests.content

In [455]:
# create a soup to allow BeautifulSoup to work
fox_soup = BeautifulSoup(fox_homepage, 'html.parser')

In [456]:
# locate article links
fox_tags = fox_soup.find_all('article')

In [457]:
# setup
fox_links = []
fox_text = []
fox_titles = []
fox_dates = []

In [458]:
number_of_articles = 30

# get homepage article links
for n in np.arange(0, number_of_articles):
    link = fox_tags[n].find('a')
    link = link.get('href')
    link = "https://foxnews.com" + link
    fox_links.append(link)
    fox_links = [x for x in fox_links if "/v/" not in x]

In [459]:
# prep for article content
for link in fox_links:
    fox_article_request = requests.get(link)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')
    
    # get article metadata
    fox_metadata = fox_article_soup.find_all('script')[2].get_text()
    fox_metadata = fox_metadata.split(",")
    
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            item = item.replace('\n',"")
            item = item.replace('headline', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_titles.append(item)
        
        # get article date
        elif 'datePublished' in item:
            item = item.replace('\n',"")
            item = item.replace('datePublished', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_dates.append(item)
    
    # get article text
    body = fox_article_soup.find_all('div')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        
        # removing copyright info and newsletter junk from the article
        final_article = " ".join(list_paragraphs)
        final_article = final_article.replace("This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("This material may not be published, broadcast, rewritten,", " ")
        final_article = final_article.replace("or redistributed. ©2020 FOX News Network, LLC. All rights reserved.", " ")
        final_article = final_article.replace("All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!", " ")
    fox_text.append(final_article)

In [460]:
# join fox data
fox_data = pd.DataFrame.from_dict({
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text 
})

fox_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Fox,2020-03-09T162213-0400,https://foxnews.com/politics/bernie-sanders-fo...,Bernie Sanders,Democratic president...
1,Fox,2020-03-09T155153-0400,https://foxnews.com/politics/gop-rep-doug-coro...,Two GOP reps self-quarantine after co...,'Dr. Oz Show' host D...
2,Fox,2020-03-09T190439-0400,https://foxnews.com/politics/trump-congress-pa...,Trump to pitch Congress on payroll ta...,Dr. Mehmet Oz descri...
3,Fox,2020-03-10T044655-0400,https://foxnews.com/politics/us-begins-withdra...,US begins withdrawing troops from Afg...,The United States b...
4,Fox,2020-03-09T231941-0400,https://foxnews.com/politics/sanders-fights-fo...,Sanders fights for another rust belt ...,Democratic president...


In [461]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')

# append new data
fox_data = old_fox_data.append(fox_data).drop_duplicates()

# save new .csv
fox_data.to_csv("data/fox_data.csv", index = False)

In [462]:
# see number of articles
print(len(fox_data))

34


### 3. Associated Press - Neutral

In [445]:
# load the HTML content using requests and save into a variable
ap_requests = requests.get('https://apnews.com/apf-politics')
ap_homepage = ap_requests.content

In [446]:
# create a soup to allow BeautifulSoup to work
ap_soup = BeautifulSoup(ap_homepage, 'html.parser')

In [447]:
# locate articles
ap_tags = ap_soup.find_all('a', class_="Component-headline-0-2-105")

In [448]:
# setup
number_of_articles = min(len(ap_tags), 30)

ap_links = []
ap_text = []
ap_titles = []
ap_dates = []

In [449]:
# get homepage article links
for link in ap_tags:
    link = link.get('href')
    link = "https://apnews.com" + link
    ap_links.append(link)

In [450]:
# prep for article content
for link in ap_links:
    ap_article_request = requests.get(link)
    ap_article = ap_article_request.content
    ap_article_soup = BeautifulSoup(ap_article, 'html.parser')
    
    # article titles
    title = ap_article_soup.find_all('meta')[14]
    title = title['content']
    ap_titles.append(title)
    
    # article date
    date = ap_article_soup.find_all('meta')[24]
    date = date['content']
    ap_dates.append(date)
    
    # article content: <div class="Article" data-key=Article.
    body = ap_article_soup.find_all('div')
    x = body[0].find_all('p')

    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
    ap_text.append(final_article)

In [451]:
# join ap data
ap_data = pd.DataFrame.from_dict({
    'publisher': 'AP',
    'date': ap_dates,
    'link': ap_links,
    'article_title': ap_titles,
    'article_text': ap_text 
})

ap_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,AP,2020-03-10T04:15:12Z,https://apnews.com/8d90a37e5e5b177d08424fd14a2...,'Odd' quirk raises delegate stakes in Tuesday'...,WASHINGTON (AP) — A quirk in how delegates are...
1,AP,2020-03-10T04:35:05Z,https://apnews.com/5b4fa2256d232ca24b2f93fff31...,Michigan primary could make or break Sanders' ...,DETROIT (AP) — Bernie Sanders proved his 2016 ...
2,AP,2020-03-10T00:05:20Z,https://apnews.com/a5e4261f40050e07ad3b6b9eb04...,Trump talks down virus as his properties face ...,NEW YORK (AP) — One of President Donald Trump’...
3,AP,2020-03-09T16:09:06Z,https://apnews.com/2e8a815a031e8da37075feec466...,"US begins troop withdrawal from Afghanistan, o...",WASHINGTON (AP) — American troops have begun l...
4,AP,2020-03-10T04:41:42Z,https://apnews.com/896611803373610849fe8602b64...,Trump plans payroll tax relief in response to ...,WASHINGTON (AP) — President Donald Trump says ...


In [453]:
# read in old data
old_ap_data = pd.read_csv('data/ap_data.csv')

# append new data
ap_data = old_ap_data.append(ap_data).drop_duplicates()

# save new .csv
ap_data.to_csv("data/ap_data.csv", index = False)

In [463]:
# see number of articles
print(len(ap_data))

48


### 4. New York Times - Liberal

In [266]:
# load the HTML content using requests and save into a variable
nyt_request = requests.get('https://www.nytimes.com/section/politics')
nyt_homepage = nyt_request.content

In [267]:
# create soup 
nyt_soup = BeautifulSoup(nyt_homepage, 'html.parser')

In [356]:
# homepage URLs
nyt_tags_home = nyt_soup.find_all('h2', class_="css-l2vidh e4e4i5l1")

# archive URLs
nyt_tags_archive = nyt_soup.find_all('div', class_='css-1l4spti')

In [104]:
# setup 
nyt_links = []
nyt_titles = []
nyt_dates = []
nyt_contents = []

In [105]:
# homepage articles
for n in np.arange(0, len(nyt_tags_home)):

    # get article link
    link = nyt_tags_home[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_home[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', {'class':['css-53u6y8', 'css-1fanzo5']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [106]:
# archive articles
for n in np.arange(0, len(nyt_tags_archive)):

    # get article link
    link = nyt_tags_archive[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_archive[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
        
    # get article content
    body = soup_article.find_all('div', attrs = {'class':['css-53u6y8', 'css-1fanzo5 StoryBodyCompanionColumn']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [107]:
# assembling data
nyt_data = pd.DataFrame.from_dict({
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents 
})

In [108]:
# make sure it looks nice
nyt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,"WEST PALM BEACH, Fla. — President Trump on Fri...","Trump Names Mark Meadows Chief of Staff, Ousti...",2020-03-07,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
1,President Trump claimed again on Friday that a...,"With Test Kits in Short Supply, Health Officia...",2020-03-07,https://www.nytimes.com/2020/03/06/health/test...,new_york_times
2,Bernie Sanders was several takes into a video ...,The Bernie Sanders Personality Test,2020-03-06,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
3,Joseph R. Biden Jr.’s campaign organization in...,Joe Biden Has Had Flimsy Organization. It Hasn...,2020-03-06,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
4,"FLINT, Mich. — Cornel West pleaded with his “o...",Sanders Is Behind With Black Voters. He Didn’t...,2020-03-08,https://www.nytimes.com/2020/03/08/us/politics...,new_york_times


In [110]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')
num_old = len(old_nyt_data)

# append new data
nyt_data = old_nyt_data.append(nyt_data).drop_duplicates()

# save new .csv
nyt_data.to_csv("data/nyt_data.csv", index = False)
num_now = len(nyt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 14
total number of entries in new data: 14


### 5. Buzzfeed - Very Liberal

In [3]:
# load the HTML content using requests and save into a variable
buzz_request = requests.get('https://www.buzzfeednews.com/section/politics')
buzz_homepage = buzz_request.content

In [4]:
# create soup 
buzz_soup = BeautifulSoup(buzz_homepage, 'html.parser')

In [5]:
# locate article URLs
buzz_tags = buzz_soup.find_all('h2')

In [88]:
# setup
number_of_articles = min(len(buzz_tags), 30)

# get article titles, content, and links
buzz_links = []
buzz_titles = []
buzz_dates = []
buzz_contents = []

In [89]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = buzz_tags[n].find('a')['href']
    buzz_links.append(link)
    
    # get article title
    title = buzz_tags[n].find('a').get_text()
    buzz_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.find_all('div', class_="news-article-header__timestamps")    
    date = " ".join([item.text for item in date]).replace('\n', '')
    buzz_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', attrs={'data-module':'subbuzz-text'})
    article = " ".join([item.text for item in body]).replace('\n', '')
    final_article = re.sub(r' {[^}]*}', '', article)
        
    buzz_contents.append(final_article)

In [90]:
# assembling data
buzz_data = pd.DataFrame.from_dict({
    'publisher': 'buzzfeed',
    'date': buzz_dates,
    'link': buzz_links,
    'article_title': buzz_titles,
    'article_text': buzz_contents 
})

In [94]:
buzz_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,"FLINT, Mich. — It was a rare move for the...",Bernie Sanders Scrapped A Planned Speech On Ra...,"Posted on March 7, 2020, at 10:18 ...",https://www.buzzfeednews.com/article/rubycrame...,buzzfeed
1,"NEW DELHI — A tweet by Tulsi Gabbard, the...",Tulsi Gabbard's Tweet About Anti-Hinduism In T...,"Posted on March 6, 2020, at 12:11 ...",https://www.buzzfeednews.com/article/nishitajh...,buzzfeed
2,The image began to circulate again on Thu...,What Happened To The Women Of 2020? Fears Of “...,"Posted on March 5, 2020, at 6:35 p...",https://www.buzzfeednews.com/article/mollyhens...,buzzfeed
3,WASHINGTON — Congress will send President...,Congress Quickly Passed An $8.3 Billion Spendi...,"Posted on March 5, 2020, at 2:23 p...",https://www.buzzfeednews.com/article/kadiagoba...,buzzfeed
4,Three weeks after ending his presidential...,Andrew Yang Is Launching A Political Group To ...,"Posted on March 5, 2020, at 11:11 ...",https://www.buzzfeednews.com/article/rubycrame...,buzzfeed


In [95]:
# read in old data
old_buzz_data = pd.read_csv('data/buzzfeed_data.csv')
num_old = len(old_buzz_data)

# append new data
buzz_data = old_buzz_data.append(buzz_data).drop_duplicates()

# save new .csv
buzz_data.to_csv("data/buzzfeed_data.csv", index = False)
num_now = len(buzz_data)

#print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

total number of entries in new data: 30
