# Web Scraper Tool for 5 US Media Outlets

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [20]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [21]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [22]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [23]:
# setup
number_of_articles = min(len(breitbart_tags), 30)

breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

In [24]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [25]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [26]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Breitbart,2020-03-12,https://www.breitbart.com/middle-east/2020/03/...,Israel Slams Russian FM for Meeting with Islam...,TEL AVIV – Russia’s top diplomat met with the ...
1,Breitbart,2020-03-12,https://www.breitbart.com/politics/2020/03/12/...,Joe Biden to Fight Coronavirus by Rejoining Pa...,Former Vice President Joe Biden proposed his p...
2,Breitbart,2020-03-12,https://www.breitbart.com/politics/2020/03/12/...,NY Gov. Andrew Cuomo Bans Gatherings of over 5...,New York Gov. Andrew Cuomo (D) on Thursday ann...
3,Breitbart,2020-03-12,https://www.breitbart.com/politics/2020/03/12/...,"Tammy Duckworth, Doug Jones Praise Trump’s E.U...",A pair of Democrat senators offered rare prais...
4,Breitbart,2020-03-12,https://www.breitbart.com/politics/2020/03/12/...,New York City Declares State of Emergency Over...,New York City Mayor Bill de Blasio (D) on Thur...


In [27]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')
num_old = len(old_breitbart_data)

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)
num_now = len(breitbart_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 120
total number of entries in new data: 150


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


### 2. Fox - Conservative

In [2]:
# load the HTML content using requests and save into a variable
fox_requests = requests.get('https://www.foxnews.com/politics')
fox_homepage = fox_requests.content

In [3]:
# create a soup to allow BeautifulSoup to work
fox_soup = BeautifulSoup(fox_homepage, 'html.parser')

In [4]:
# locate article links
fox_tags = fox_soup.find_all('article')

In [5]:
# setup
fox_links = []
fox_text = []
fox_titles = []
fox_dates = []

In [6]:
number_of_articles = 30

# get homepage article links
for n in np.arange(0, number_of_articles):
    link = fox_tags[n].find('a')
    link = link.get('href')
    link = "https://foxnews.com" + link
    fox_links.append(link)
    fox_links = [x for x in fox_links if "/v/" not in x]

In [7]:
# prep for article content
for link in fox_links:
    fox_article_request = requests.get(link)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')
    
    # get article metadata
    fox_metadata = fox_article_soup.find_all('script')[2].get_text()
    fox_metadata = fox_metadata.split(",")
    
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            item = item.replace('\n',"")
            item = item.replace('headline', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_titles.append(item)
        
        # get article date
        elif 'datePublished' in item:
            item = item.replace('\n',"")
            item = item.replace('datePublished', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_dates.append(item)
    
    # get article text
    body = fox_article_soup.find_all('div')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        
        # removing copyright info and newsletter junk from the article
        final_article = " ".join(list_paragraphs)
        final_article = final_article.replace("This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("This material may not be published, broadcast, rewritten,", " ")
        final_article = final_article.replace("or redistributed. ©2020 FOX News Network, LLC. All rights reserved.", " ")
        final_article = final_article.replace("All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!", " ")
    fox_text.append(final_article)

In [8]:
# join fox data
fox_data = pd.DataFrame.from_dict({
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text 
})

fox_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Fox,2020-03-12T121511-0400,https://foxnews.com/politics/trump-shrugs-off-...,Trump shrugs off EU anger over corona...,Dr. Janette Nesheiwa...
1,Fox,2020-03-12T135537-0400,https://foxnews.com/politics/biden-campaign-ma...,Biden campaign shakeup Veteran of Beto,Joe Biden lays out h...
2,Fox,2020-03-12T101059-0400,https://foxnews.com/politics/coronavirus-impac...,Coronavirus impact on federal governm...,World Health Organiz...
3,Fox,2020-03-12T150522-0400,https://foxnews.com/politics/supreme-court-clo...,Supreme Court closes to the public am...,The coronavirs and i...
4,Fox,2020-03-12T093115-0400,https://foxnews.com/politics/march-17-primarie...,March 17 primaries Here are the state...,Fox News contributor...


In [9]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')
num_old = len(old_fox_data)

# append new data
fox_data = old_fox_data.append(fox_data).drop_duplicates()

# save new .csv
fox_data.to_csv("data/fox_data.csv", index = False)
num_now = len(fox_data)

In [10]:
# see number of articles
print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))
print("difference: {}".format(num_now-num_old))

number of entries in old data: 50
total number of entries in new data: 67
difference: 17


### 3. Associated Press - Neutral

In [11]:
# load the HTML content using requests and save into a variable
ap_requests = requests.get('https://apnews.com/apf-politics')
ap_homepage = ap_requests.content

In [12]:
# create a soup to allow BeautifulSoup to work
ap_soup = BeautifulSoup(ap_homepage, 'html.parser')

In [13]:
# locate articles
ap_tags = ap_soup.find_all('a', class_="Component-headline-0-2-105")

In [14]:
# setup
number_of_articles = min(len(ap_tags), 30)

ap_links = []
ap_text = []
ap_titles = []
ap_dates = []

In [15]:
# get homepage article links
for link in ap_tags:
    link = link.get('href')
    link = "https://apnews.com" + link
    ap_links.append(link)

In [16]:
# prep for article content
for link in ap_links:
    ap_article_request = requests.get(link)
    ap_article = ap_article_request.content
    ap_article_soup = BeautifulSoup(ap_article, 'html.parser')
    
    # article titles
    title = ap_article_soup.find_all('meta')[14]
    title = title['content']
    ap_titles.append(title)
    
    # article date
    date = ap_article_soup.find_all('meta')[24]
    date = date['content']
    ap_dates.append(date)
    
    # article content: <div class="Article" data-key=Article.
    body = ap_article_soup.find_all('div')
    x = body[0].find_all('p')

    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        paragraph = paragraph.replace('CHICAGO (AP) -',"")
        paragraph = paragraph.replace('DETROIT (AP) -',"")
        paragraph = paragraph.replace('WASHINGTON (AP) -',"")
        paragraph = paragraph.replace('___ Catch up on the 2020 election campaign with AP experts on our weekly politics podcast, “Ground Game.',"")
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
    ap_text.append(final_article)

In [17]:
# join ap data
ap_data = pd.DataFrame.from_dict({
    'publisher': 'AP',
    'date': ap_dates,
    'link': ap_links,
    'article_title': ap_titles,
    'article_text': ap_text 
})

ap_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,AP,2020-03-12T18:28:26Z,https://apnews.com/5878d878b1c1b5c3274c529ed2b...,Trump economic team grasps for credibility wit...,WASHINGTON (AP) — During the financial crisis ...
1,AP,2020-03-12T04:56:31Z,https://apnews.com/6052e677cfc38b76224eccddb9f...,Washington strains for virus response as insti...,WASHINGTON (AP) — Washington is straining for ...
2,AP,2020-03-12T16:46:03Z,https://apnews.com/f20e530ae6dfe4926f81a119054...,Brazilian who met Trump has virus; no plans to...,WASHINGTON (AP) — A senior Brazilian official ...
3,AP,2020-03-12T17:05:37Z,https://apnews.com/4ab8fb97f6434dfd442bb2c8c3f...,Debate moves from Phoenix to DC over coronavir...,WASHINGTON (AP) — The Democratic National Comm...
4,AP,2020-03-12T15:57:45Z,https://apnews.com/f6d9c0b8d9504b5708a89718e56...,"Biden, Sanders offer contrasts to Trump during...","WILMINGTON, Del. (AP) — Democratic presidentia..."


In [18]:
# read in old data
old_ap_data = pd.read_csv('data/ap_data.csv')
num_old = len(old_ap_data)

# append new data
ap_data = old_ap_data.append(ap_data).drop_duplicates()

# save new .csv
ap_data.to_csv("data/ap_data.csv", index = False)
num_now = len(ap_data)

In [19]:
# see number of articles
print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))
print("difference: {}".format(num_now-num_old))

number of entries in old data: 66
total number of entries in new data: 115
difference: 49


### 4. New York Times - Liberal

In [34]:
# load the HTML content using requests and save into a variable
nyt_request = requests.get('https://www.nytimes.com/section/politics')
nyt_homepage = nyt_request.content

In [35]:
# create soup 
nyt_soup = BeautifulSoup(nyt_homepage, 'html.parser')

In [36]:
# homepage URLs
nyt_tags_home = nyt_soup.find_all('h2', class_="css-l2vidh e4e4i5l1")

# archive URLs
nyt_tags_archive = nyt_soup.find_all('div', class_='css-1l4spti')

In [37]:
# setup 
nyt_links = []
nyt_titles = []
nyt_dates = []
nyt_contents = []

In [38]:
# homepage articles
for n in np.arange(0, len(nyt_tags_home)):

    # get article link
    link = nyt_tags_home[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_home[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', {'class':['css-53u6y8', 'css-1fanzo5']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [39]:
# archive articles
for n in np.arange(0, len(nyt_tags_archive)):

    # get article link
    link = nyt_tags_archive[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_archive[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
        
    # get article content
    body = soup_article.find_all('div', attrs = {'class':['css-53u6y8', 'css-1fanzo5 StoryBodyCompanionColumn']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

KeyboardInterrupt: 

In [None]:
# assembling data
nyt_data = pd.DataFrame.from_dict({
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents 
})

In [None]:
# make sure it looks nice
nyt_data.head()

In [None]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')
num_old = len(old_nyt_data)

# append new data
nyt_data = old_nyt_data.append(nyt_data).drop_duplicates()

# save new .csv
nyt_data.to_csv("data/nyt_data.csv", index = False)
num_now = len(nyt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

### 5. Buzzfeed - Very Liberal

In [40]:
# load the HTML content using requests and save into a variable
buzz_request = requests.get('https://www.buzzfeednews.com/section/politics')
buzz_homepage = buzz_request.content

In [41]:
# create soup 
buzz_soup = BeautifulSoup(buzz_homepage, 'html.parser')

In [42]:
# locate article URLs
buzz_tags = buzz_soup.find_all('h2')

In [43]:
# setup
number_of_articles = min(len(buzz_tags), 30)

# get article titles, content, and links
buzz_links = []
buzz_titles = []
buzz_dates = []
buzz_contents = []

In [44]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = buzz_tags[n].find('a')['href']
    buzz_links.append(link)
    
    # get article title
    title = buzz_tags[n].find('a').get_text()
    buzz_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.find_all('div', class_="news-article-header__timestamps")    
    date = " ".join([item.text for item in date]).replace('\n', '')
    buzz_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', attrs={'data-module':'subbuzz-text'})
    article = " ".join([item.text for item in body]).replace('\n', '')
    final_article = re.sub(r' {[^}]*}', '', article)
        
    buzz_contents.append(final_article)

In [45]:
# assembling data
buzz_data = pd.DataFrame.from_dict({
    'publisher': 'buzzfeed',
    'date': buzz_dates,
    'link': buzz_links,
    'article_title': buzz_titles,
    'article_text': buzz_contents 
})

In [46]:
buzz_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,buzzfeed,"Posted on March 12, 2020, at 4:46 ...",https://www.buzzfeednews.com/article/paulmcleo...,The Trump Administration Will Move Ahead With ...,WASHINGTON — The Trump administration is ...
1,buzzfeed,"Posted on March 12, 2020, at 2:51 ...",https://www.buzzfeednews.com/article/kadiagoba...,Members Of Congress Are Furious At The Lack Of...,WASHINGTON — A day after the World Health...
2,buzzfeed,"Posted on March 12, 2020, at 2:44 ...",https://www.buzzfeednews.com/article/henrygome...,Joe Biden’s Coronavirus Speech And Campaign Sh...,Joe Biden is looking past the remaining p...
3,buzzfeed,"Last updated on March 12, 2020, at...",https://www.buzzfeednews.com/article/matthewch...,A Brazilian Official Who Met Trump At Mar-A-La...,Brazilian President Jair Bolsonaro's pres...
4,buzzfeed,"Posted on March 11, 2020, at 5:06 ...",https://www.buzzfeednews.com/article/ryanmac/c...,Secret Users Of Clearview AI’s Facial Recognit...,"On a flight to Boston in January, James, ..."


In [47]:
# read in old data
old_buzz_data = pd.read_csv('data/buzzfeed_data.csv')
num_old = len(old_buzz_data)

# append new data
buzz_data = old_buzz_data.append(buzz_data).drop_duplicates()

# save new .csv
buzz_data.to_csv("data/buzzfeed_data.csv", index = False)
num_now = len(buzz_data)

#print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

total number of entries in new data: 50
