# Web Scraper Tool for US Media Outlets

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [2]:
# Download the Breitbart politics landing page and keep the raw response bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
breitbart_request = requests.get('https://www.breitbart.com/politics/', timeout=30)
breitbart_request.raise_for_status()
breitbart_homepage = breitbart_request.content

In [3]:
# parse the downloaded landing page with Python's built-in HTML parser
breitbart_soup = BeautifulSoup(markup=breitbart_homepage, features='html.parser')

In [4]:
# every <h2> on the landing page is treated as a candidate article headline
breitbart_tags = breitbart_soup.find_all(name='h2')

In [5]:
# crawl at most 30 headlines (fewer when the page lists fewer)
number_of_articles = min(30, len(breitbart_tags))

# one accumulator list per column of the final table
breitbart_links, breitbart_titles = [], []
breitbart_dates, breitbart_contents = [], []

In [6]:
# get article titles, content, and links
for tag in breitbart_tags[:number_of_articles]:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (the href is prefixed with the site host, as before)
    link = "https://www.breitbart.com" + anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: drop the last 10 characters of the first
    # <time> tag's datetime attribute (the sample output keeps YYYY-MM-DD)
    date = soup_article.time.attrs['datetime'][:-10]

    # get article content: every <p> inside the first 'entry-content' div.
    # The text is built once per article, so an article with no body or no
    # paragraphs yields "" instead of reusing the previous article's text.
    body = soup_article.find_all('div', class_='entry-content')
    paragraphs = body[0].find_all('p') if body else []
    final_article = " ".join(p.get_text() for p in paragraphs)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    breitbart_links.append(link)
    breitbart_titles.append(title)
    breitbart_dates.append(date)
    breitbart_contents.append(final_article)

In [7]:
# build the Breitbart table: one row per scraped article, with the scalar
# publisher label repeated on every row
breitbart_columns = {
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents,
}
breitbart_data = pd.DataFrame.from_dict(breitbart_columns)

In [8]:
# preview the first rows to sanity-check the scraped columns
breitbart_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,"Friday, Rep. Matt Gaetz (R-FL) applauded Presi...",Gaetz Slams Dems for Using Coronavirus Threat ...,2020-03-14,https://www.breitbart.com/clips/2020/03/14/gae...,Breitbart
1,Nigel Farage has said that the actions of gove...,Farage on Coronavirus: ‘We Are All Nationalist...,2020-03-14,https://www.breitbart.com/europe/2020/03/14/fa...,Breitbart
2,The European Court of Human Rights (ECHR) has ...,European Human Rights Court Rules Against Chri...,2020-03-14,https://www.breitbart.com/europe/2020/03/14/eu...,Breitbart
3,UNITED NATIONS (AP) – Allegations of sexual ab...,Sexual Abuse Allegations Against UN Civilian S...,2020-03-14,https://www.breitbart.com/europe/2020/03/14/se...,Breitbart
4,The Swedish Public Health Authority has decide...,"Sweden to Stop Publishing Coronavirus Numbers,...",2020-03-14,https://www.breitbart.com/europe/2020/03/14/sw...,Breitbart


In [9]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')
num_old = len(old_breitbart_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
breitbart_data = pd.concat([old_breitbart_data, breitbart_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
breitbart_data.to_csv("data/breitbart_data.csv", index = False)
num_now = len(breitbart_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 180
total number of entries in new data: 210


### 2. Fox - Conservative

In [10]:
# Download the Fox News politics landing page and keep the raw response bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
fox_requests = requests.get('https://www.foxnews.com/politics', timeout=30)
fox_requests.raise_for_status()
fox_homepage = fox_requests.content

In [11]:
# parse the downloaded landing page with Python's built-in HTML parser
fox_soup = BeautifulSoup(markup=fox_homepage, features='html.parser')

In [12]:
# every <article> element on the landing page is a candidate story teaser
fox_tags = fox_soup.find_all(name='article')

In [13]:
# one accumulator list per column of the final table
fox_links, fox_text = [], []
fox_titles, fox_dates = [], []

In [14]:
# crawl at most 30 teasers, but never index past the end of fox_tags when the
# page lists fewer (a fixed 30 raised IndexError in that case)
number_of_articles = min(len(fox_tags), 30)

# get homepage article links
for tag in fox_tags[:number_of_articles]:
    anchor = tag.find('a')
    # an <article> element without a hyperlink has nothing to fetch
    if anchor is None or not anchor.get('href'):
        continue
    fox_links.append("https://foxnews.com" + anchor.get('href'))

# drop links containing "/v/"; filtering once after the loop gives the same
# list as re-filtering on every iteration
fox_links = [x for x in fox_links if "/v/" not in x]

In [15]:
# text fragments stripped from every article, applied in this order
# (removing copyright info and newsletter junk from the article)
fox_junk_fragments = (
    "This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.",
    "This material may not be published, broadcast, rewritten,",
    "or redistributed. ©2020 FOX News Network, LLC. All rights reserved.",
    "All market data delayed 20 minutes.",
    "Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!",
)

# prep for article content
for link in fox_links:
    # timeout keeps a stalled download from hanging the cell
    fox_article_request = requests.get(link, timeout=30)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')

    # get article metadata: the text of the third <script> tag, split on commas
    # NOTE(review): index 2 is tied to the current page template - confirm
    fox_scripts = fox_article_soup.find_all('script')
    fox_metadata = fox_scripts[2].get_text().split(",") if len(fox_scripts) > 2 else []

    # Record exactly one title and one date per link (None when absent) so that
    # fox_titles / fox_dates always stay the same length as fox_links.
    # NOTE(review): splitting on "," and deleting ":" truncates titles that
    # contain commas and strips the colons from the timestamp (visible in the
    # saved output); the format is kept so new rows still match rows in the CSV.
    title = None
    date = None
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            if title is None:
                title = item.replace('\n', "").replace('headline', "").replace(':', "").replace('"', '')

        # get article date
        elif 'datePublished' in item:
            if date is None:
                date = item.replace('\n', "").replace('datePublished', "").replace(':', "").replace('"', '')

    fox_titles.append(title)
    fox_dates.append(date)

    # get article text: every <p> inside the first <div> of the page
    body = fox_article_soup.find_all('div')
    paragraphs = body[0].find_all('p') if body else []

    # combine paragraphs once per article, so a page without paragraphs yields
    # "" instead of reusing the previous article's text
    final_article = " ".join(p.get_text().replace('\n', "") for p in paragraphs)
    for fragment in fox_junk_fragments:
        final_article = final_article.replace(fragment, " ")
    fox_text.append(final_article)

In [16]:
# build the Fox table: one row per scraped article, with the scalar publisher
# label repeated on every row
fox_columns = {
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text,
}
fox_data = pd.DataFrame.from_dict(fox_columns)

# preview the first rows
fox_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,President Trump and ...,Trump declares national emergency ove...,2020-03-13T140714-0400,https://foxnews.com/politics/trump-declares-na...,Fox
1,Lawmakers work on co...,House OKs coronavirus relief bill,2020-03-13T182340-0400,https://foxnews.com/politics/house-oks-coronav...,Fox
2,Democratic president...,Democracy 2020 Digest Virtual campaig...,2020-03-13T172206-0400,https://foxnews.com/politics/democracy-2020-di...,Fox
3,Why the global numbe...,De Blasio refuses to shutter NYC scho...,2020-03-14T091759-0400,https://foxnews.com/politics/de-blasio-nyc-sch...,Fox
4,Lawmakers work on co...,House OKs coronavirus relief bill,2020-03-13T182340-0400,https://foxnews.com/politics/house-oks-coronav...,Fox


In [17]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')
num_old = len(old_fox_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
fox_data = pd.concat([old_fox_data, fox_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
fox_data.to_csv("data/fox_data.csv", index = False)
num_now = len(fox_data)

In [65]:
# report the row count before the merge, after it, and the net gain
for template, value in (
    ("number of entries in old data: {}", num_old),
    ("total number of entries in new data: {}", num_now),
    ("difference: {}", num_now-num_old),
):
    print(template.format(value))

number of entries in old data: 67
total number of entries in new data: 81
difference: 14


### 3. Washington Times - Center Right

In [40]:
# Download the Washington Times politics landing page and keep the raw bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
wt_request = requests.get('https://www.washingtontimes.com/news/politics/', timeout=30)
wt_request.raise_for_status()
wt_homepage = wt_request.content

In [41]:
# parse the downloaded landing page with Python's built-in HTML parser
wt_soup = BeautifulSoup(markup=wt_homepage, features='html.parser')

In [42]:
# article teasers are the <h2> headings carrying the "article-headline" class
wt_tags = wt_soup.find_all(name='h2', class_="article-headline")

In [43]:
# every located headline is crawled (no cap for this outlet)
number_of_articles = len(wt_tags)

# one accumulator list per column of the final table
wt_links, wt_titles = [], []
wt_dates, wt_contents = [], []

In [44]:
# get article titles, content, and links
for tag in wt_tags[:number_of_articles]:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (the href is prefixed with the site host, as before)
    link = 'https://www.washingtontimes.com' + anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: take the byline text and delete the exact
    # source/whitespace runs so that only the date string is left
    # NOTE(review): only the "Associated Press" and "The Washington Times"
    # bylines are handled; any other source text would remain in the date
    meta = soup_article.find("div", class_="meta").find("span", class_="source").text
    strip = meta.replace(' -\n\t\t\t\n\t\t\t\tAssociated Press\n -\n                      \n                        \n                        ', '')
    strip = strip.replace(' -\n\t\t\t\n\t\t\t\tThe Washington Times\n -\n                      \n                        \n                        ', '')
    date = strip.replace('\n                      \n                    ', '')

    # get article content: remove the "article-toplinks" blocks first so their
    # text does not end up in the article body
    for div in soup_article.find_all("div", {'class':'article-toplinks'}):
        div.decompose()

    body = soup_article.find_all('div', class_= 'bigtext')
    paragraphs = body[0].find_all('p') if body else []

    # combine paragraphs once per article and keep the text before the first
    # newline; an article with no body or paragraphs yields "" instead of
    # reusing the previous article's text
    final_article = " ".join(p.get_text() for p in paragraphs).split("\n")[0]

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    wt_links.append(link)
    wt_titles.append(title)
    wt_dates.append(date)
    wt_contents.append(final_article)

In [45]:
# build the Washington Times table: one row per scraped article, with the
# scalar publisher label repeated on every row
wt_columns = {
    'publisher': 'washington_times',
    'date': wt_dates,
    'link': wt_links,
    'article_title': wt_titles,
    'article_text': wt_contents,
}
wt_data = pd.DataFrame.from_dict(wt_columns)

In [46]:
# preview the first rows to sanity-check the scraped columns
wt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON (AP) — The Democratic-controlled Ho...,House passes bipartisan coronavirus relief bil...,"Saturday, March 14, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
1,"The federal appeals court in Washington, D.C.,...",Full appeals court to hear challenges over bor...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
2,The top Democrat on the Senate Foreign Relatio...,Menendez calls on Trump to admit Iranian deter...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
3,In his quest to regain a Senate seat he once h...,Jeff Sessions gets the conservative establishm...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
4,Count Sen. Bernie Sanders among the people who...,Sanders says Trump should get tested for coron...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times


In [47]:
# read in old data
old_wt_data = pd.read_csv('data/wt_data.csv')
num_old = len(old_wt_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
wt_data = pd.concat([old_wt_data, wt_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
wt_data.to_csv("data/wt_data.csv", index = False)
num_now = len(wt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 27
total number of entries in new data: 30


### 4. Associated Press - Neutral

In [31]:
# Download the AP politics landing page and keep the raw response bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
ap_requests = requests.get('https://apnews.com/apf-politics', timeout=30)
ap_requests.raise_for_status()
ap_homepage = ap_requests.content

In [32]:
# parse the downloaded landing page with Python's built-in HTML parser
ap_soup = BeautifulSoup(markup=ap_homepage, features='html.parser')

In [33]:
# headline anchors are the <a> tags carrying this generated class name
ap_tags = ap_soup.find_all(name='a', class_="Component-headline-0-2-105")

In [34]:
# at most 30 headlines (fewer when the page lists fewer)
number_of_articles = min(30, len(ap_tags))

# one accumulator list per column of the final table
ap_links, ap_text = [], []
ap_titles, ap_dates = [], []

In [35]:
# get homepage article links, honouring the cap held in number_of_articles
# (it was computed in the setup cell but the loop previously ignored it and
# crawled every headline)
for tag in ap_tags[:number_of_articles]:
    href = tag.get('href')
    # an anchor without an href has nothing to fetch
    if not href:
        continue
    ap_links.append("https://apnews.com" + href)

In [36]:
# prep for article content
for link in ap_links:
    # timeout keeps a stalled download from hanging the cell
    ap_article_request = requests.get(link, timeout=30)
    ap_article = ap_article_request.content
    ap_article_soup = BeautifulSoup(ap_article, 'html.parser')

    # article title and date are read from fixed positions in the <meta> list
    # NOTE(review): indices 14 and 24 are tied to the current page template - confirm
    meta_tags = ap_article_soup.find_all('meta')
    title = meta_tags[14]['content']
    date = meta_tags[24]['content']

    # article content: every <p> inside the first <div> of the page
    body = ap_article_soup.find_all('div')
    paragraphs = body[0].find_all('p') if body else []

    # clean each paragraph, then combine once per article so a page without
    # paragraphs yields "" instead of reusing the previous article's text
    # NOTE(review): the dateline patterns use an ASCII hyphen; the saved output
    # still shows "WASHINGTON (AP) —" with an em dash, so they appear not to
    # match. Left unchanged so new rows stay comparable with rows in the CSV.
    list_paragraphs = []
    for p in paragraphs:
        paragraph = p.get_text()
        paragraph = paragraph.replace('\n',"")
        paragraph = paragraph.replace('CHICAGO (AP) -',"")
        paragraph = paragraph.replace('DETROIT (AP) -',"")
        paragraph = paragraph.replace('WASHINGTON (AP) -',"")
        paragraph = paragraph.replace('___ Catch up on the 2020 election campaign with AP experts on our weekly politics podcast, “Ground Game.',"")
        list_paragraphs.append(paragraph)
    final_article = " ".join(list_paragraphs)

    # append all three per-article columns together so they stay aligned with ap_links
    ap_titles.append(title)
    ap_dates.append(date)
    ap_text.append(final_article)

In [37]:
# build the AP table: one row per scraped article, with the scalar publisher
# label repeated on every row
ap_columns = {
    'publisher': 'AP',
    'date': ap_dates,
    'link': ap_links,
    'article_title': ap_titles,
    'article_text': ap_text,
}
ap_data = pd.DataFrame.from_dict(ap_columns)

# preview the first rows
ap_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON (AP) — Americans normally hear from...,"AP FACT CHECK: Trump, American exceptionalism ...",2020-03-14T12:32:37Z,https://apnews.com/19963ea122c12eb72b8f0ff14e8...,AP
1,WASHINGTON (AP) — The House approved legislati...,House passes aid bill after Trump declares vir...,2020-03-14T05:44:03Z,https://apnews.com/cb2685490d126350d41cae7562c...,AP
2,And then there were two. Joe Biden and Bernie...,"Debate questions: Biden, Sanders are finally t...",2020-03-14T13:55:32Z,https://apnews.com/a47e1dd4f28e090fb2375cab87e...,AP
3,WASHINGTON (AP) — President Donald Trump said ...,Trump says he's likely to be tested after repe...,2020-03-14T02:12:54Z,https://apnews.com/715fb5cd41518ac46e3a73c85b5...,AP
4,WASHINGTON (AP) — Sen. Bernie Sanders won the...,"Sanders wins Northern Mariana Islands caucus, ...",2020-03-14T13:45:30Z,https://apnews.com/bdf5de197d0471306b47415c474...,AP


In [38]:
# read in old data
old_ap_data = pd.read_csv('data/ap_data.csv')
num_old = len(old_ap_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
ap_data = pd.concat([old_ap_data, ap_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
ap_data.to_csv("data/ap_data.csv", index = False)
num_now = len(ap_data)

In [39]:
# report the row count before the merge, after it, and the net gain
for template, value in (
    ("number of entries in old data: {}", num_old),
    ("total number of entries in new data: {}", num_now),
    ("difference: {}", num_now-num_old),
):
    print(template.format(value))

number of entries in old data: 156
total number of entries in new data: 156
difference: 0


### 5. NBC - Center-Left

In [48]:
# Download the NBC News politics landing page and keep the raw response bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
nbc_request = requests.get('https://www.nbcnews.com/politics', timeout=30)
nbc_request.raise_for_status()
nbc_homepage = nbc_request.content

In [49]:
# parse the downloaded landing page with Python's built-in HTML parser
nbc_soup = BeautifulSoup(markup=nbc_homepage, features='html.parser')

In [50]:
# headlines come from two <h2> class variants; tease cards first, then titles
nbc_tags = [
    *nbc_soup.find_all(name='h2', class_="teaseCard__headline"),
    *nbc_soup.find_all(name='h2', class_="title___2T5qK"),
]

In [51]:
# every located headline is crawled (no cap for this outlet)
number_of_articles = len(nbc_tags)

# one accumulator list per column of the final table
nbc_links, nbc_titles = [], []
nbc_dates, nbc_contents = [], []

In [52]:
# get article titles, content, and links
for tag in nbc_tags[:number_of_articles]:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (used as-is, no host prefix is added for this outlet)
    link = anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime; pages without a <time> tag get None so the
    # later dropna() cell can discard them
    time_tag = soup_article.time
    if time_tag is not None:
        # slice keeps the middle of the attribute (e.g. "Mar 13 2020" in the sample output)
        date = time_tag.attrs['datetime'][4:-24]
    else:
        date = None

    # get article content: text of every 'article-body__content' div ("" when none)
    body = soup_article.find_all('div', class_= 'article-body__content')
    final_article = " ".join(item.text for item in body)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    nbc_links.append(link)
    nbc_titles.append(title)
    nbc_dates.append(date)
    nbc_contents.append(final_article)

In [53]:
# build the NBC table: one row per scraped article, with the scalar publisher
# label repeated on every row
nbc_columns = {
    'publisher': 'nbc',
    'date': nbc_dates,
    'link': nbc_links,
    'article_title': nbc_titles,
    'article_text': nbc_contents,
}
nbc_data = pd.DataFrame.from_dict(nbc_columns)

In [54]:
# discard any row containing a missing value; non-text pages were given a
# None date by the scraping loop, so this removes them
nbc_data = nbc_data.dropna(axis=0, how='any')

In [55]:
# preview the first rows to sanity-check the scraped columns
nbc_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,President Donald Trump on Friday announced a n...,Trump declares national emergency to combat co...,Mar 13 2020,https://www.nbcnews.com/politics/donald-trump/...,nbc
1,With the coronavirus pandemic wreaking havoc o...,Coronavirus forces candidates to shift to 'vir...,Mar 13 2020,https://www.nbcnews.com/politics/2020-election...,nbc
2,WASHINGTON — Joe Biden is winning the delegate...,Biden's delicate dance to win over the 'Bernie...,Mar 11 2020,https://www.nbcnews.com/politics/2020-election...,nbc
3,The only way to beat President Donald Trump in...,Bill de Blasio: The Democratic primary isn't o...,Mar 11 2020,https://www.nbcnews.com/think/opinion/democrat...,nbc
4,"It's not Super Tuesday, but there are six more...",It gets a whole lot harder for Sanders today. ...,Mar 09 2020,https://www.nbcnews.com/politics/2020-election...,nbc


In [56]:
# read in old data
old_nbc_data = pd.read_csv('data/nbc_data.csv')
num_old = len(old_nbc_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
nbc_data = pd.concat([old_nbc_data, nbc_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
nbc_data.to_csv("data/nbc_data.csv", index = False)
num_now = len(nbc_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 52
total number of entries in new data: 57


### 6. New York Times - Liberal

In [57]:
# Download the New York Times politics section page and keep the raw bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
nyt_request = requests.get('https://www.nytimes.com/section/politics', timeout=30)
nyt_request.raise_for_status()
nyt_homepage = nyt_request.content

In [58]:
# parse the downloaded section page with Python's built-in HTML parser
nyt_soup = BeautifulSoup(markup=nyt_homepage, features='html.parser')

In [59]:
# featured stories at the top of the section: <h2> headings with this class string
nyt_tags_home = nyt_soup.find_all(name='h2', class_="css-l2vidh e4e4i5l1")

# the archive list further down the page: <div> wrappers with this class
nyt_tags_archive = nyt_soup.find_all(name='div', class_='css-1l4spti')

In [60]:
# one accumulator list per column of the final table
nyt_links, nyt_titles = [], []
nyt_dates, nyt_contents = [], []

In [61]:
# homepage articles
for tag in nyt_tags_home:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (the href is prefixed with the site host, as before)
    link = "https://www.nytimes.com" + anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: drop the last 15 characters of the first
    # <time> tag's datetime attribute (the sample output keeps YYYY-MM-DD)
    date = soup_article.time.attrs['datetime'][:-15]

    # get article content: text of every div carrying either body class ("" when none)
    body = soup_article.find_all('div', {'class':['css-53u6y8', 'css-1fanzo5']})
    final_article = " ".join(item.text for item in body)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    nyt_links.append(link)
    nyt_titles.append(title)
    nyt_dates.append(date)
    nyt_contents.append(final_article)

In [62]:
# archive articles
for tag in nyt_tags_archive:

    # wrappers without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (the href is prefixed with the site host, as before)
    link = "https://www.nytimes.com" + anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: drop the last 15 characters of the first
    # <time> tag's datetime attribute (the sample output keeps YYYY-MM-DD)
    date = soup_article.time.attrs['datetime'][:-15]

    # get article content: text of every div matching either class entry ("" when none)
    body = soup_article.find_all('div', attrs = {'class':['css-53u6y8', 'css-1fanzo5 StoryBodyCompanionColumn']})
    final_article = " ".join(item.text for item in body)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    nyt_links.append(link)
    nyt_titles.append(title)
    nyt_dates.append(date)
    nyt_contents.append(final_article)

In [63]:
# build the New York Times table: one row per scraped article, with the scalar
# publisher label repeated on every row
nyt_columns = {
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents,
}
nyt_data = pd.DataFrame.from_dict(nyt_columns)

In [64]:
# preview the first rows to sanity-check the scraped columns
nyt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON — President Trump declared a nation...,House Passes Coronavirus Relief After Democrat...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
1,As he declared a national emergency over the c...,Trump’s False Claims About His Response to the...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
2,WASHINGTON — There was one big question loomin...,"Trump Won’t Be Getting a Coronavirus Test, His...",2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
3,The magnitude of former Vice President Joseph ...,Joe Biden’s Young Voter Problem: They Don’t Th...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
4,The coronavirus outbreak is inflicting new dis...,Could the 2020 Election Be Postponed? Only Wit...,2020-03-14,https://www.nytimes.com/2020/03/14/us/politics...,new_york_times


In [65]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')
num_old = len(old_nyt_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
nyt_data = pd.concat([old_nyt_data, nyt_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
nyt_data.to_csv("data/nyt_data.csv", index = False)
num_now = len(nyt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 55
total number of entries in new data: 69


### 7. Politico - Liberal

In [66]:
# Download the Politico politics landing page and keep the raw response bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
politico_request = requests.get('https://www.politico.com/politics', timeout=30)
politico_request.raise_for_status()
politico_homepage = politico_request.content

In [67]:
# parse the downloaded landing page with Python's built-in HTML parser
politico_soup = BeautifulSoup(markup=politico_homepage, features='html.parser')

In [68]:
# every <h3> on the landing page is treated as a candidate article headline
politico_tags = politico_soup.find_all(name='h3')

In [69]:
# every located headline is crawled (no cap for this outlet)
number_of_articles = len(politico_tags)

# one accumulator list per column of the final table
politico_links, politico_titles = [], []
politico_dates, politico_contents = [], []

In [70]:
# get article titles, content, and links
for tag in politico_tags[:number_of_articles]:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (used as-is, no host prefix is added for this outlet)
    link = anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: drop the last 9 characters of the first
    # <time> tag's datetime attribute (the sample output keeps YYYY-MM-DD)
    date = soup_article.time.attrs['datetime'][:-9]

    # get article content: text of every story paragraph ("" when the page has none)
    body = soup_article.find_all('p', attrs={'class':'story-text__paragraph'})
    final_article = " ".join(item.text for item in body)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    politico_links.append(link)
    politico_titles.append(title)
    politico_dates.append(date)
    politico_contents.append(final_article)

In [71]:
# build the Politico table: one row per scraped article, with the scalar
# publisher label repeated on every row
politico_columns = {
    'publisher': 'politico',
    'date': politico_dates,
    'link': politico_links,
    'article_title': politico_titles,
    'article_text': politico_contents,
}
politico_data = pd.DataFrame.from_dict(politico_columns)

In [72]:
# dropping rows that are not text articles.
# The scraping loop stores "" (never NA) when a page has no story paragraphs,
# so dropna() alone removes nothing; rows whose text is blank are dropped too.
politico_data = politico_data.dropna()
politico_data = politico_data[politico_data['article_text'].astype(str).str.strip() != ""]

In [73]:
# preview the first rows to sanity-check the scraped columns
politico_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,In the weeks since the coronavirus outbreak fi...,What Trump’s Twitter Feed Tells Him About the ...,2020-03-14,https://www.politico.com/news/magazine/2020/03...,politico
1,As Americans brace for the rapid spread of the...,Who gets saved and who collapses?,2020-03-14,https://www.politico.com/news/2020/03/14/trump...,politico
2,"Arkansas Republican Tom Cotton, a freshman sen...",Two senators take extreme measures to show all...,2020-03-14,https://www.politico.com/news/2020/03/14/cotto...,politico
3,Local officials from around the country are wo...,A health system overwhelmed,2020-03-14,https://www.politico.com/news/2020/03/14/healt...,politico
4,Joe Biden has long complained about the crowde...,Biden readies plan to finish off Bernie,2020-03-14,https://www.politico.com/news/2020/03/14/biden...,politico


In [74]:
# read in old data
old_politico_data = pd.read_csv('data/politico_data.csv')
num_old = len(old_politico_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
politico_data = pd.concat([old_politico_data, politico_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
politico_data.to_csv("data/politico_data.csv", index = False)
num_now = len(politico_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 69
total number of entries in new data: 73


### 8. Buzzfeed - Very Liberal

In [75]:
# Download the BuzzFeed News politics section page and keep the raw bytes.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article list.
buzz_request = requests.get('https://www.buzzfeednews.com/section/politics', timeout=30)
buzz_request.raise_for_status()
buzz_homepage = buzz_request.content

In [76]:
# parse the downloaded section page with Python's built-in HTML parser
buzz_soup = BeautifulSoup(markup=buzz_homepage, features='html.parser')

In [77]:
# every <h2> on the section page is treated as a candidate article headline
buzz_tags = buzz_soup.find_all(name='h2')

In [78]:
# crawl at most 30 headlines (fewer when the page lists fewer)
number_of_articles = min(30, len(buzz_tags))

# one accumulator list per column of the final table
buzz_links, buzz_titles = [], []
buzz_dates, buzz_contents = [], []

In [79]:
# get article titles, content, and links
for tag in buzz_tags[:number_of_articles]:

    # headings without a hyperlink are not article teasers; skip them
    anchor = tag.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # get article link (used as-is, no host prefix is added for this outlet)
    link = anchor['href']

    # get article title
    title = anchor.get_text()

    # prep article content (timeout keeps a stalled download from hanging the cell)
    article = requests.get(link, timeout=30)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # get publication datetime: text of the header timestamp block(s) with
    # newlines removed ("" when the block is missing)
    timestamp_divs = soup_article.find_all('div', class_="news-article-header__timestamps")
    date = " ".join(item.text for item in timestamp_divs).replace('\n', '')

    # get article content: text of every 'subbuzz-text' module, newlines removed,
    # then strip any " {...}" brace-delimited runs left in the text
    body = soup_article.find_all('div', attrs={'data-module':'subbuzz-text'})
    article_text = " ".join(item.text for item in body).replace('\n', '')
    final_article = re.sub(r' {[^}]*}', '', article_text)

    # append all four columns together so the lists always stay the same length,
    # even if a later iteration raises
    buzz_links.append(link)
    buzz_titles.append(title)
    buzz_dates.append(date)
    buzz_contents.append(final_article)

In [80]:
# build the BuzzFeed table: one row per scraped article, with the scalar
# publisher label repeated on every row
buzz_columns = {
    'publisher': 'buzzfeed',
    'date': buzz_dates,
    'link': buzz_links,
    'article_title': buzz_titles,
    'article_text': buzz_contents,
}
buzz_data = pd.DataFrame.from_dict(buzz_columns)

In [81]:
# preview the first rows to sanity-check the scraped columns
buzz_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON — When Washington state emerge...,A Senator From The State Hit Hardest By Corona...,"Posted on March 13, 2020, at 9:28 ...",https://www.buzzfeednews.com/article/kadiagoba...,buzzfeed
1,WASHINGTON — After shaking hands with sev...,Trump Said He Won't Self-Isolate After Coming ...,"Posted on March 13, 2020, at 5:41 ...",https://www.buzzfeednews.com/article/paulmcleo...,buzzfeed
2,"On Thursday, a prominent Chinese diplomat...",Chinese Diplomats Are Pushing Conspiracy Theor...,"Last updated on March 13, 2020, at...",https://www.buzzfeednews.com/article/ryanhates...,buzzfeed
3,President Trump declared a national state...,Trump Declared A National State Of Emergency O...,"Last updated on March 13, 2020, at...",https://www.buzzfeednews.com/article/clarissaj...,buzzfeed
4,Thousands of Americans think President Do...,What Happens If Trump Tries To Cancel The Elec...,"Posted on March 13, 2020, at 2:12 ...",https://www.buzzfeednews.com/article/dominicho...,buzzfeed


In [82]:
# read in old data
old_buzz_data = pd.read_csv('data/buzzfeed_data.csv')
num_old = len(old_buzz_data)

# append new data and drop rows that are exact duplicates across all columns
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
buzz_data = pd.concat([old_buzz_data, buzz_data]).drop_duplicates()

# save new .csv (overwrites the file that was just read)
buzz_data.to_csv("data/buzzfeed_data.csv", index = False)
num_now = len(buzz_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 54
total number of entries in new data: 60
