# Web Scraper Tool for US Media Outlets

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [2]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [3]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [4]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [5]:
# setup
number_of_articles = min(len(breitbart_tags), 30)

breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

In [6]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [7]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [8]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,Greek authorities have deployed large fans at ...,Greece Deploys Large Fans to Blow Back Migrant...,2020-03-14,https://www.breitbart.com/europe/2020/03/14/gr...,Breitbart
1,President Donald Trump announced during his pr...,Trump Orders Department of Energy to Replenish...,2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Breitbart
2,Former Vice President Joe Biden (D) is refusin...,"Joe Biden Refuses to Listen to Experts, Reject...",2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Breitbart
3,President Donald Trump on Friday night announc...,Donald Trump Endorses Nancy Pelosi Coronavirus...,2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Breitbart
4,President Donald Trump suggested that he would...,Donald Trump Says He Will Get Tested for Coron...,2020-03-13,https://www.breitbart.com/politics/2020/03/13/...,Breitbart


In [9]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')
num_old = len(old_breitbart_data)

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)
num_now = len(breitbart_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 180
total number of entries in new data: 210


### 2. Fox - Conservative

In [10]:
# load the HTML content using requests and save into a variable
fox_requests = requests.get('https://www.foxnews.com/politics')
fox_homepage = fox_requests.content

In [11]:
# create a soup to allow BeautifulSoup to work
fox_soup = BeautifulSoup(fox_homepage, 'html.parser')

In [12]:
# locate article links
fox_tags = fox_soup.find_all('article')

In [7]:
number_of_articles = 30

fox_links = []

# get homepage article links
for n in np.arange(0, number_of_articles):
    link = fox_tags[n].find('a')
    link = link.get('href')
    link = "https://foxnews.com" + link
    fox_links.append(link)
    fox_links = [x for x in fox_links if "/v/" not in x]

In [8]:
fox_text = []
fox_titles = []
fox_dates = []

# prep for article content
for link in fox_links:
    fox_article_request = requests.get(link)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')
    
    # get article metadata
    fox_metadata = fox_article_soup.find_all('script')[2].get_text()
    fox_metadata = fox_metadata.split(",")
    
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            item = item.replace('\n',"")
            item = item.replace('headline', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_titles.append(item)
        
        # get article date
        elif 'datePublished' in item:
            item = item.replace('\n',"")
            item = item.replace('datePublished', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_dates.append(item)
    
    # get article text
    body = fox_article_soup.find_all('div')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        
        # removing copyright info and newsletter junk from the article
        final_article = " ".join(list_paragraphs)
        final_article = final_article.replace("This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("This material may not be published, broadcast, rewritten,", " ")
        final_article = final_article.replace("or redistributed. ©2020 FOX News Network, LLC. All rights reserved.", " ")
        final_article = final_article.replace("All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!", " ")
    fox_text.append(final_article)

In [9]:
# join fox data
fox_data = pd.DataFrame.from_dict({
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text 
})

fox_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Fox,2020-03-07T154057-0500,https://foxnews.com/politics/pence-coronavirus...,Pence calls for 'whole of America' ap...,VP Pence meets with ...
1,Fox,2020-03-06T211054-0500,https://foxnews.com/politics/house-prepares-te...,House prepares for telework scenarios...,Fox News Flash top h...
2,Fox,2020-03-07T131303-0500,https://foxnews.com/politics/sanders-biden-ent...,Sanders says Biden can’t ‘generate en...,Former Vice Presiden...
3,Fox,2020-03-07T154057-0500,https://foxnews.com/politics/pence-coronavirus...,Pence calls for 'whole of America' ap...,VP Pence meets with ...
4,Fox,2020-03-07T131303-0500,https://foxnews.com/politics/sanders-biden-ent...,Sanders says Biden can’t ‘generate en...,Former Vice Presiden...


In [12]:
fox_data.to_csv("data/fox_data.csv", index = False)

In [208]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')

# append new data
fox_data = old_fox_data.append(fox_data).drop_duplicates()

# save new .csv
fox_data.to_csv("data/fox_data.csv", index = False)

### 3. The Washington Times - Center Right

In [595]:
# load the HTML content using requests and save into a variable
wt_request = requests.get('https://www.washingtontimes.com/news/politics/')
wt_homepage = wt_request.content

In [596]:
# create soup 
wt_soup = BeautifulSoup(wt_homepage, 'html.parser')

In [597]:
# locate article URLs
wt_tags = wt_soup.find_all('h2', class_="article-headline")

In [626]:
# setup
number_of_articles = len(wt_tags)

# get article titles, content, and links
wt_links = []
wt_titles = []
wt_dates = []
wt_contents = []

In [627]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = wt_tags[n].find('a')['href']
    link = 'https://www.washingtontimes.com' + link
    wt_links.append(link)
    
    # get article title
    title = wt_tags[n].find('a').get_text()
    wt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    meta = soup_article.find("div", class_="meta").find("span", class_="source").text
    strip = meta.replace(' -\n\t\t\t\n\t\t\t\tAssociated Press\n -\n                      \n                        \n                        ', '')
    strip = strip.replace(' -\n\t\t\t\n\t\t\t\tThe Washington Times\n -\n                      \n                        \n                        ', '')
    date = strip.replace('\n                      \n                    ', '')
    wt_dates.append(date)
    
    # get article content
    for div in soup_article.find_all("div", {'class':'article-toplinks'}): 
        div.decompose()
    
    body = soup_article.find_all('div', class_= 'bigtext')  
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs).split("\n")[0]
       
    wt_contents.append(final_article)

In [628]:
# assembling data
wt_data = pd.DataFrame.from_dict({
    'publisher': 'washington_times',
    'date': wt_dates,
    'link': wt_links,
    'article_title': wt_titles,
    'article_text': wt_contents 
})

In [630]:
wt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON (AP) — The Democratic-controlled Ho...,House passes bipartisan coronavirus relief bil...,"Saturday, March 14, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
1,"The federal appeals court in Washington, D.C.,...",Full appeals court to hear challenges over bor...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
2,The top Democrat on the Senate Foreign Relatio...,Menendez calls on Trump to admit Iranian deter...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
3,In his quest to regain a Senate seat he once h...,Jeff Sessions gets the conservative establishm...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times
4,Count Sen. Bernie Sanders among the people who...,Sanders says Trump should get tested for coron...,"Friday, March 13, 2020",https://www.washingtontimes.com/news/2020/mar/...,washington_times


In [632]:
# read in old data
old_wt_data = pd.read_csv('data/wt_data.csv')
num_old = len(old_wt_data)

# append new data
wt_data = old_wt_data.append(wt_data).drop_duplicates()

# save new .csv
wt_data.to_csv("data/wt_data.csv", index = False)
num_now = len(wt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 27
total number of entries in new data: 27


### 4. Wall Street Journal - Neutral

In [4]:
# load the HTML content using requests and save into a variable
wsj_requests = requests.get('https://www.wsj.com/news/politics')
wsj_homepage = wsj_requests.content

In [5]:
# create a soup to allow BeautifulSoup to work
wsj_soup = BeautifulSoup(wsj_homepage, 'html.parser')

In [6]:
# locate article links
wsj_tags = wsj_soup.find_all('div')
# print(wsj_tags)

# LOOKING FOR -- a class="wsj-headline-link" 

### 5. NBC - Center-Left

In [366]:
# load the HTML content using requests and save into a variable
nbc_request = requests.get('https://www.nbcnews.com/politics')
nbc_homepage = nbc_request.content

In [367]:
# create soup 
nbc_soup = BeautifulSoup(nbc_homepage, 'html.parser')

In [368]:
# locate article URLs
nbc_tags = nbc_soup.find_all('h2', class_="teaseCard__headline") + nbc_soup.find_all('h2', class_="title___2T5qK")

In [369]:
# setup
number_of_articles = len(nbc_tags)

# get article titles, content, and links
nbc_links = []
nbc_titles = []
nbc_dates = []
nbc_contents = []

In [370]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = nbc_tags[n].find('a')['href']
    nbc_links.append(link)
    
    # get article title
    title = nbc_tags[n].find('a').get_text()
    nbc_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    if soup_article.time != None:
        date = soup_article.time.attrs['datetime']
        date = date[4:-24] 
    else:
        date = None
    nbc_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_= 'article-body__content')    
    final_article = " ".join([item.text for item in body])
       
    nbc_contents.append(final_article)

In [371]:
# assembling data
nbc_data = pd.DataFrame.from_dict({
    'publisher': 'nbc',
    'date': nbc_dates,
    'link': nbc_links,
    'article_title': nbc_titles,
    'article_text': nbc_contents 
})

In [372]:
# dropping rows that are not text articles (these will have NA in date)
nbc_data = nbc_data.dropna()

In [373]:
nbc_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,President Donald Trump on Friday announced a n...,Trump declares national emergency to combat co...,Mar 13 2020,https://www.nbcnews.com/politics/donald-trump/...,nbc
1,With the coronavirus pandemic wreaking havoc o...,Coronavirus forces candidates to shift to 'vir...,Mar 13 2020,https://www.nbcnews.com/politics/2020-election...,nbc
2,WASHINGTON — Joe Biden is winning the delegate...,Biden's delicate dance to win over the 'Bernie...,Mar 11 2020,https://www.nbcnews.com/politics/2020-election...,nbc
3,The only way to beat President Donald Trump in...,Bill de Blasio: The Democratic primary isn't o...,Mar 11 2020,https://www.nbcnews.com/think/opinion/democrat...,nbc
4,"It's not Super Tuesday, but there are six more...",It gets a whole lot harder for Sanders today. ...,Mar 09 2020,https://www.nbcnews.com/politics/2020-election...,nbc


In [374]:
# read in old data
old_nbc_data = pd.read_csv('data/nbc_data.csv')
num_old = len(old_nbc_data)

# append new data
nbc_data = old_nbc_data.append(nbc_data).drop_duplicates()

# save new .csv
nbc_data.to_csv("data/nbc_data.csv", index = False)
num_now = len(nbc_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 20
total number of entries in new data: 52


### 6. New York Times - Liberal

In [27]:
# load the HTML content using requests and save into a variable
nyt_request = requests.get('https://www.nytimes.com/section/politics')
nyt_homepage = nyt_request.content

In [28]:
# create soup 
nyt_soup = BeautifulSoup(nyt_homepage, 'html.parser')

In [29]:
# homepage URLs
nyt_tags_home = nyt_soup.find_all('h2', class_="css-l2vidh e4e4i5l1")

# archive URLs
nyt_tags_archive = nyt_soup.find_all('div', class_='css-1l4spti')

In [41]:
# setup 
nyt_links = []
nyt_titles = []
nyt_dates = []
nyt_contents = []

In [42]:
# homepage articles
for n in np.arange(0, len(nyt_tags_home)):

    # get article link
    link = nyt_tags_home[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_home[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', {'class':['css-53u6y8', 'css-1fanzo5']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [43]:
# archive articles
for n in np.arange(0, len(nyt_tags_archive)):

    # get article link
    link = nyt_tags_archive[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_archive[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
        
    # get article content
    body = soup_article.find_all('div', attrs = {'class':['css-53u6y8', 'css-1fanzo5 StoryBodyCompanionColumn']})
    final_article = " ".join([item.text for item in body])
        
    nyt_contents.append(final_article)

In [44]:
# assembling data
nyt_data = pd.DataFrame.from_dict({
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents 
})

In [46]:
# make sure it looks nice
nyt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
9,WASHINGTON — The Justice Department this week ...,Justice Dept. Religious Freedom Training Spurs...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
10,“Today I’d like to provide an update to the Am...,Transcript: Trump’s Coronavirus News Conferenc...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
11,WASHINGTON — The Centers for Disease Control a...,Administration Offers Guidance to Schools as T...,2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
12,"Tom Turnipseed, who after working on the presi...","Tom Turnipseed, a ‘Reformed Racist’ After Back...",2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times
13,"WASHINGTON — Despite the worsening pandemic, t...","Amid a Pandemic, Trump Moves Forward With Safe...",2020-03-13,https://www.nytimes.com/2020/03/13/us/politics...,new_york_times


In [47]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')
num_old = len(old_nyt_data)

# append new data
nyt_data = old_nyt_data.append(nyt_data).drop_duplicates()

# save new .csv
nyt_data.to_csv("data/nyt_data.csv", index = False)
num_now = len(nyt_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 69
total number of entries in new data: 83


### 7. Politico - Liberal

In [407]:
# load the HTML content using requests and save into a variable
politico_request = requests.get('https://www.politico.com/politics')
politico_homepage = politico_request.content

In [408]:
# create soup 
politico_soup = BeautifulSoup(politico_homepage, 'html.parser')

In [409]:
# locate article URLs
politico_tags = politico_soup.find_all('h3')

In [420]:
# setup
number_of_articles = len(politico_tags)

# get article titles, content, and links
politico_links = []
politico_titles = []
politico_dates = []
politico_contents = []

In [421]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = politico_tags[n].find('a')['href']
    politico_links.append(link)
    
    # get article title
    title = politico_tags[n].find('a').get_text()
    politico_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-9]
    politico_dates.append(date)
    
    # get article content
    body = soup_article.find_all('p', attrs={'class':'story-text__paragraph'})
    final_article = " ".join([item.text for item in body])
    
    politico_contents.append(final_article)

In [422]:
# assembling data
politico_data = pd.DataFrame.from_dict({
    'publisher': 'politico',
    'date': politico_dates,
    'link': politico_links,
    'article_title': politico_titles,
    'article_text': politico_contents 
})

In [427]:
# dropping rows that are not text articles (these will have NA in text)
politico_data = politico_data.dropna()

In [428]:
politico_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,In the weeks since the coronavirus outbreak fi...,What Trump’s Twitter Feed Tells Him About the ...,2020-03-14,https://www.politico.com/news/magazine/2020/03...,politico
1,As Americans brace for the rapid spread of the...,Trump’s monumental challenge: Rescuing industr...,2020-03-14,https://www.politico.com/news/2020/03/14/trump...,politico
2,"Arkansas Republican Tom Cotton, a freshman sen...",Two senators take extreme measures to show fea...,2020-03-14,https://www.politico.com/news/2020/03/14/cotto...,politico
3,Local officials from around the country are wo...,Local officials: We’re not ready,2020-03-14,https://www.politico.com/news/2020/03/14/healt...,politico
4,Joe Biden has long complained about the crowde...,Biden readies plan to finish off Bernie,2020-03-14,https://www.politico.com/news/2020/03/14/biden...,politico


In [429]:
# read in old data
old_politico_data = pd.read_csv('data/politico_data.csv')
num_old = len(old_politico_data)

# append new data
politico_data = old_politico_data.append(politico_data).drop_duplicates()

# save new .csv
politico_data.to_csv("data/politico_data.csv", index = False)
num_now = len(politico_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 69
total number of entries in new data: 69


### 8. Buzzfeed - Very Liberal

In [18]:
# load the HTML content using requests and save into a variable
buzz_request = requests.get('https://www.buzzfeednews.com/section/politics')
buzz_homepage = buzz_request.content

In [19]:
# create soup 
buzz_soup = BeautifulSoup(buzz_homepage, 'html.parser')

In [20]:
# locate article URLs
buzz_tags = buzz_soup.find_all('h2')

In [21]:
# setup
number_of_articles = min(len(buzz_tags), 30)

# get article titles, content, and links
buzz_links = []
buzz_titles = []
buzz_dates = []
buzz_contents = []

In [22]:
# get article titles, content, and links
for n in np.arange(0, number_of_articles):

    # get article link
    link = buzz_tags[n].find('a')['href']
    buzz_links.append(link)
    
    # get article title
    title = buzz_tags[n].find('a').get_text()
    buzz_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.find_all('div', class_="news-article-header__timestamps")    
    date = " ".join([item.text for item in date]).replace('\n', '')
    buzz_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', attrs={'data-module':'subbuzz-text'})
    article = " ".join([item.text for item in body]).replace('\n', '')
    final_article = re.sub(r' {[^}]*}', '', article)
        
    buzz_contents.append(final_article)

In [23]:
# assembling data
buzz_data = pd.DataFrame.from_dict({
    'publisher': 'buzzfeed',
    'date': buzz_dates,
    'link': buzz_links,
    'article_title': buzz_titles,
    'article_text': buzz_contents 
})

In [24]:
buzz_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,WASHINGTON — When Washington state emerge...,A Senator From The State Hit Hardest By Corona...,"Posted on March 13, 2020, at 9:28 ...",https://www.buzzfeednews.com/article/kadiagoba...,buzzfeed
1,WASHINGTON — After shaking hands with sev...,Trump Said He Won't Self-Isolate After Coming ...,"Posted on March 13, 2020, at 5:41 ...",https://www.buzzfeednews.com/article/paulmcleo...,buzzfeed
2,"On Thursday, a prominent Chinese diplomat...",Chinese Diplomats Are Pushing Conspiracy Theor...,"Last updated on March 13, 2020, at...",https://www.buzzfeednews.com/article/ryanhates...,buzzfeed
3,President Trump declared a national state...,Trump Declared A National State Of Emergency O...,"Last updated on March 13, 2020, at...",https://www.buzzfeednews.com/article/clarissaj...,buzzfeed
4,Thousands of Americans think President Do...,What Happens If Trump Tries To Cancel The Elec...,"Posted on March 13, 2020, at 2:12 ...",https://www.buzzfeednews.com/article/dominicho...,buzzfeed


In [25]:
# read in old data
old_buzz_data = pd.read_csv('data/buzzfeed_data.csv')
num_old = len(old_buzz_data)

# append new data
buzz_data = old_buzz_data.append(buzz_data).drop_duplicates()

# save new .csv
buzz_data.to_csv("data/buzzfeed_data.csv", index = False)
num_now = len(buzz_data)

print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))

number of entries in old data: 57
total number of entries in new data: 62
