# Web Scraper Tool for 5 US Media Outlets

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [163]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [164]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [165]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [166]:
number_of_articles = min(len(breitbart_tags), 30)

# get article titles, content, and links
breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [7]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [35]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,A group of Conservative party grandees will se...,Tories Rebel over Huawei: As ‘Ridiculous’ as G...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/to...,Breitbart
1,Greece has accused the Turkish government of s...,Greece Denounces ‘Fake News’ as Turkey Claims ...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/gr...,Breitbart
2,Dramatic pictures and video are emerging of fi...,PICTURES: Greek Border in Flames as Migrants K...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/pi...,Breitbart
3,Finland’s millennial feminist-led government f...,Finland’s Millennial Feminist Govt to Help Gre...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/fi...,Breitbart
4,A total of 76 per cent of the Greek public sup...,Nearly 8 in 10 Greeks Support Government’s Bor...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/ne...,Breitbart


In [15]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)

### 2. Fox - Conservative

Get links from the Fox Politics homepage

In [3]:
# https://www.foxnews.com/category/politics/2020-presidential-election <- 127 links // 15  without junk
# https://www.foxnews.com/politics <- 150 links (but junk?) // 31 without junk
# i also tried running this for each candidate's section but there is overlap in articles

# load the HTML content using requests and save into a variable
r2 = requests.get('https://www.foxnews.com/politics')
homepage2 = r2.content

# create a soup to allow BeautifulSoup to work
soup2 = BeautifulSoup(homepage2, 'html.parser')

# locate and retrieve all links - WAS NOT ABLE TO ISOLATE JUST ARTICLE LINKS BECUASE NO UNIQUE TAG ELEMENT!!
# you can see how complex the homepage is: print(soup2.prettify())
homepage_tags2 = soup2.find_all('a')
homepage_links2 = []

for link in homepage_tags2:
    homepage_links2.append(link.get('href'))

# remove duplicates by turning list into a set
homepage_links2 = set(homepage_links2)

In [4]:
# remove junk links from list <- MAYBE THIS COULD BE DONE IN A BETTER WAY??
# creating a list of exact junk links that will never be needed when pulling from the homepage
junk_links2 = ['/', '/us', '/world', '/opinion', '/politics', '/entertainment', '//www.foxbusiness.com', 
                  '/lifestyle', '/shows', '//www.foxnews.com/shows/fox-nation', '//radio.foxnews.com/podcast',
                  '#', '//foxnews.com/weather/your-weather/index.html', '#', '#', 
                  '//video.foxnews.com/v/5614615980001/?#sp=watch-live', '#', '/us', '/world', '/opinion', '/politics', 
                  '/official-polls', '/category/politics/elections', 
                  '/entertainment', '//video.foxnews.com/playlist/entertainment-latest-entertainment/', 
                  '//www.foxbusiness.com/', '//www.foxbusiness.com/markets', '//www.foxbusiness.com/politics', 
                  '//www.foxbusiness.com/category/technology', '//www.foxbusiness.com/features', 
                  '//www.foxbusiness.com/category/business-leaders', '/lifestyle', '/food-drink', '/auto', 
                  '/travel', '/family', '/science', '/tech', '/health', '/shows', '/shows', 
                  '/person/personalities', '//video.foxnews.com/v/5614615980001/?#sp=watch-live', 
                  '//video.foxnews.com/playlist/episodic-most-recent-episodes/', 
                  '//video.foxnews.com/#sp=show-clips', '//video.foxnews.com/#sp=news-clips', 
                  '//www.foxnews.com/contact', '//foxcareers.com/Search/SearchResults?brand=Fox%20News%20Careers', 
                  '/foxaroundtheworld/', 'mailto:adsales@foxnews.com?subject=Advertising%20Inquiry', 
                  '//press.foxnews.com/media-contacts/', '//press.foxnews.com/', '/compliance', 
                  'https://supplierdiversity.foxnews.com/', '//www.foxnews.com/shows/fox-nation', '//shop.foxnews.com', 
                  '/go', '//radio.foxnews.com/', '/alerts/subscribe', '/newsletter-signup/alerts', '//radio.foxnews.com/podcast', 
                  '/apps-products', '//www.foxnews.com', '/terms-of-use', '/privacy-policy', '/donotsell', '/closed-captioning', 
                  '//help.foxnews.com', '/contact', '//www.facebook.com/FoxNews', '//twitter.com/foxnews', '//www.google.com/+FoxNews', 
                  '//www.instagram.com/foxnews', '/about/rss/', '/alerts/subscribe', 'https://flipboard.com/@FoxNews', 
                  '//www.foxnews.com/rss/index.html', '//www.foxnews.com/alerts/subscribe.html', 
                   '/accessibility-statement', 'https://video.foxnews.com/v/', 'https://foxnews.com/elections']

# removing all links that lead to section pages AND other specified junk URLs
homepage_links2 = [x for x in homepage_links2 if "/category" not in x] 
homepage_links2 = [x for x in homepage_links2 if "/v/" not in x] 
homepage_links2 = [x for x in homepage_links2 if x not in junk_links2]

In [20]:
# add https://www.foxnews.com to links that don't have this
# create new list for prepared links
hp_links2 = []

for item in homepage_links2:
    new_item = 'https://www.foxnews.com' + item
    hp_links2.append(new_item)

print(hp_links2)

['https://www.foxnews.com/media/chris-matthews-resignation-from-msnbc-hastened-by-series-of-blunders', 'https://www.foxnews.com/politics/warren-huddles-with-advisers-as-progressive-pressure-for-her-to-drop-out-mounts', 'https://www.foxnews.com/politics/supreme-court-at-apparent-odds-over-key-abortion-case-over-clinic-access-restrictions', 'https://www.foxnews.com/politics/fox-news-poll-sanders-knocks-biden-out-of-first-majority-thinks-trump-wins', 'https://www.foxnews.com/politics/issa-mounts-a-comeback-in-california', 'https://www.foxnews.com/politics/biden-campaign-hits-back-at-sanders-ad-showing-obama-praise', 'https://www.foxnews.com/media/francis-tarlov-joe-biden-trump-clinton', 'https://www.foxnews.com/politics/pence-says-passengers-on-flights-from-italy-and-south-korea-will-be-screened-multiple-times-for-coronavirus', 'https://www.foxnews.com/politics/mike-bloomberg-suspends-presidential-campaign-after-super-tuesday-show', 'https://www.foxnews.com/politics/aoc-aligned-progressiv

In [6]:
# TO DO: check for duplicates to set of existing URLs to make sure we only pull data for new ones
    # import full dataset in
    # remove urls in homepage_links3 that are already included in the full dataset

Get article text and other relevent data from new articles

In [7]:
# create a pandas dataframe where all of the retrieved data will be stored
#article_data2 = pd.DataFrame()

#article_data2['headline'] = []
#article_data2['date_published'] = []
#article_data2['date_modified'] = []
#article_data2['description'] = []
#article_data2['author'] = []
#article_data2['url'] = []

#export empty df to file
#article_data2.to_csv('Fox_Data.csv')

In [18]:
text = []
headline = []
date_published = []
date_modified = []
description = []
author = []
url = []

for link in hp_links2:
    # load the HTML content using requests and save into a variable
    r2_article = requests.get(link)
    article2 = r2_article.content

    # create a soups to allow BeautifulSoup to work
    article_soup2 = BeautifulSoup(article2, 'html.parser')

    # get article text
    article_text = article_soup2.find("body").get_text()
    text.append(text)
    
    # retrieve article specific metadata
    metadata2 = article_soup2.find_all("script")[2].get_text()
    metadata2 = metadata2.split(",")

    for item in metadata2:
        if 'headline' in item:
            headline.append(item)
        elif 'datePublished' in item:
            date_published.append(item)
        elif 'dateModified' in item:
            date_modified.append(item)
        elif 'description' in item:
            description.append(item)
        elif 'name' in item and 'Fox News' not in item:
            author.append(item)
        elif 'mainEntityOfPage' in item:
            url.append(item)

In [19]:
print('text', len(text))
print('headline', len(headline))
print('date_published', len(date_published))
print('date_modified', len(date_modified))
print('description', len(description))
print('author', len(author))
print('url', len(url))

text 22
headline 22
date_published 22
date_modified 22
description 22
author 21
url 22


In [9]:
#article_data2 = pd.DataFrame({'text':text,'title':headline, 'date_published':date_published, 'date_modified':date_modified, 'description':description, 'author':author, 'url':url})

ValueError: arrays must all be same length

In [None]:
# import collection of all responses
all_articles2 = pd.read_csv("Fox_Data.csv", index_col=[0])

# merge new increment of data to the csv
all_articles2 = article_data2.append(article_data2, ignore_index=True)

#export with new increment to save
all_articles.to_csv('Fox_Data.csv')

In [144]:
# another spot the author is located just in case
# test = article_soup3.find_all("meta")[17]
#print(yay)

<meta content="Dana Blanton" data-hid="dc.creator" data-n-head="true" name="dc.creator" scheme="dcterms.creator"/>


### 3. Wall Street Journal - Neutral

### 4. New York Times - Liberal

In [134]:
# load the HTML content using requests and save into a variable
nyt_request = requests.get('https://www.nytimes.com/section/politics')
nyt_homepage = nyt_request.content

In [135]:
# create soup 
nyt_soup = BeautifulSoup(nyt_homepage, 'html.parser')

In [181]:
# homepage URLs
nyt_tags_home = nyt_soup.find_all('h2', class_="css-l2vidh e4e4i5l1")

# archive URLs
nyt_tags_archive = nyt_soup.find_all('div', class_='css-1l4spti')

In [182]:
# setup 
nyt_links = []
nyt_titles = []
nyt_dates = []
nyt_contents = []

In [183]:
# homepage articles
for n in np.arange(0, len(nyt_tags_home)):

    # get article link
    link = nyt_tags_home[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_home[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='css-53u6y8')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    nyt_contents.append(final_article)

In [184]:
# archive articles
for n in np.arange(0, len(nyt_tags_archive)):

    # get article link
    link = nyt_tags_archive[n].find('a')['href']
    link = "https://www.nytimes.com" + link
    nyt_links.append(link)
    
    # get article title
    title = nyt_tags_archive[n].find('a').get_text()
    nyt_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-15]
    nyt_dates.append(date)
        
    # get article content
    body = soup_article.find_all('div', class_='css-53u6y8')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    nyt_contents.append(final_article)

In [186]:
# assembling data
nyt_data = pd.DataFrame.from_dict({
    'publisher': 'new_york_times',
    'date': nyt_dates,
    'link': nyt_links,
    'article_title': nyt_titles,
    'article_text': nyt_contents 
})

In [187]:
# make sure it looks nice
nyt_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,"WEST PALM BEACH, Fla. — President Trump on Fri...","Trump Names Mark Meadows Chief of Staff, Ousti...",2020-03-07,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
1,President Trump claimed again on Friday that a...,"With Test Kits in Short Supply, Health Officia...",2020-03-07,https://www.nytimes.com/2020/03/06/health/test...,new_york_times
2,Bernie Sanders was several takes into a video ...,The Bernie Sanders Personality Test,2020-03-06,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
3,Joseph R. Biden Jr.’s campaign organization in...,Joe Biden Has Had Flimsy Organization. It Hasn...,2020-03-06,https://www.nytimes.com/2020/03/06/us/politics...,new_york_times
4,WASHINGTON — After weeks of conflicting signal...,"Miscommunication, Confusion and Fear Mar White...",2020-03-07,https://www.nytimes.com/2020/03/07/us/politics...,new_york_times
5,"WASHINGTON — Erik Prince, the security contrac...",Erik Prince Recruits Ex-Spies to Help Infiltra...,2020-03-07,https://www.nytimes.com/2020/03/07/us/politics...,new_york_times
6,Lucio Delgado was excited when he went to the ...,"With No Braille Option, a Blind Man Failed His...",2020-03-07,https://www.nytimes.com/2020/03/07/us/citizens...,new_york_times
7,"In just seven days, the Democratic race has ch...",Biden Is Back: This Week in the 2020 RaceJosep...,2020-03-07,https://www.nytimes.com/2020/03/07/us/politics...,new_york_times
8,"MADISON, Wis. — Debt has dogged Brian Michelz ...","A Sanders Voter, Weary of Debt at 29: ‘I Have ...",2020-03-07,https://www.nytimes.com/2020/03/07/us/bernie-s...,new_york_times
9,When Senator Elizabeth Warren dropped out of t...,The Hidden Venmo Economy of Campaign Staffers ...,2020-03-07,https://www.nytimes.com/2020/03/07/us/politics...,new_york_times


In [188]:
# read in old data
old_nyt_data = pd.read_csv('data/nyt_data.csv')

# append new data
nyt_data = old_nyt_data.append(nyt_data).drop_duplicates()

# save new .csv
nyt_data.to_csv("data/nyt_data.csv", index = False)