# Web Scraper Tool for 5 US Media Outlets

In [3]:
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd

### 1. Breitbart - Very Conservative

In [3]:
# load the HTML content using requests and save into a variable
breitbart_request = requests.get('https://www.breitbart.com/politics/')
breitbart_homepage = breitbart_request.content

In [4]:
# create soup 
breitbart_soup = BeautifulSoup(breitbart_homepage, 'html.parser')

In [5]:
# locate article URLs
breitbart_tags = breitbart_soup.find_all('h2')

In [6]:
number_of_articles = 30

# get article titles, content, and links
breitbart_links = []
breitbart_titles = []
breitbart_dates = []
breitbart_contents = []

for n in np.arange(0, number_of_articles):

    # get article link
    link = breitbart_tags[n].find('a')['href']
    link = "https://www.breitbart.com" + link
    breitbart_links.append(link)
    
    # get article title
    title = breitbart_tags[n].find('a').get_text()
    breitbart_titles.append(title)
    
    # prep article content
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    
    # get publication datetime
    date = soup_article.time.attrs['datetime']
    date = date[:-10]
    breitbart_dates.append(date)
    
    # get article content
    body = soup_article.find_all('div', class_='entry-content')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    breitbart_contents.append(final_article)

In [7]:
# assembling data
breitbart_data = pd.DataFrame.from_dict({
    'publisher': 'Breitbart',
    'date': breitbart_dates,
    'link': breitbart_links,
    'article_title': breitbart_titles,
    'article_text': breitbart_contents 
})

In [35]:
# make sure it looks nice
breitbart_data.head()

Unnamed: 0,article_text,article_title,date,link,publisher
0,A group of Conservative party grandees will se...,Tories Rebel over Huawei: As ‘Ridiculous’ as G...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/to...,Breitbart
1,Greece has accused the Turkish government of s...,Greece Denounces ‘Fake News’ as Turkey Claims ...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/gr...,Breitbart
2,Dramatic pictures and video are emerging of fi...,PICTURES: Greek Border in Flames as Migrants K...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/pi...,Breitbart
3,Finland’s millennial feminist-led government f...,Finland’s Millennial Feminist Govt to Help Gre...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/fi...,Breitbart
4,A total of 76 per cent of the Greek public sup...,Nearly 8 in 10 Greeks Support Government’s Bor...,2020-03-07,https://www.breitbart.com/europe/2020/03/07/ne...,Breitbart


In [15]:
# read in old data
old_breitbart_data = pd.read_csv('data/breitbart_data.csv')

# append new data
breitbart_data = old_breitbart_data.append(breitbart_data).drop_duplicates()

# save new .csv
breitbart_data.to_csv("data/breitbart_data.csv", index = False)

### 2. Fox - Conservative

Get links from the Fox Politics homepage

In [4]:
# load the HTML content using requests and save into a variable
fox_requests = requests.get('https://www.foxnews.com/politics')
fox_homepage = fox_requests.content

In [5]:
# create a soup to allow BeautifulSoup to work
fox_soup = BeautifulSoup(fox_homepage, 'html.parser')

In [6]:
# locate article links
fox_tags = fox_soup.find_all('article')

In [7]:
number_of_articles = 30

fox_links = []

# get homepage article links
for n in np.arange(0, number_of_articles):
    link = fox_tags[n].find('a')
    link = link.get('href')
    link = "https://foxnews.com" + link
    fox_links.append(link)
    fox_links = [x for x in fox_links if "/v/" not in x]

In [8]:
fox_text = []
fox_titles = []
fox_dates = []

# prep for article content
for link in fox_links:
    fox_article_request = requests.get(link)
    fox_article = fox_article_request.content
    fox_article_soup = BeautifulSoup(fox_article, 'html.parser')
    
    # get article metadata
    fox_metadata = fox_article_soup.find_all('script')[2].get_text()
    fox_metadata = fox_metadata.split(",")
    
    for item in fox_metadata:

        # get article title
        if 'headline' in item:
            item = item.replace('\n',"")
            item = item.replace('headline', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_titles.append(item)
        
        # get article date
        elif 'datePublished' in item:
            item = item.replace('\n',"")
            item = item.replace('datePublished', "")
            item = item.replace(':', "")
            item = item.replace('"', '')
            fox_dates.append(item)
    
    # get article text
    body = fox_article_soup.find_all('div')
    x = body[0].find_all('p')
    
    # combine paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        paragraph = paragraph.replace('\n',"")
        list_paragraphs.append(paragraph)
        
        # removing copyright info and newsletter junk from the article
        final_article = " ".join(list_paragraphs)
        final_article = final_article.replace("This material may not be published, broadcast, rewritten, or redistributed. ©2020 FOX News Network, LLC. All rights reserved. All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("This material may not be published, broadcast, rewritten,", " ")
        final_article = final_article.replace("or redistributed. ©2020 FOX News Network, LLC. All rights reserved.", " ")
        final_article = final_article.replace("All market data delayed 20 minutes.", " ")
        final_article = final_article.replace("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!", " ")
    fox_text.append(final_article)

In [9]:
# join fox data
fox_data = pd.DataFrame.from_dict({
    'publisher': 'Fox',
    'date': fox_dates,
    'link': fox_links,
    'article_title': fox_titles,
    'article_text': fox_text 
})

fox_data.head()

Unnamed: 0,publisher,date,link,article_title,article_text
0,Fox,2020-03-07T154057-0500,https://foxnews.com/politics/pence-coronavirus...,Pence calls for 'whole of America' ap...,VP Pence meets with ...
1,Fox,2020-03-06T211054-0500,https://foxnews.com/politics/house-prepares-te...,House prepares for telework scenarios...,Fox News Flash top h...
2,Fox,2020-03-07T131303-0500,https://foxnews.com/politics/sanders-biden-ent...,Sanders says Biden can’t ‘generate en...,Former Vice Presiden...
3,Fox,2020-03-07T154057-0500,https://foxnews.com/politics/pence-coronavirus...,Pence calls for 'whole of America' ap...,VP Pence meets with ...
4,Fox,2020-03-07T131303-0500,https://foxnews.com/politics/sanders-biden-ent...,Sanders says Biden can’t ‘generate en...,Former Vice Presiden...


In [12]:
fox_data.to_csv("data/fox_data.csv", index = False)

In [208]:
# read in old data
old_fox_data = pd.read_csv('data/fox_data.csv')

# append new data
fox_data = old_fox_data.append(fox_data).drop_duplicates()

# save new .csv
fox_data.to_csv("data/fox_data.csv", index = False)

### 3. Wall Street Journal - Neutral