## 1.0 Atlantic Scraper

This notebook contains a scraper for online news articles from The Atlantic. It scrapes articles from newest to oldest within a given page range, omitting any previously scraped articles.

In [None]:
import os
import requests
import pickle as pkl
import time
from datetime import date
from datetime import datetime

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
def get_date(my_article):
    """Return the publication date of an article entry from a listing page.

    Parameters
    ----------
    my_article : object exposing ``find`` (e.g. a BeautifulSoup tag)
        The listing entry; the text of its first ``<time>`` element is read.

    Returns
    -------
    datetime.date, datetime.datetime or str
        * ``date.today()`` when the time text contains ``'ET'``
          (presumably a clock time shown for articles published today --
          TODO confirm against the live page);
        * a ``datetime`` parsed from text such as ``'March 5, 2019'``;
        * ``'?'`` when there is no ``<time>`` element or its text is not in
          the ``'%B %d, %Y'`` format.
    """
    time_tag = my_article.find('time')
    # find() returns None when the element is missing; reading .text on it
    # would raise AttributeError, so fall back to the sentinel instead.
    if time_tag is None or time_tag.text is None:
        return '?'

    soup_date = time_tag.text
    if 'ET' in soup_date:
        return date.today()

    date_string = soup_date.replace('\n', '').strip()
    try:
        return datetime.strptime(date_string, '%B %d, %Y')
    except ValueError:
        # Text that is not a full "Month day, year" date cannot be parsed.
        return '?'

In [None]:
def get_body(my_soup):
    """Concatenate the text of the numbered article sections of a page.

    Sections are looked up by id ``article-section-<n>``, starting at 0.
    A missing section 0 is tolerated (numbering may start at 1); after that,
    the first missing section ends the scan.  Returns '' if nothing is found.
    """
    pieces = []
    section_no = 0

    while True:
        section = my_soup.find('section', {'id': 'article-section-' + str(section_no)})
        # Stop at the first gap, except for a gap at index 0.
        if section is None and section_no != 0:
            break
        if section is not None:
            pieces.append(section.text)
        section_no += 1

    return ''.join(pieces)

In [None]:
def get_category(my_soup):
    """Return the stripped rubric text of an article page, or '?' if absent.

    The link with class 'c-rubric__link' is preferred; a link with class
    'rubric' is used as the fallback.
    """
    for css_class in ('c-rubric__link', 'rubric'):
        rubric_link = my_soup.find('a', {'class': css_class})
        if rubric_link is not None:
            return rubric_link.text.strip()
    return '?'

In [None]:
def file_name(my_date):
    """Build the base file name (no extension) used for a given date.

    The date is rendered as 'Month dd, YYYY' and its three whitespace-separated
    parts are joined with underscores, e.g. 'March_05,_2019' (the comma after
    the day is kept).
    """
    parts = datetime.strftime(my_date, '%B %d, %Y').split()
    return '{}_{}_{}'.format(*parts[:3])

Using all the functions above to scrape the articles:

In [None]:
# --- Scraper configuration ---------------------------------------------------
BASE_URL = 'https://www.theatlantic.com'
RAW_DATA_DIR = 'data/raw'   # one pickle per publication date is written here
FIRST_PAGE = 1              # first page of the "Latest" listing to scrape
LAST_PAGE = 99              # last page to scrape (inclusive)
REQUEST_TIMEOUT = 30        # seconds; without it a stalled request blocks forever


def _empty_data_dict():
    """Return a fresh container with one empty list per scraped field."""
    return {'date': [], 'category': [], 'title': [], 'subtitle': [],
            'author': [], 'author_bio': [], 'text': [], 'url': [], 'time_scraped': []}


def _save_data_dict(my_data_dict):
    """Pickle a batch of articles to a file named after the batch's date.

    Does nothing for an empty batch. Creates the output directory if needed.
    """
    if not my_data_dict['date']:
        return
    os.makedirs(RAW_DATA_DIR, exist_ok=True)
    pkl_name = file_name(my_data_dict['date'][-1])
    pkl_filepath = '{}/{}.pkl'.format(RAW_DATA_DIR, pkl_name)
    with open(pkl_filepath, 'wb') as fp:
        pkl.dump(my_data_dict, fp)


data_dict = _empty_data_dict()

for i in range(FIRST_PAGE, LAST_PAGE + 1):
    # a page of "Latest": links to articles
    page_url = '{}/latest/?page={}'.format(BASE_URL, i)
    page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')

    articles = soup.find_all('li', {'class': 'article blog-article'})

    for article in articles:

        article_date = get_date(article)
        a_element = article.find('a')

        # An entry without a usable date cannot be mapped to a file, and one
        # without a link has no article page to fetch: skip it explicitly.
        if (not isinstance(article_date, date)
                or a_element is None
                or not a_element.get('href')):
            print('skipping entry without date or link on page {}'.format(i))
            continue

        filepath = '{}/{}.pkl'.format(RAW_DATA_DIR, file_name(article_date))

        # check if file exists already (this date was scraped previously)
        if os.path.isfile(filepath):
            continue
        print('scraping from page {}'.format(i))

        # a new date starts: write out the batch collected for the previous one
        if data_dict['date'] and article_date != data_dict['date'][-1]:
            _save_data_dict(data_dict)
            data_dict = _empty_data_dict()

        # scraping: fields available on the listing entry itself
        url = '{}{}'.format(BASE_URL, a_element.get('href'))

        byline = article.find('li', {'class': 'byline'})
        author = byline.text if byline is not None else '?'

        headline = article.find('h2')
        title = headline.text.replace('\n', '').strip() if headline is not None else '?'

        dek = article.find('p', {'class': 'dek has-dek'})
        subtitle = dek.text.replace('\n', '').replace('  ', '') if dek is not None else '?'

        time.sleep(np.random.random() + 3)

        # going to the page for the article itself
        artic_page = requests.get(url, timeout=REQUEST_TIMEOUT)
        artic_soup = BeautifulSoup(artic_page.content, 'html.parser')

        category = get_category(artic_soup)
        body = get_body(artic_soup)

        bio_element = artic_soup.find('div', {'class': 'c-article-writer__bio'})
        author_bio = bio_element.text.strip() if bio_element is not None else '?'

        # Append all fields in one go, only after every step succeeded, so the
        # lists in data_dict always stay the same length even if a request or
        # parse above raises part-way through an article.
        record = {'date': article_date, 'category': category, 'title': title,
                  'subtitle': subtitle, 'author': author, 'author_bio': author_bio,
                  'text': body, 'url': url, 'time_scraped': datetime.now()}
        for field, value in record.items():
            data_dict[field].append(value)

        time.sleep(np.random.random() + 3)

    time.sleep(np.random.random() + 5)

# Batches are otherwise only written when a later article with a different
# date is scraped, so the batch for the last scraped date is still in memory
# here (end of page range, or every remaining date was already on disk).
# NOTE(review): if the page range ends mid-day this file may hold only part of
# that day's articles, and later runs will skip the date -- confirm acceptable.
_save_data_dict(data_dict)

The data scraped using this notebook will be used and analyzed in the following notebooks in this repo: 2.0_atlantic_eda_cleaning.ipynb, 3.0_atlantic_sentiment_analysis.ipynb