In [10]:
import os
import datetime
import json
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import requests
import mediacloud.api
import pandas as pd
from bs4 import BeautifulSoup
from newsapi import NewsApiClient

from tqdm import tqdm
from dotenv import load_dotenv

# Load environment variables for the API keys read via os.getenv further down
# (presumably from a local .env file — confirm it exists before running).
load_dotenv()


True

## Data specifications

The BuzzFeed layoffs were [announced](https://techcrunch.com/2019/01/23/buzzfeed-layoffs-2019/) on 2019/01/23. They [started](https://slate.com/technology/2019/01/buzzfeeds-layoffs-wont-kill-it-but-they-have-changed-it.html) on 2019/01/25 and continued into the beginning of the following week, although many of the cuts did not target the BuzzFeed News division.

Several considerations:

* The staggered nature of the layoffs could affect our ability to get a clean before/after break.
* We'll have to parse out which layoffs occurred on the news teams.
* Since the layoffs straddled a weekend, we'll want to normalize output to account for fluctuations in publishing volume by day of week.

### Date range
Let's start by getting the two weeks before and the two weeks after the day the layoffs were announced (29 days in total).

In [3]:
def daterange(date, padding, service=None):
    """Build the date window centred on ``date`` in the format a service expects.

    Parameters
    ----------
    date : str
        Centre of the window, formatted ``YYYY-MM-DD``.
    padding : int
        Number of days to include on each side of ``date``.
    service : str, optional
        Selects the return format:

        * ``'GDELT'``   -- list of ints ``YYYYMMDD``, one per day in the window
        * ``'MC'``      -- ``(start, end)`` as ``datetime.datetime`` objects
        * ``'NEWS'``    -- ``(start, end)`` as ``'YYYY-MM-DD'`` strings
        * ``'ARCHIVE'`` -- ``(start, end)`` as ints ``YYYYMMDD``

    Returns
    -------
    list, tuple or None
        ``None`` when ``service`` is not one of the values listed above.
    """
    centre = datetime.datetime.strptime(date, '%Y-%m-%d')
    window = datetime.timedelta(days=padding)
    start, end = centre - window, centre + window

    def as_int(day):
        # 2019-01-23 -> 20190123
        return day.year * 10000 + day.month * 100 + day.day

    # Both endpoints are included, hence the +1.
    n_days = (end - start).days + 1
    day_keys = [as_int(start + datetime.timedelta(days=offset)) for offset in range(n_days)]

    if service == 'GDELT':
        return day_keys
    if service == 'MC':
        return (start, end)
    if service == 'NEWS':
        return (start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'))
    if service == 'ARCHIVE':
        return (day_keys[0], day_keys[-1])
    return None

## Getting data from GDELT

In [40]:
# Base URL that the daily `<YYYYMMDD>.export.CSV.zip` file names are appended to
GDELT_URL = 'http://data.gdeltproject.org/events/'

In [25]:
# One integer YYYYMMDD key per day, 14 days either side of the announcement
dates = daterange(date='2019-01-23', padding=14, service='GDELT')

In [31]:
# File name of each day's export archive, then its full download URL
uris = [str(day) + '.export.CSV.zip' for day in dates]
urls = [GDELT_URL + uri for uri in uris]

In [65]:
# Collect, for every daily export, the rows whose column 57 contains a
# BuzzFeed URL. One small frame per day is appended to `df_all`.
df_all = []

for i in tqdm(urls):
    # Context managers release the HTTP response, the archive and the member
    # file handle as soon as the day's table has been parsed.
    with urlopen(i) as page:
        payload = BytesIO(page.read())
    with ZipFile(payload) as zipfile:
        # Only the first member of the archive is read.
        filename = zipfile.namelist()[0]
        with zipfile.open(filename) as member:
            df = pd.read_csv(member, sep='\t', header=None)
    # regex=False: match 'www.buzzfeed' literally ('.' would otherwise match
    # any character). na=False: rows with a missing value in column 57 are
    # treated as non-matches instead of breaking the boolean mask.
    is_buzzfeed = df[57].str.contains('www.buzzfeed', regex=False, na=False)
    bf_links = df.loc[is_buzzfeed, 57]
    # The 'date' column holds the archive member's file name, not a parsed date.
    df_filtered = pd.DataFrame({'urls': bf_links, 'date': filename})
    df_all.append(df_filtered)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:38<00:00,  3.41s/it]


In [73]:
# Stack the per-day frames and save them. The index is written as the first,
# unnamed CSV column (pandas default), so it repeats across days.
gdelt_links = pd.concat(df_all)
gdelt_links.to_csv('../data/GDELT_29days.csv')

## Getting data from Media Cloud

[API documentation](https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#grab-all-stories-in-the-new-york-times-during-october-2012)

Media Cloud doesn't seem to actually track BuzzFeed News stories :/

In [92]:
# The Media Cloud key is read from the environment; it is None when unset
api_key_mc = os.environ.get("API_KEY_MC")
mc = mediacloud.api.MediaCloud(api_key_mc)

In [91]:
# (start, end) as datetime objects, 14 days either side of the announcement
dates = daterange(date='2019-01-23', padding=14, service='MC')

In [96]:
# NOTE(review): the `dates` window computed for Media Cloud is not passed to
# this query, so the result is not restricted to the layoff period — confirm
# whether a publish-date filter was intended.
# Presumably media_id 6218 is the BuzzFeed source in Media Cloud — TODO confirm.
stories = mc.storyList('media_id:6218')

## Getting data from News API
[Python client documentation](https://newsapi.org/docs/client-libraries/python)

Time period too far back for the free plan :/

In [99]:
# The News API key is read from the environment; it is None when unset
api_key_news = os.environ.get("API_KEY_NEWS")
newsapi = NewsApiClient(api_key=api_key_news)

In [107]:
# (start, end) as 'YYYY-MM-DD' strings, 14 days either side of the announcement
dates = daterange(date='2019-01-23', padding=14, service='NEWS')

In [114]:
# Request articles for the buzzfeednews.com domain between the two bounds in
# `dates` (start at index 0, end at index 1).
# NOTE(review): in the recorded run this call raised NewsAPIException because
# the requested window is further back than the plan allows.
all_articles = newsapi.get_everything(domains='buzzfeednews.com',
                                      from_param=dates[0],
                                      to=dates[1])

NewsAPIException: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2019-12-09, but you have requested 2019-01-09. To extend this please upgrade to a paid plan.'}

## Getting data from the Internet Archive

[CDX server API](https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#basic-usage)

[Paginated API](https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#pagination-api)

In [5]:
# (start, end) as integer YYYYMMDD keys, 14 days either side of the announcement
dates = daterange(date='2019-01-23', padding=14, service='ARCHIVE')

In [7]:
# Wayback CDX query for captures under the buzzfeednews.com host, as JSON.
# `from` takes the first day of the window and `to` the last. Both
# placeholders used to be {0}, which limited the query to the first day only.
URL = 'https://web.archive.org/cdx/search/cdx?url=buzzfeednews.com&matchType=host&from={0}&to={1}&output=json'.format(dates[0], dates[1])

In [11]:
# Fetch the query result. raise_for_status() turns an HTTP error response
# into an explicit HTTPError instead of a confusing JSON decoding failure.
response = requests.get(URL)
response.raise_for_status()
results = response.json()

In [14]:
# The first row of the response is kept as the column names; the rows after
# it are read as data below (presumably the CDX header row — confirm).
col_names = results[0]

In [16]:
# Take the third field of every row after the first (header) row
# — presumably the captured URL in the CDX field order; TODO confirm.
urls = [capture[2] for capture in results[1:]]

In [19]:
# Keep only the URLs that contain the substring 'article'
article_urls = [url for url in urls if 'article' in url]

In [28]:
# Substrings that mark captures to drop; they are tested against the full
# URL, query string included.
excluded_fragments = ('/js/', 'x0.25', '/track/', '/v2.9')

# Drop the unwanted captures, strip the query string and de-duplicate.
# split('?')[0] returns the URL unchanged when it has no '?'.
# sorted() gives a stable order: the iteration order of a bare set of strings
# can differ between kernel sessions, which made the saved CSV differ per run.
filtered_article_urls = sorted({
    url.split('?')[0]
    for url in article_urls
    if not any(fragment in url for fragment in excluded_fragments)
})

In [30]:
# Save the de-duplicated article URLs as a single-column CSV (no index column)
archive_links = pd.DataFrame({'urls': filtered_article_urls})
archive_links.to_csv('../data/ARCHIVE_29days.csv', index=False)