In [87]:
import os
import datetime
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import mediacloud.api
import pandas as pd
from bs4 import BeautifulSoup

from tqdm import tqdm
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; API_KEY is
# read from the environment further down when the Media Cloud client is built.
load_dotenv()


True

## Data specifications

The BuzzFeed layoffs were [announced](https://techcrunch.com/2019/01/23/buzzfeed-layoffs-2019/) on 2019/01/23. They [started](https://slate.com/technology/2019/01/buzzfeeds-layoffs-wont-kill-it-but-they-have-changed-it.html) on 2019/01/25 and continued into the beginning of the following week, although many of the cuts fell outside the BuzzFeed News division.

Several considerations:

* The staggered nature of the layoffs could affect our ability to get a clean before/after break.
* We'll have to parse out which layoffs occurred on the news teams.
* Since the layoffs straddled a weekend, we'll want to normalize output to account for fluctuations in publishing volume by day of week.

### Date range
Let's start by getting the dates from two weeks before to two weeks after the layoffs were announced.

In [80]:
def daterange(date, padding, service=None):
    """Get the window of dates spanning `padding` days either side of a date.

    Parameters
    ----------
    date : str
        Centre of the window, formatted as 'YYYY-MM-DD'.
    padding : int
        Number of days to include before and after `date`.
    service : {'GDELT', 'MC'}
        Selects the return format (see Returns).

    Returns
    -------
    list of int
        For 'GDELT': every day in the window, both ends included, encoded
        as a YYYYMMDD integer (e.g. 20190123).
    tuple of datetime.datetime
        For 'MC': the `(start, end)` bounds of the window.

    Raises
    ------
    ValueError
        If `date` is not in 'YYYY-MM-DD' format, or if `service` is not one
        of the supported values. Previously an unsupported service fell
        through and returned None silently.
    """
    center = datetime.datetime.strptime(date, '%Y-%m-%d')
    pad = datetime.timedelta(days=padding)
    start, end = center - pad, center + pad

    if service == 'MC':
        return (start, end)

    if service == 'GDELT':
        # +1 so that the end date itself is part of the window.
        n_days = (end - start).days + 1
        days = (start + datetime.timedelta(days=offset) for offset in range(n_days))
        return [10000 * day.year + 100 * day.month + day.day for day in days]

    raise ValueError(
        "service must be 'GDELT' or 'MC', got {!r}".format(service)
    )

## Getting data from GDELT

In [40]:
# Base URL of the GDELT event exports; per-day archive names are appended to it below.
GDELT_URL = 'http://data.gdeltproject.org/events/'

In [25]:
# Every day from 14 days before to 14 days after the 2019-01-23 announcement,
# as YYYYMMDD integers (the 'GDELT' return format of daterange).
dates = daterange('2019-01-23', 14, 'GDELT')

In [31]:
# One archive name per day in the window, then the full download URL for each.
uris = ['{}.export.CSV.zip'.format(day) for day in dates]
urls = [GDELT_URL + uri for uri in uris]

In [65]:
# Download each daily GDELT archive and keep only the rows whose URL column
# points at BuzzFeed. One filtered frame per day is collected in df_all.
df_all = []

for url in tqdm(urls):
    # Read the whole archive into memory and release the HTTP connection
    # straight away instead of leaving it open until garbage collection.
    with urlopen(url) as response:
        archive_bytes = BytesIO(response.read())

    with ZipFile(archive_bytes) as archive:
        # Only the first member of the archive is read.
        filename = archive.namelist()[0]
        with archive.open(filename) as member:
            df = pd.read_csv(member, sep='\t', header=None)

    # Column 57 is treated as the article URL (presumably GDELT's source URL
    # field; TODO confirm against the GDELT column spec).
    # regex=False: match the literal text 'www.buzzfeed' ('.' is not a wildcard).
    # na=False: rows with a missing URL count as non-matches instead of
    # putting NaN into the boolean mask, which would make the indexing fail.
    is_buzzfeed = df[57].str.contains('www.buzzfeed', regex=False, na=False)
    bf_links = df.loc[is_buzzfeed, 57]

    # 'date' carries the archive member name, which identifies the day.
    df_filtered = pd.DataFrame({'urls': bf_links, 'date': filename})
    df_all.append(df_filtered)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:38<00:00,  3.41s/it]


In [73]:
# Stack the per-day frames into a single table and write it to disk.
gdelt_links = pd.concat(df_all)
gdelt_links.to_csv('../data/GDELT_29days.csv')

## Getting data from Media Cloud

[API documentation](https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#grab-all-stories-in-the-new-york-times-during-october-2012)

Media Cloud doesn't seem to actually track BuzzFeed News stories :/

In [92]:
# The Media Cloud key comes from the environment so it is never stored in the notebook.
api_key = os.getenv("API_KEY")
if not api_key:
    # Fail here with a clear message rather than building a client with no
    # key and hitting a less obvious error on the first API call.
    raise RuntimeError(
        "API_KEY is not set; define it in the environment or in a .env file"
    )
mc = mediacloud.api.MediaCloud(api_key)

In [91]:
# (start, end) datetimes 14 days either side of the 2019-01-23 announcement.
# NOTE(review): this rebinds `dates`, which earlier in the notebook held the
# list of YYYYMMDD integers used for GDELT.
dates = daterange('2019-01-23', 14, 'MC')

In [96]:
# Ask Media Cloud for stories matching the query 'media_id:6218'.
# NOTE(review): media_id 6218 is presumably the BuzzFeed source — confirm.
# NOTE(review): the `dates` window computed above is not passed to this call,
# so the query is not restricted to that window here — confirm this is intended.
stories = mc.storyList('media_id:6218')