In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.parser import parse

In [2]:
# load all data sets
# One DataFrame per source CSV; the names on the left are bound in the same
# order as the paths listed below.
breitbart, fox, wt, ap, nbc, nyt, politico, buzzfeed = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/breitbart_data.csv',
        'data/fox_data.csv',
        'data/wt_data.csv',
        'data/ap_data.csv',
        'data/nbc_data.csv',
        'data/nyt_data.csv',
        'data/politico_data.csv',
        'data/buzzfeed_data.csv',
    )
]

#### Make dates comparable

In [3]:
# Keep only the part of each Fox date string that precedes the first 'T'
# (a value shaped like '2018-03-05T14:30:00' would become '2018-03-05'; the
# real format is not visible here -- TODO confirm).
# The vectorised .str accessor replaces the Python-level loop and lets missing
# dates pass through as NaN instead of raising AttributeError on a float.
fox['date'] = fox['date'].str.split('T', n=1).str[0]

In [8]:
# Boilerplate (byline plus "Updated:" label) that appears inside the Washington
# Times date strings and has to be removed before the value can be parsed.
wt_date_noise = (
    ' -\n\t\t\t\n\t\t\t\tAssociated Press\n -    Updated:',
    ' -\n\t\t\t\n\t\t\t\tThe Washington Times\n -    Updated:',
)
for noise in wt_date_noise:
    wt['date'] = [raw.replace(noise, '') for raw in wt['date']]

# dateutil turns what is left of each string into a datetime object.
wt['date'] = list(map(parse, wt['date']))

In [10]:
# Keep only the part of each AP date string that precedes the first 'T'
# (a value shaped like '2018-03-05T14:30:00' would become '2018-03-05'; the
# real format is not visible here -- TODO confirm).
# The vectorised .str accessor replaces the Python-level loop and lets missing
# dates pass through as NaN instead of raising AttributeError on a float.
ap['date'] = ap['date'].str.split('T', n=1).str[0]

In [11]:
# Work on a copy of the NBC frame with every row that has a missing value in
# any column removed, then parse the remaining date strings with dateutil.
nbc = nbc.copy().dropna()
nbc['date'] = list(map(parse, nbc['date']))

In [12]:
def _buzzfeed_date_text(raw):
    """Return the parseable part of a raw BuzzFeed date string.

    Removes the 'Posted on ' / 'Last updated on ' labels and surrounding
    whitespace, then keeps only the first two comma-separated pieces, joined
    without a separator.
    """
    unlabelled = raw.replace('Posted on ', '').replace('Last updated on ', '')
    leading_parts = unlabelled.strip().split(',')[0:2]
    return ''.join(leading_parts)


# Clean every BuzzFeed date string and parse it into a datetime in one pass.
buzzfeed['date'] = [parse(_buzzfeed_date_text(raw)) for raw in buzzfeed['date']]

#### Merge

In [144]:
# Stack the eight source frames on top of one another, in this order.
# pd.concat is called with its defaults, so each frame keeps its own row labels.
source_frames = [breitbart, fox, wt, ap, nbc, nyt, politico, buzzfeed]
full_data = pd.concat(source_frames)

#### Articles to sentences

In [145]:
# create article id #
# The concat above keeps every source frame's own row labels, so the index of
# full_data restarts for each source and is not unique. Those labels are
# discarded first so that article_id is a unique 0..N-1 number per article.
# That numbering is also what the sentence split below keys on (it uses the
# row labels of full_data), so the later merge on article_id pairs each
# sentence with the article it came from.
full_data = (
    full_data
    .drop(columns='article_id', errors='ignore')  # allows the cell to be re-run
    .reset_index(drop=True)
    .rename_axis('article_id')
    .reset_index()
)

In [146]:
# split article text to sentences
# expand=True makes str.split return one column per '.'-separated piece
# directly. This replaces the row-by-row `.apply(pd.Series, 1)`, whose
# positional `1` was not an axis: Series.apply binds it to `convert_dtype`.
# stack() then gives one row per (row label in full_data, piece position).
# The explicit dropna() removes the padding added for shorter articles on
# pandas versions whose stack() keeps missing values.
sentences = (
    full_data['article_text']
    .str.split('.', expand=True)
    .stack()
    .dropna()
)

In [147]:
# add correct article id # to each sentence
# `sentences` is indexed by (row label of the article in full_data, position
# of the sentence within that article). droplevel returns a new object rather
# than modifying in place, so it must be part of the chain to have an effect.
# The position level is dropped, the remaining level is named article_id, and
# the values are named article_text; reset_index then yields a two-column
# frame: article_id, article_text.
sentences = (
    sentences
    .droplevel(-1)
    .rename_axis('article_id')
    .rename('article_text')
    .reset_index()
)

In [148]:
# create new data set without original paragraph-form article text
# drop() returns a new frame, so full_data itself keeps its article_text column.
sentence_data = full_data.drop(columns='article_text')

In [149]:
# merge sentence article text
# Left join: every row of sentence_data is kept and repeated once per matching
# row of `sentences`; rows without a match get a missing article_text.
sentence_data = pd.merge(sentence_data, sentences, on='article_id', how='left')

In [150]:
# clean up
# Discard every row whose article_text, measured after conversion with
# astype(str), is shorter than 15 characters; all other rows are kept as-is.
final_sentence_data = sentence_data.copy()
text_length = final_sentence_data['article_text'].astype(str).str.len()
mask = text_length < 15
final_sentence_data = final_sentence_data.loc[~mask]