In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.parser import parse
import warnings

warnings.simplefilter(action='ignore')

In [2]:
# Load one CSV per outlet. The order of the paths below matches the order of
# the names on the left-hand side of the unpacking assignment.
source_paths = (
    'data/breitbart_data.csv',
    'data/fox_data.csv',
    'data/wt_data.csv',
    'data/ap_data.csv',
    'data/nbc_data.csv',
    'data/nyt_data.csv',
    'data/politico_data.csv',
    'data/buzzfeed_data.csv',
)
breitbart, fox, wt, ap, nbc, nyt, politico, buzzfeed = (
    pd.read_csv(csv_path) for csv_path in source_paths
)

#### Make dates comparable

In [3]:
# Keep only the part of each date string that comes before the first 'T'.
fox['date'] = fox['date'].map(lambda stamp: stamp.split('T')[0])

In [4]:
# Byline fragments that are stripped out of the raw `date` field before parsing.
byline_fragments = (
    ' -\n\t\t\t\n\t\t\t\tAssociated Press\n -    Updated:',
    ' -\n\t\t\t\n\t\t\t\tThe Washington Times\n -    Updated:',
)


def _strip_bylines(raw_date):
    """Return `raw_date` with every known byline fragment removed."""
    for fragment in byline_fragments:
        raw_date = raw_date.replace(fragment, '')
    return raw_date


# Clean each value, then let dateutil interpret what is left.
wt['date'] = [parse(_strip_bylines(raw_date)) for raw_date in wt['date']]

In [5]:
# Keep only the part of each date string that comes before the first 'T'.
ap['date'] = [stamp.partition('T')[0] for stamp in ap['date']]

In [6]:
# Work on a copy and discard every row that has a missing value in any column.
nbc = nbc.copy().dropna(axis=0, how='any')
# Parse the remaining date strings with dateutil.
nbc['date'] = list(map(parse, nbc['date']))

In [7]:
def _buzzfeed_date_text(raw_date):
    """Reduce one raw BuzzFeed date string to the text handed to the parser.

    Removes the 'Posted on ' / 'Last updated on ' labels, trims surrounding
    whitespace, keeps only the first two comma-separated pieces and glues
    them back together without the comma.
    """
    unlabeled = raw_date.replace('Posted on ', '').replace('Last updated on ', '')
    leading_pieces = unlabeled.strip().split(',')[0:2]
    return ''.join(leading_pieces)


buzzfeed['date'] = [parse(_buzzfeed_date_text(raw_date)) for raw_date in buzzfeed['date']]

#### Merge

In [8]:
# Stack the per-outlet frames on top of each other, in this order.
outlet_frames = [breitbart, fox, wt, ap, nbc, nyt, politico, buzzfeed]
full_data = pd.concat(outlet_frames)

#### Identify Candidates

In [22]:
# Name fragments searched for in the article text. Both 'Bernie' and 'Sanders'
# are listed so that either form of the name is picked up.
candidates = [
    'Trump',
    'Bernie',
    'Sanders',
    'Biden',
    'Warren',
    'Buttigieg',
    'Bloomberg',
    'Klobuchar',
    'Yang',
    'Steyer',
    'Gabbard',
]

In [10]:
# Add one 0/1 indicator column per name in `candidates`.
for cand in candidates:
    # Literal substring match. na=False makes rows with missing text count as
    # "not mentioned"; without it the mask holds NaN, which np.where treats as
    # truthy and would flag the row for every candidate.
    mentions_cand = full_data['article_text'].str.contains(cand, regex=False, na=False)
    # Use the numpy module imported at the top; the `pd.np` alias is gone in pandas 2.0.
    full_data[cand] = np.where(mentions_cand, 1, 0)

In [11]:
# A 'Bernie' hit counts as a 'Sanders' hit; otherwise keep the existing flag.
# (np.where via the top-level numpy import — the `pd.np` alias is gone in pandas 2.0.)
full_data['Sanders'] = np.where(full_data['Bernie'] == 1, 1, full_data['Sanders'])
# The helper column is no longer needed once it has been folded into 'Sanders'.
full_data = full_data.drop(columns='Bernie')

In [23]:
# remove 'Bernie' from candidates list before summing
# Rebuilding the list (instead of list.remove) keeps this cell safe to re-run:
# a second run finds nothing to drop rather than raising ValueError.
candidates = [name for name in candidates if name != 'Bernie']

In [25]:
# Row-wise total of the per-candidate 0/1 flags.
mention_counts = full_data[candidates].sum(axis='columns')
full_data['candidates_mentioned'] = mention_counts
# Keep only the articles that mention at least one tracked candidate.
full_data = full_data.loc[full_data['candidates_mentioned'].ne(0)]

#### Save

In [27]:
# read in old data
old_data = pd.read_csv('data/full_data.csv')
num_old = len(old_data)

# append new data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): `old_data` comes back from CSV with `date` as text, while some
# new rows carry values produced by dateutil's parse(); such rows may not be
# recognised as duplicates of their saved copies — confirm.
full_data = pd.concat([old_data, full_data]).drop_duplicates()

# save new .csv
full_data.to_csv("data/full_data.csv", index = False)
num_now = len(full_data)

# report how much the stored data set grew
print("number of entries in old data: {}".format(num_old))
print("total number of entries in new data: {}".format(num_now))
print("difference: {}".format(num_now - num_old))

total number of entries in new data: 1784


#### Articles to sentences

In [28]:
# Build the article-level frame and number the articles 0..n-1 by position.
article_columns = ['article_text', 'article_title', 'date', 'link', 'publisher']
data_for_sentences = (
    full_data[article_columns]
    .copy()
    .reset_index(drop=True)   # discard the old row labels
    .reset_index()            # expose the fresh 0..n-1 labels as a column
    .rename(columns={'index': 'article_id'})
)

In [29]:
# split article text to sentences
# expand=True spreads each article's pieces across columns in one vectorized
# step (replacing a per-row pd.Series construction); stack() then turns that
# into one entry per (article row, piece position).
sentences = (
    data_for_sentences['article_text']
    .str.split('.', expand=True)
    .stack()
    .dropna()   # make sure padding cells of shorter articles never survive
)

In [30]:
# add correct article id # to each sentence
# The outer index level is the article's row label and the inner level is the
# piece position within that article. Name both levels, turn them into
# columns, and keep only the article id. (The former bare
# `sentences.index.droplevel(-1)` call discarded its result and did nothing.)
sentences = (
    sentences
    .rename('article_text')
    .rename_axis(['article_id', 'sentence_position'])
    .reset_index()
    .drop(columns='sentence_position')
)

In [31]:
# The full-article text is dropped here; only the other article columns remain.
data_for_sentences = data_for_sentences.drop('article_text', axis='columns')

In [32]:
# Attach every sentence to its article's metadata; the left join keeps
# articles even when they have no matching sentence rows.
sentence_data = pd.merge(
    data_for_sentences,
    sentences,
    on='article_id',
    how='left',
)

In [33]:
# Discard fragments whose text, rendered as a string, is shorter than the
# minimum length (this also removes rows with no sentence text at all).
min_sentence_chars = 15
text_length = sentence_data['article_text'].astype(str).str.len()
sentence_data = sentence_data[text_length >= min_sentence_chars]

#### Identify Candidates in Sentences

In [36]:
# Name fragments searched for in each sentence. Both 'Bernie' and 'Sanders'
# are listed so that either form of the name is picked up.
candidates_sent = [
    'Trump',
    'Bernie',
    'Sanders',
    'Biden',
    'Warren',
    'Buttigieg',
    'Bloomberg',
    'Klobuchar',
    'Yang',
    'Steyer',
    'Gabbard',
]

In [37]:
# Add one 0/1 indicator column per name in `candidates_sent`.
for cand in candidates_sent:
    # Literal substring match. na=False makes rows with missing text count as
    # "not mentioned"; without it the mask holds NaN, which np.where treats as
    # truthy and would flag the row for every candidate.
    mentions_cand = sentence_data['article_text'].str.contains(cand, regex=False, na=False)
    # Use the numpy module imported at the top; the `pd.np` alias is gone in pandas 2.0.
    sentence_data[cand] = np.where(mentions_cand, 1, 0)

In [39]:
# A 'Bernie' hit counts as a 'Sanders' hit; otherwise keep the existing flag.
# (np.where via the top-level numpy import — the `pd.np` alias is gone in pandas 2.0.)
sentence_data['Sanders'] = np.where(sentence_data['Bernie'] == 1, 1, sentence_data['Sanders'])
# The helper column is no longer needed once it has been folded into 'Sanders'.
sentence_data = sentence_data.drop(columns='Bernie')

In [44]:
# remove 'Bernie' from candidates list to create sum
# Rebuilding the list (instead of list.remove) keeps this cell safe to re-run:
# a second run finds nothing to drop rather than raising ValueError.
candidates_sent = [name for name in candidates_sent if name != 'Bernie']

In [45]:
# Row-wise total of the per-candidate 0/1 flags.
sentence_mention_counts = sentence_data[candidates_sent].sum(axis='columns')
sentence_data['candidates_mentioned'] = sentence_mention_counts
# Keep only the sentences that mention at least one tracked candidate.
sentence_data = sentence_data.loc[sentence_data['candidates_mentioned'].ne(0)]

#### Save Sentence Data

In [47]:
# read in old data
old_sentence_data = pd.read_csv('data/sentence_data.csv')
num_sentence_old = len(old_sentence_data)

# append new data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): rows loaded back from CSV hold `date` as text; if the new rows
# hold a different type for the same value they may not be recognised as
# duplicates — confirm.
sentence_data = pd.concat([old_sentence_data, sentence_data]).drop_duplicates()

# save new .csv
sentence_data.to_csv("data/sentence_data.csv", index = False)
num_sentence_now = len(sentence_data)

# report how much the stored data set grew
print("number of entries in old data: {}".format(num_sentence_old))
print("total number of entries in new data: {}".format(num_sentence_now))
print("difference: {}".format(num_sentence_now - num_sentence_old))

total number of entries in new data: 18462
