1. Make smaller dataset with full shows
2. Run Kmeans clustering
3. Make model based on Kmeans clustering
4. Run model on snippets
5. Run LDA
6. Identify nearby words
7. Do hand-coding
8. Do statistical analysis on added features

In [1]:
import pandas as pd
import pickle
import re
import numpy as np

from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize  
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [2]:
fox_shows = pd.read_csv('../data/raw/search-fox-last-year.csv')
fox_shows.head()

Unnamed: 0,identifier
0,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...
1,FOXNEWSW_20190528_230000_The_Story_With_Martha...
2,FOXNEWSW_20190528_040000_Tucker_Carlson_Tonigh...
3,FOXNEWSW_20190528_080000_Fox_and_Friends_First
4,FOXNEWSW_20190528_160000_Outnumbered


In [3]:
fox_shows, _ = train_test_split(fox_shows, test_size=0.9, random_state=18)
len(fox_shows)

406

In [6]:
fox_df = pd.read_csv('../data/interim/fox-last-year-sent-comb.csv').dropna()
fox_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sentence,start_snip,end_snip,contributor,runtime,start_time,stop_time,identifier,subjects
0,0,0,week.,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
1,1,1,tucker: tune in every night to the sworn enem...,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
2,2,2,have a great memorial day evening. see you tom...,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
3,3,4,"sean: looking to the special edition of ""ha...",0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
4,4,5,let's go to a flashback.,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."


In [7]:
fox_df = fox_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [8]:
# get only fox shows from show-based train_test_split
fox_df = fox_df.set_index('identifier').join(fox_shows.set_index('identifier'), on='identifier', how='inner')
len(fox_df)

254976

In [9]:
fox_df = fox_df.reset_index()
fox_df.head()

Unnamed: 0,identifier,sentence,start_snip,end_snip,contributor,runtime,start_time,stop_time,subjects
0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,harris: it hasn't been a busy hour.,0,60,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr..."
1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,glad you are long.,0,60,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr..."
2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,got to see air force one with the president co...,0,60,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr..."
3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,keep watching fox news. here is dana.,0,60,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr..."
4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,dana: your daily briefing starts now.,0,60,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr..."


In [10]:
fox_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254976 entries, 0 to 254975
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   identifier   254976 non-null  object
 1   sentence     254976 non-null  object
 2   start_snip   254976 non-null  int64 
 3   end_snip     254976 non-null  int64 
 4   contributor  254976 non-null  object
 5   runtime      254976 non-null  object
 6   start_time   254976 non-null  object
 7   stop_time    254976 non-null  object
 8   subjects     254976 non-null  object
dtypes: int64(2), object(7)
memory usage: 17.5+ MB


## 2. Run Kmeans Clustering ##

Run Kmeans clustering on sentences and determine which clusters are ads, which are news, and which are mixed.

In [11]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
                           '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
                           'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
                           'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
                           'line', 'even', 'also', 'may', 'take', 'come', 'hi', 'ha', 'le', 'u', 'wa', 'thi',
                           'to', 'one'])

In [12]:
def clean_sent(sentences):
    for sent in sentences:
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub("([\d,\,\./!#$%&\'\":;>\?@\[\]`)(\+])+", "", sent) # remove digits and remove punctuation
        sent = re.sub("([-])+", " ", sent)
        yield(sent)  

In [13]:
corpus = list(clean_sent(fox_df.sentence.values.tolist()))
corpus[:5]

[' harris it hasnt been a busy hour',
 'glad you are long',
 'got to see air force one with the president coming home from japan',
 'keep watching fox news here is dana',
 ' dana your daily briefing starts now']

In [14]:
#lemmatize before vectorizing

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), strip_accents='unicode', stop_words='english', 
                       min_df=2, max_df=0.3, ngram_range=(2,2))

In [15]:
corpus = list(clean_sent(fox_df.sentence.values.tolist()))
corpus[:5]

[' harris it hasnt been a busy hour',
 'glad you are long',
 'got to see air force one with the president coming home from japan',
 'keep watching fox news here is dana',
 ' dana your daily briefing starts now']

In [16]:
fox_bow = vect.fit_transform(corpus)
vect.get_feature_names()



['* high',
 'TM lens',
 '\\m m',
 'aa battery',
 'aa meeting',
 'aag best',
 'aag introducing',
 'aag leader',
 'aag receive',
 'aag retirement',
 'aag trust',
 'aag working',
 'aags free',
 'aags new',
 'aaron calvin',
 'aaron neville',
 'aaron zebley',
 'aarp advocate',
 'aarp auto',
 'aarp ha',
 'aarp medicare',
 'aarp member',
 'aba evaluation',
 'aba said',
 'abaco grand',
 'abaco island',
 'abandon ally',
 'abandon dangerous',
 'abandon ficial',
 'abandon kurd',
 'abandon kurdish',
 'abandon party',
 'abandon president',
 'abandon pretense',
 'abandon pursuit',
 'abandoned certain',
 'abandoning kurd',
 'abbey president',
 'abbey signing',
 'abbott getting',
 'abbott just',
 'abbott laboratory',
 'abbott releasing',
 'abbott said',
 'abbott signing',
 'abbott technology',
 'abbott test',
 'abbott texas',
 'abby hornacek',
 'abc anchor',
 'abc cbs',
 'abc didnt',
 'abc editorial',
 'abc epstein',
 'abc fake',
 'abc footage',
 'abc good',
 'abc interview',
 'abc nbc',
 'abc news',


In [17]:
kmeans = KMeans(n_clusters=75, random_state=18)
results = kmeans.fit_predict(fox_bow)

In [18]:
#print out most indicative words
terms = vect.get_feature_names()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(75):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print (' %s' % terms[ind])
    print()

Cluster 0:
 youve got
 dad youve
 got allstate
 wayfair shop
 thats youve
 got wayfair
 raising rate
 record liberty
 grudge raising
 rate mistake

Cluster 1:
 join u
 u live
 u tonight
 jillian join
 u phone
 jenkins join
 going join
 griff jenkins
 good morning
 live washington

Cluster 2:
 united state
 joe biden
 white house
 fox news
 dont think
 bernie sander
 good morning
 little bit
 im going
 year old

Cluster 3:
 american people
 think american
 people want
 work american
 people know
 want american
 people deserve
 fair trial
 people need
 people hear

Cluster 4:
 quick break
 going quick
 response quick
 democratic response
 whistling whistling
 martha quick
 break right
 break thank
 rose garden
 break watch

Cluster 5:
 tide power
 power pod
 new tide
 power liquid
 introducing new
 liquid new
 clean situation
 pod clean
 pod cat
 introducing tide

Cluster 6:
 look different
 world look
 power pain
 pain world
 advil power
 acting power
 fast acting
 gel fast
 advil liqui

 case tell

Cluster 73:
 people achieved
 skin month
 achieved clearer
 clearer skin
 just dos
 month just
 greg lot
 greg music
 greg medium
 greg mean

Cluster 74:
 whats easy
 right whats
 doing whats
 whats right
 force veteran
 veteran doing
 air force
 wean air
 greg love
 greg music



In [19]:
fox_df['cluster'] = results

In [20]:
# view sentences for each cluster
file_contents = ''
for i in range(75):
    file_contents += 'Cluster {}\n'.format(i)
    counter = 0
    for index, row in fox_df[fox_df.cluster == i].iterrows():
        file_contents += row['sentence'] + '\n'
        counter += 1
        if counter > 20:
            break
    file_contents += '\n'
with open('../data/interim/fox-sentence-check-2.txt', 'w') as f:
    f.write(file_contents)

In [21]:
# count number of sentences in each cluster
fox_df[['cluster', 'sentence']].groupby('cluster').count().sort_values(by='sentence', ascending=False)

Unnamed: 0_level_0,sentence
cluster,Unnamed: 1_level_1
2,240117
20,3637
46,1981
19,1544
62,1249
...,...
73,4
69,3
51,3
56,2


In [22]:
ad_clusters=[5, 6, 9, 10, 11, 13, 17, 24, 27, 29, 32, 33, 37, 41, 47, 49, 52, 54, 55, 59,
            60, 63, 65, 68, 71, 73, 74]
news_clusters=[1, 4, 7, 8, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23, 26, 28, 31, 34, 38, 40,
              42, 43, 44, 45, 46, 48, 40, 51, 56, 57, 58, 61, 62, 64, 66, 69, 70, 72]
mixed=[0, 2, 3, 25, 30, 35, 36, 39, 53, 67]
fox_df['ad_cluster'] = 0
fox_df['news_cluster'] = 0
fox_df['ad_cluster'] = fox_df['cluster'].isin(ad_clusters)
fox_df['news_cluster'] = fox_df['cluster'].isin(news_clusters)
fox_df = fox_df.mask(fox_df == True, 1)
fox_df = fox_df.mask(fox_df == False, 0)
fox_df[['news_cluster', 'sentence']].groupby('news_cluster').count()

Unnamed: 0_level_0,sentence
news_cluster,Unnamed: 1_level_1
0,243295
1,11681


# 3. Create First-Pass Model

Using the clustering, I now create a LogisticRegression model as a first pass model, to help create more features.

In [23]:
#create a dataframe of only the sentences that are definitively ads or news
coded_df = fox_df[(fox_df.ad_cluster == 1) | (fox_df.news_cluster == 1)]
len(coded_df)

13058

In [24]:
coded_df = coded_df.astype({'ad_cluster': 'int32', 'news_cluster': 'int32'})
coded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13058 entries, 5 to 254968
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   identifier    13058 non-null  object
 1   sentence      13058 non-null  object
 2   start_snip    13058 non-null  int64 
 3   end_snip      13058 non-null  int64 
 4   contributor   13058 non-null  object
 5   runtime       13058 non-null  object
 6   start_time    13058 non-null  object
 7   stop_time     13058 non-null  object
 8   subjects      13058 non-null  object
 9   cluster       13058 non-null  int32 
 10  ad_cluster    13058 non-null  int32 
 11  news_cluster  13058 non-null  int32 
dtypes: int32(3), int64(2), object(7)
memory usage: 1.1+ MB


In [25]:
# create a new vectorizer with 1 and 2-gramms
vect_2 = TfidfVectorizer(tokenizer=LemmaTokenizer(), strip_accents='unicode', stop_words='english', 
                       min_df=2, max_df=0.3, ngram_range=(1,2))

In [26]:
y = np.array(coded_df.ad_cluster)
X_corpus = np.array(coded_df['sentence'])
X_corpus_train, X_corpus_test, y_train, y_test = train_test_split(X_corpus, y, test_size=0.3, random_state=18)

X_train = vect_2.fit_transform(X_corpus_train)

X_test = vect_2.transform(X_corpus_test)

log = LogisticRegression()
log.fit(X_train, y_train)
log_score = log.score(X_test, y_test)
print('Logistic Regression Score: {}'.format(log_score))



Logistic Regression Score: 0.9959162838182747


In [27]:
pred = log.predict(X_test)
c=confusion_matrix(y_test, pred)
print(c)

[[3518    0]
 [  16  384]]


In [28]:
fox_df.to_csv('../data/interim/fox-sentences-clustered.csv')

In [29]:
pickle.dump(log, open('../models/fox_1st_pass_logit.p', 'wb'))

# 4. Apply Logit Model to Snippets

Applying the Logit model to snippets and then breaking the snippets into sentences helps identify snippets with ads in them, and since ads are likely to be next to each other, the output of this model is a useful feature

In [30]:
fox_snips = pd.read_csv('../data/interim/fox-last-year-parsed.csv')
len(fox_snips)

295667

In [31]:
# get only fox shows from show-based train_test_split
fox_snips = fox_snips.set_index('identifier').join(fox_shows.set_index('identifier'), on='identifier', how='inner').reset_index().copy()
len(fox_snips)

30478

In [32]:
fox_snips = fox_snips.dropna()
X_snips = np.array(fox_snips['snippet'])
X_snips_bow = vect_2.transform(X_snips)
pred = log.predict(X_snips_bow)
fox_snips['snip_ad'] = pred
fox_snips.head()

Unnamed: 0.1,identifier,Unnamed: 0,start_snip,end_snip,snippet,contributor,runtime,start_time,stop_time,subjects,show_name,snip_ad
0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,305,0,60,>> harris: it hasn't been a busy hour. glad yo...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",The_Daily_Briefing_With_Dana_Perino,0
1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,306,60,120,"in arkansas and oklahoma, as the arkansas rive...",FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",The_Daily_Briefing_With_Dana_Perino,0
2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,307,120,180,that's almost entirely submerged. if you look ...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",The_Daily_Briefing_With_Dana_Perino,0
3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,308,180,240,the national guard assisting with sandbags and...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",The_Daily_Briefing_With_Dana_Perino,0
4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,309,240,300,the question is when will the central states g...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",The_Daily_Briefing_With_Dana_Perino,0


In [33]:
fox_df = fox_df.set_index(['identifier', 'start_snip', 'end_snip'])
fox_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,ad_cluster,news_cluster
identifier,start_snip,end_snip,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FOXNEWSW_20190528_180000_The_Daily_Briefing_With_Dana_Perino,0,60,harris: it hasn't been a busy hour.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0
FOXNEWSW_20190528_180000_The_Daily_Briefing_With_Dana_Perino,0,60,glad you are long.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0
FOXNEWSW_20190528_180000_The_Daily_Briefing_With_Dana_Perino,0,60,got to see air force one with the president co...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0
FOXNEWSW_20190528_180000_The_Daily_Briefing_With_Dana_Perino,0,60,keep watching fox news. here is dana.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0
FOXNEWSW_20190528_180000_The_Daily_Briefing_With_Dana_Perino,0,60,dana: your daily briefing starts now.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0


In [34]:
fox_snips = fox_snips[['identifier', 'start_snip', 'end_snip', 'snip_ad']]

In [35]:
fox_snips = fox_snips.set_index(['identifier', 'start_snip', 'end_snip'])

In [36]:
fox_df = fox_df.join(fox_snips, on=['identifier', 'start_snip', 'end_snip'], how='inner')

# 5. Apply LDA

Perhaps LDA percentages will be a feature than can predict ads.

In [37]:
X_corpus = list(clean_sent(fox_df['sentence']))
X_corpus[:5]

[' harris it hasnt been a busy hour',
 'glad you are long',
 'got to see air force one with the president coming home from japan',
 'keep watching fox news here is dana',
 ' dana your daily briefing starts now']

In [38]:
X = vect.fit_transform(X_corpus)

In [39]:
# Tweak the two parameters below
number_topics = 75
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda_results = lda.fit_transform(X)

In [40]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [41]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, vect, number_words)

Topics found via LDA:

Topic #0:
easy awesome simple easy thats simple president doe going pay fda approved im hemmer la vega friday night tomorrow morning

Topic #1:
hong kong big deal make decision ha lot lindsey graham theyre doing jerry nadler health organization world health mitt romney

Topic #2:
god bless gram sugar lose weight max protein ensure max protein gram liberty liberty announcer lose thank god pain swelling

Topic #3:
stay home glad help think important did wrong thanks u home order good reason people people happy birthday president trump

Topic #4:
minute hour impeachable offense year ago thats important sit baa eastern time hillary clinton roger stone jeffrey epstein state ha

Topic #5:
sleep number smart bed starting checked checked tuberculosis number smart dont worry dana think love country number setting auto insurance

Topic #6:
quid pro pro quo thank having u live otezla cream bernie sander oh oh join u doctor monitor stop treatment

Topic #7:
thats going healt

join u stock market thats thing think good clear skin doesnt make thing thats insurance company say oh know right

Topic #62:
presidential candidate let bring hard work south carolina democratic presidential did happen president trump party ha telling u news conference

Topic #63:
want ask doing great thats just win election half million getting better mental illness youre going ill pas want come

Topic #64:
president trump good afternoon climate change theyve got trump said thats want trump did thats coming daily briefing quite bit

Topic #65:
american people talking point kind like president told dont say life better great night did did martha think jacqui heinrich

Topic #66:
help u ive heard did come peter strzok defense team ingraham angle department justice december th im worried grand jury

Topic #67:
background check year later moving forward week ago thats lot great tonight know thing thats question hard believe got right

Topic #68:
think wa new orleans think thats say thing 

In [44]:
fox_df = fox_df.reset_index(level=['identifier', 'start_snip', 'end_snip'])
fox_df.head()

Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,ad_cluster,news_cluster,snip_ad
0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,harris: it hasn't been a busy hour.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0,0
1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,glad you are long.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0,0
2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,got to see air force one with the president co...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0,0
3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,keep watching fox news. here is dana.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0,0
4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,dana: your daily briefing starts now.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,0,0,0


In [45]:
lda_df = pd.DataFrame(lda_results)
lda_df.columns = ['topic_' + str(i) for i in range(75)]
fox_df = fox_df.join(lda_df, how='inner')
fox_df.head()

Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,harris: it hasn't been a busy hour.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,...,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525
1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,glad you are long.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,got to see air force one with the president co...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,...,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882
3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,keep watching fox news. here is dana.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,...,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601
4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,dana: your daily briefing starts now.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",2,...,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539


In [46]:
fox_df.to_csv('../data/interim/fox-sentences-clustered-topics.csv')

# 6. Add Features #

Now I will add calculated features to the dataframe to help diagnose blocks of ads, or words that may prefigure ads.

In [47]:
fox_df = pd.read_csv('../data/interim/fox-sentences-clustered-topics.csv')
len_df = len(fox_df)

ad_next_words = ['back', 'return', 'ahead', 'go away', 'next', 'miss', 'after this', 'tuned', 'applause', 'appreciate']
ad_prev_words = ['back', 'welcome', 'talk', 'applause', 'good evening', 'good morning', 'appreciate']

fox_df.head()

Unnamed: 0.1,Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,harris: it hasn't been a busy hour.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525,0.005525
1,1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,glad you are long.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
2,2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,got to see air force one with the president co...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882,0.003882
3,3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,keep watching fox news. here is dana.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601,0.005601
4,4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,dana: your daily briefing starts now.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539,0.005539


In [48]:
for i, row in fox_df.iterrows():
    sentence = row['sentence']
    if (i < len_df - 1):
        for word in ad_next_words:
            if (word in sentence):
                fox_df.loc[i + 1,'has_prev_' + word] = 1
                if (i + 2) < len_df:
                    fox_df.loc[i + 2,'has_prev_' + word] = 1
                if (i + 3) < len_df:
                    fox_df.loc[i + 2,'has_prev_' + word] = 1
    if (i > 0):
        for word in ad_prev_words:
            if (word in sentence):
                fox_df.loc[i - 1, 'has_next_' + word] = 1
                if (i - 2) > 0:
                    fox_df.loc[i - 2, 'has_next_' + word] = 1
                if (i - 3) > 0:
                    fox_df.loc[i - 3, 'has_next_' + word] = 1


In [49]:
fox_df = fox_df.fillna(0)
fox_df.head()

Unnamed: 0.1,Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,...,has_next_back,has_prev_ahead,has_next_good evening,has_next_welcome,has_prev_after this,has_next_good morning,has_prev_applause,has_next_applause,has_prev_tuned,has_prev_go away
0,0,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,harris: it hasn't been a busy hour.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,glad you are long.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,got to see air force one with the president co...,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,keep watching fox news. here is dana.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,FOXNEWSW_20190528_180000_The_Daily_Briefing_Wi...,0,60,dana: your daily briefing starts now.,FOXNEWSW,01:00:58,2019-05-28 18:00:00,2019-05-28 19:00:59,"['dana', 'biden', 'joe biden', 'elizabeth warr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
fox_df.head(10000).to_csv('../data/interim/fox_ready_to_code.csv')

In [51]:
fox_df.iloc[10000:].to_csv('../data/interim/fox_to_be_tested.csv')