1. Make smaller dataset with full shows
2. Run Kmeans clustering
3. Make model based on Kmeans clustering
4. Run model on snippets
5. Run LDA
6. Identify nearby words
7. Do hand-coding
8. Do statistical analysis on added features

In [129]:
import pandas as pd
import pickle
import re
import numpy as np

from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize  
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [92]:
cnn_shows = pd.read_csv('../data/raw/search-cnn-last-year.csv')
cnn_shows.head()

Unnamed: 0,identifier
0,CNNW_20190528_170000_CNN_Right_Now_With_Briann...
1,CNNW_20190528_140000_CNN_Newsroom_with_Poppy_H...
2,CNNW_20190528_160000_Inside_Politics
3,CNNW_20190528_050000_CNN_Special_Report
4,CNNW_20190528_120000_New_Day_With_Alisyn_Camer...


In [93]:
cnn_shows, _ = train_test_split(cnn_shows, test_size=0.9, random_state=18)
len(cnn_shows)

541

In [94]:
cnn_df = pd.read_csv('../data/interim/cnn-last-year-sent-comb.csv').drop(columns=['Unnamed: 0', 
                                                 'Unnamed: 0.1',
                                                'Unnamed: 0.1.1']).dropna()
len(cnn_df)

3460457

In [95]:
# get only cnn shows from show-based train_test_split
cnn_df = cnn_df.set_index('identifier').join(cnn_shows.set_index('identifier'), on='identifier', how='inner')
len(cnn_df)

334992

In [96]:
cnn_df = cnn_df.reset_index()
cnn_df.head()

Unnamed: 0,identifier,sentence,start_snip,end_snip,contributor,runtime,start_time,stop_time,subjects
0,CNNW_20190528_060000_CNN_Newsroom_Live,"now the fastest, most reliable internet can he...",0,60,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ..."
1,CNNW_20190528_060000_CNN_Newsroom_Live,"that's simple, easy, awesome.",0,60,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ..."
2,CNNW_20190528_060000_CNN_Newsroom_Live,taxi!,0,60,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ..."
3,CNNW_20190528_060000_CNN_Newsroom_Live,should i have stopped her?,0,60,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ..."
4,CNNW_20190528_060000_CNN_Newsroom_Live,save hundreds of dollars a year when you get i...,0,60,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ..."


In [97]:
cnn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334992 entries, 0 to 334991
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   identifier   334992 non-null  object
 1   sentence     334992 non-null  object
 2   start_snip   334992 non-null  int64 
 3   end_snip     334992 non-null  int64 
 4   contributor  334992 non-null  object
 5   runtime      334992 non-null  object
 6   start_time   334992 non-null  object
 7   stop_time    334992 non-null  object
 8   subjects     334992 non-null  object
dtypes: int64(2), object(7)
memory usage: 23.0+ MB


## 2. Run Kmeans Clustering ##

Run Kmeans clustering on sentences and determine which clusters are ads, which are news, and which are mixed.

In [98]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
                           '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
                           'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
                           'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
                           'line', 'even', 'also', 'may', 'take', 'come', 'hi', 'ha', 'le', 'u', 'wa', 'thi',
                           'to', 'one'])

In [99]:
def clean_sent(sentences):
    for sent in sentences:
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub("([\d,\,\./!#$%&\'\":;>\?@\[\]`)(\+])+", "", sent) # remove digits and remove punctuation
        sent = re.sub("([-])+", " ", sent)
        yield(sent)  

In [100]:
corpus = list(clean_sent(cnn_df.sentence.values.tolist()))
corpus[:5]

['now the fastest most reliable internet can help you save on your wireless bill',
 'thats simple easy awesome',
 'taxi',
 'should i have stopped her',
 'save hundreds of dollars a year when you get internet and mobile together']

In [101]:
#lemmatize before vectorizing

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), strip_accents='unicode', stop_words='english', 
                       min_df=2, max_df=0.3, ngram_range=(2,2))

In [102]:
corpus = list(clean_sent(cnn_df.sentence.values.tolist()))
corpus[:5]

['now the fastest most reliable internet can help you save on your wireless bill',
 'thats simple easy awesome',
 'taxi',
 'should i have stopped her',
 'save hundreds of dollars a year when you get internet and mobile together']

In [103]:
cnn_bow = vect.fit_transform(corpus)
vect.get_feature_names()



['aa aa',
 'aa aye',
 'aa battery',
 'aag best',
 'aag introducing',
 'aag trust',
 'aag working',
 'aags free',
 'aags new',
 'aan estimated',
 'aaron burnett',
 'aaron dean',
 'aaron zebly',
 'aarp auto',
 'aarp endorsement',
 'aarp ha',
 'aarp high',
 'aarp medicare',
 'aarp meet',
 'aarp member',
 'aarp thats',
 'ab ab',
 'aba today',
 'abaco bahamas',
 'abaco grand',
 'abaco island',
 'abandon chair',
 'abandon trump',
 'abandoned abused',
 'abandoned ally',
 'abandoned effort',
 'abandoned house',
 'abandoned president',
 'abandoned turned',
 'abandoning ally',
 'abandonment kurd',
 'abbe confidentth',
 'abbott getting',
 'abbott helping',
 'abbott technology',
 'abbott test',
 'abbvie able',
 'abby philip',
 'abby phillip',
 'abby phillips',
 'abby thank',
 'abby u',
 'abby week',
 'abc didnt',
 'abc direction',
 'abc executive',
 'abc faith',
 'abc ha',
 'abc happy',
 'abc monday',
 'abc news',
 'abc newswashington',
 'abc wa',
 'abc wanted',
 'abc washington',
 'abcwashington 

In [104]:
kmeans = KMeans(n_clusters=75, random_state=18)
results = kmeans.fit_predict(cnn_bow)

In [105]:
#print out most indicative words
terms = vect.get_feature_names()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(75):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print (' %s' % terms[ind])
    print()

Cluster 0:
 operation place
 caucus dont
 vote caucus
 place drive
 ® vaccine
 gupta talk
 gust high
 gust causing
 gurgen joe
 gupta u

Cluster 1:
 president trump
 dont know
 united state
 new york
 donald trump
 joe biden
 im going
 bernie sander
 breaking news
 vice president

Cluster 2:
 cut half
 cost shipping
 shipping cut
 gupta professor
 gust high
 gust causing
 gurgen joe
 gupta u
 gupta thanks
 gupta thank

Cluster 3:
 andrew yang
 candidate andrew
 presidential candidate
 businessman andrew
 yang thanks
 democratic presidential
 yang u
 yang wife
 entrepreneur democratic
 joining entrepreneur

Cluster 4:
 year old
 wa year
 old wa
 wa accident
 old year
 old want
 old girl
 buy car
 want buy
 old daughter

Cluster 5:
 wa missing
 realized wa
 ulcerative colitising
 colitising realized
 severe ulcerative
 moderate severe
 roof wa
 night wa
 gupta professor
 gust causing

Cluster 6:
 wa executed
 really way
 adult life
 morning wa
 way wa
 ® vaccine
 gupta perspective
 gurge

In [106]:
cnn_df['cluster'] = results

In [107]:
# view sentences for each cluster
file_contents = ''
for i in range(75):
    file_contents += 'Cluster {}\n'.format(i)
    counter = 0
    for index, row in cnn_df[cnn_df.cluster == i].iterrows():
        file_contents += row['sentence'] + '\n'
        counter += 1
        if counter > 20:
            break
    file_contents += '\n'
with open('../data/interim/cnn-sentence-check-2.txt', 'w') as f:
    f.write(file_contents)

In [108]:
# count number of sentences in each cluster
cnn_df[['cluster', 'sentence']].groupby('cluster').count().sort_values(by='sentence', ascending=False)

Unnamed: 0_level_0,sentence
cluster,Unnamed: 1_level_1
1,325472
55,3195
41,905
4,560
30,527
...,...
32,4
52,3
9,2
71,2


In [109]:
ad_clusters=[2, 5, 7, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 27, 28, 29, 31, 33, 36,
             37, 38, 39, 42, 48, 49, 50, 53, 57, 58, 59, 60, 61, 63, 64, 66, 68, 69, 73, 74]
news_clusters=[0, 3, 6, 9, 10, 12, 16, 25, 26, 30, 32, 34, 35, 40, 41, 43, 44, 45, 46,
              47, 51, 54, 55, 56, 62, 65, 67, 70, 71, 72]
mixed=[1, 4, 8, 11, 14]
cnn_df['ad_cluster'] = 0
cnn_df['news_cluster'] = 0
cnn_df['ad_cluster'] = cnn_df['cluster'].isin(ad_clusters)
cnn_df['news_cluster'] = cnn_df['cluster'].isin(news_clusters)
cnn_df = cnn_df.mask(cnn_df == True, 1)
cnn_df = cnn_df.mask(cnn_df == False, 0)
cnn_df[['news_cluster', 'sentence']].groupby('news_cluster').count()

Unnamed: 0_level_0,sentence
news_cluster,Unnamed: 1_level_1
0,327394
1,7598


# 3. Create First-Pass Model

Using the clustering, I now create a LogisticRegression model as a first pass model, to help create more features.

In [110]:
#create a dataframe of only the sentences that are definitively ads or news
coded_df = cnn_df[(cnn_df.ad_cluster == 1) | (cnn_df.news_cluster == 1)]
len(coded_df)

8903

In [111]:
coded_df = coded_df.astype({'ad_cluster': 'int32', 'news_cluster': 'int32'})
coded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8903 entries, 27 to 334982
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   identifier    8903 non-null   object
 1   sentence      8903 non-null   object
 2   start_snip    8903 non-null   int64 
 3   end_snip      8903 non-null   int64 
 4   contributor   8903 non-null   object
 5   runtime       8903 non-null   object
 6   start_time    8903 non-null   object
 7   stop_time     8903 non-null   object
 8   subjects      8903 non-null   object
 9   cluster       8903 non-null   int32 
 10  ad_cluster    8903 non-null   int32 
 11  news_cluster  8903 non-null   int32 
dtypes: int32(3), int64(2), object(7)
memory usage: 799.9+ KB


In [112]:
# create a new vectorizer with 1 and 2-gramms
vect_2 = TfidfVectorizer(tokenizer=LemmaTokenizer(), strip_accents='unicode', stop_words='english', 
                       min_df=2, max_df=0.3, ngram_range=(1,2))

In [113]:
y = np.array(coded_df.ad_cluster)
X_corpus = np.array(coded_df['sentence'])
X_corpus_train, X_corpus_test, y_train, y_test = train_test_split(X_corpus, y, test_size=0.3, random_state=18)

X_train = vect_2.fit_transform(X_corpus_train)

X_test = vect_2.transform(X_corpus_test)

log = LogisticRegression()
log.fit(X_train, y_train)
log_score = log.score(X_test, y_test)
print('Logistic Regression Score: {}'.format(log_score))



Logistic Regression Score: 0.9981280419318608


In [114]:
pred = log.predict(X_test)
c=confusion_matrix(y_test, pred)
print(c)

[[2297    0]
 [   5  369]]


In [115]:
cnn_df.to_csv('../data/interim/cnn-sentences-clustered.csv')

In [117]:
pickle.dump(log, open('../models/cnn_1st_pass_logit.p', 'wb'))

# 4. Apply Logit Model to Snippets

Applying the Logit model to snippets and then breaking the snippets into sentences helps identify snippets with ads in them, and since ads are likely to be next to each other, the output of this model is a useful feature

In [118]:
cnn_snips = pd.read_csv('../data/interim/cnn-last-year-parsed.csv')
len(cnn_snips)

340147

In [119]:
# get only cnn shows from show-based train_test_split
cnn_snips = cnn_snips.set_index('identifier').join(cnn_shows.set_index('identifier'), on='identifier', how='inner').reset_index().copy()
len(cnn_snips)

33299

In [120]:
cnn_snips = cnn_snips.dropna()
X_snips = np.array(cnn_snips['snippet'])
X_snips_bow = vect_2.transform(X_snips)
pred = log.predict(X_snips_bow)
cnn_snips['snip_ad'] = pred
cnn_snips.head()

Unnamed: 0.1,identifier,Unnamed: 0,start_snip,end_snip,snippet,contributor,runtime,start_time,stop_time,subjects,show_name,snip_ad
0,CNNW_20190528_060000_CNN_Newsroom_Live,305,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",CNN_Newsroom_Live,0
1,CNNW_20190528_060000_CNN_Newsroom_Live,306,60,120,states and of course all around the world. i'm...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",CNN_Newsroom_Live,0
2,CNNW_20190528_060000_CNN_Newsroom_Live,307,120,180,prayer. citizens all across the country came t...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",CNN_Newsroom_Live,0
3,CNNW_20190528_060000_CNN_Newsroom_Live,308,180,240,symbolism and on ceremony. but somewhat short ...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",CNN_Newsroom_Live,0
4,CNNW_20190528_060000_CNN_Newsroom_Live,309,240,300,been violated by north korea's may 9th firing ...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",CNN_Newsroom_Live,0


In [121]:
cnn_df = cnn_df.set_index(['identifier', 'start_snip', 'end_snip'])
cnn_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,ad_cluster,news_cluster
identifier,start_snip,end_snip,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0
CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"that's simple, easy, awesome.",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0
CNNW_20190528_060000_CNN_Newsroom_Live,0,60,taxi!,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0
CNNW_20190528_060000_CNN_Newsroom_Live,0,60,should i have stopped her?,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0
CNNW_20190528_060000_CNN_Newsroom_Live,0,60,save hundreds of dollars a year when you get i...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0


In [123]:
cnn_snips = cnn_snips[['identifier', 'start_snip', 'end_snip', 'snip_ad']]

In [125]:
cnn_snips = cnn_snips.set_index(['identifier', 'start_snip', 'end_snip'])

In [126]:
cnn_df = cnn_df.join(cnn_snips, on=['identifier', 'start_snip', 'end_snip'], how='inner')

# 5. Apply LDA

Perhaps LDA percentages will be a feature than can predict ads.

In [158]:
X_corpus = list(clean_sent(cnn_df['sentence']))
X_corpus[:5]

['now the fastest most reliable internet can help you save on your wireless bill',
 'thats simple easy awesome',
 'taxi',
 'should i have stopped her',
 'save hundreds of dollars a year when you get internet and mobile together']

In [159]:
X = vect.fit_transform(X_corpus)

In [160]:
# Tweak the two parameters below
number_topics = 75
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda_results = lda.fit_transform(X)

In [161]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [162]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, vect, number_words)

Topics found via LDA:

Topic #0:
im sorry cancer treatment treatment center center america good luck promise prove movement automatically automatically adjusts pain happens sens movement

Topic #1:
white house president trump did say glad help appointment available tested positive thats president just month positive coronavirus come man

Topic #2:
make sense hillary clinton good night christine roman make difference wa clear think right signal reach reach farther thats correct

Topic #3:
barack obama robert mueller youre looking really interesting irobot roombaTM middle class told u people achieved skin month achieved clearer

Topic #4:
joe biden wa wrong social medium going come did make bayer science surprisingly painless let say really need thats coming

Topic #5:
dont understand people say thats big say dont chief justice think democrat mar lago homeland security abnormal bleeding foreign leader

Topic #6:
make life life simple xfinity make poppy harlow thats need im poppy jerry na

let ask camera figure wall street saved life im tryin know people want play et cetera looking forward pd l

Topic #65:
health care help u care worker tax return dont wait going try president impeached said want john berman u asleep

Topic #66:
mitt romney capital group going talk know thats better choice wa talking bad thing jim acosta saving service people color

Topic #67:
wa easy youre right im saying award winning ha happened really hard mno kiddingrd daughter maria different thing red flag

Topic #68:
im sure dont worry doesnt mean want bring wa perfect need know kaitlan collins coronavirus case personal attorney going continue

Topic #69:
wa really super tuesday ha come whoo hoo hurricane dorian hals heart did come general election wash hand time u

Topic #70:
thats good chris cuomo thank coming good point didnt say great job talk u think real want start think did

Topic #71:
stay u oh god thats thing im doing thats im doesnt matter lindsey graham wa big puerto rico inside politi

In [164]:
cnn_df.head()

Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,ad_cluster,news_cluster,snip_ad
0,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0,0
1,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"that's simple, easy, awesome.",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0,0
2,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,taxi!,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0,0
3,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,should i have stopped her?,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0,0
4,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,save hundreds of dollars a year when you get i...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,0,0,0


In [166]:
lda_df = pd.DataFrame(lda_results)
lda_df.columns = ['topic_' + str(i) for i in range(75)]
cnn_df = cnn_df.join(lda_df, how='inner')
cnn_df.head()

Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,cluster,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,...,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667
1,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"that's simple, easy, awesome.",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,...,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.638587
2,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,taxi!,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
3,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,should i have stopped her?,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
4,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,save hundreds of dollars a year when you get i...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",1,...,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531


In [167]:
cnn_df.to_csv('../data/interim/cnn-sentences-clustered-topics.csv')

# 6. Add Features #

Now I will add calculated features to the dataframe to help diagnose blocks of ads, or words that may prefigure ads.

In [177]:
cnn_df = pd.read_csv('../data/interim/cnn-sentences-clustered-topics.csv')
len_df = len(cnn_df)

ad_next_words = ['back', 'return', 'ahead', 'go away', 'next', 'miss', 'after this', 'tuned', 'applause', 'appreciate']
ad_prev_words = ['back', 'welcome', 'talk', 'applause', 'good evening', 'good morning', 'appreciate']

cnn_df.head()

Unnamed: 0.1,Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,0,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667
1,1,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"that's simple, easy, awesome.",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.004884,0.638587
2,2,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,taxi!,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
3,3,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,should i have stopped her?,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
4,4,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,save hundreds of dollars a year when you get i...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531,0.005531


In [178]:
for i, row in cnn_df.iterrows():
    sentence = row['sentence']
    if (i < len_df - 1):
        for word in ad_next_words:
            if (word in sentence):
                cnn_df.loc[i + 1,'has_prev_' + word] = 1
                if (i + 2) < len_df:
                    cnn_df.loc[i + 2,'has_prev_' + word] = 1
                if (i + 3) < len_df:
                    cnn_df.loc[i + 2,'has_prev_' + word] = 1
    if (i > 0):
        for word in ad_prev_words:
            if (word in sentence):
                cnn_df.loc[i - 1, 'has_next_' + word] = 1
                if (i - 2) > 0:
                    cnn_df.loc[i - 2, 'has_next_' + word] = 1
                if (i - 3) > 0:
                    cnn_df.loc[i - 3, 'has_next_' + word] = 1


In [179]:
cnn_df = cnn_df.fillna(0)
cnn_df.head()

Unnamed: 0.1,Unnamed: 0,identifier,start_snip,end_snip,sentence,contributor,runtime,start_time,stop_time,subjects,...,has_prev_appreciate,has_next_appreciate,has_prev_ahead,has_prev_return,has_prev_after this,has_prev_go away,has_next_good evening,has_prev_applause,has_next_applause,has_prev_tuned
0,0,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"now the fastest, most reliable internet can he...",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,"that's simple, easy, awesome.",CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,taxi!,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,should i have stopped her?,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,CNNW_20190528_060000_CNN_Newsroom_Live,0,60,save hundreds of dollars a year when you get i...,CNNW,01:00:58,2019-05-28 06:00:00,2019-05-28 07:00:59,"['trump', 'north korea', 'japan', 'rosemary', ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
cnn_df.head(10000).to_csv('../data/interim/cnn_ready_to_code.csv')

In [181]:
cnn_df.iloc[10000:].to_csv('../data/interim/cnn_to_be_tested.csv')