In [30]:
%matplotlib inline
import numpy
import pysrt
import gensim
import textacy
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [127]:
# Load transcripts and preprocess the text
transcripts = []
transcript_single = []

for line in open('../data/test_video_list.txt'):
    srt_name = line.replace('tvnews', '../data').replace('mp4', 'cc5.srt')[:-1]
    offsets = []
    transcript = ''
    replace_list = ['\n', '>', '.', ',', '?', '!', '\'', '"', '-', '(', ')']
    #subs = pysrt.open(srt_name)
    subs = pysrt.open('../data/videos/MSNBC_20110830_080000_The_Rachel_Maddow_Show.cc5.srt')
    idx = 0
    for sub in subs:
        offsets.append((len(transcript), sub.start, sub.end))
        text = sub.text
        for token in replace_list:
            text = text.replace(token, ' ')
        transcript += text.upper() + ' '
        if idx >= 10:
            idx = 0;    
            transcript_single.append(transcript)
            transcript = ''
        else:
            idx += 1
    transcripts.append((None, transcript, offsets))
    break

In [128]:
# Extract PROPN and NOUN using textacy
# docs = [textacy.doc.Doc(transcript, lang=u'en', metadata={'offsets': offsets}) for (video, transcript, offsets) in transcripts]
# corpus = textacy.Corpus(u'en', docs=docs)

docs = [textacy.doc.Doc(transcript, lang=u'en') for transcript in transcript_single]
corpus = textacy.Corpus(u'en', docs=docs)

transcript_tokens = [
    list(textacy.extract.words(doc, filter_nums=True, include_pos=['PROPN', 'NOUN'])) 
    for doc in corpus.docs]


In [129]:
# Build dictionary and corpora using gensim
transcript_tokens = [[str(token) for token in tokens if len(str(token)) > 1] for tokens in transcript_tokens]
print(len(transcript_tokens))
dictionary = gensim.corpora.Dictionary(transcript_tokens)
print(dictionary)

corpus = [dictionary.doc2bow(tokens) for tokens in transcript_tokens]
#print(corpus)

2017-12-29 01:39:50,098 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-29 01:39:50,105 : INFO : built Dictionary(1042 unique tokens: ['BALANCE', 'CIVIL', 'PHRASE', 'FLOODWATERS', 'BREACH']...) from 121 documents (total 2351 corpus positions)


121
Dictionary(1042 unique tokens: ['BALANCE', 'CIVIL', 'PHRASE', 'FLOODWATERS', 'BREACH']...)


In [130]:
# Topic extraction
# step 1 -- initialize a model
tfidf = gensim.models.TfidfModel(corpus) 

2017-12-29 01:40:36,469 : INFO : collecting document frequencies
2017-12-29 01:40:36,470 : INFO : PROGRESS: processing document #0
2017-12-29 01:40:36,472 : INFO : calculating IDF weights for 121 documents and 1041 features (2101 matrix non-zeros)


In [131]:
# step 2 -- apply the transformation to a whole corpus
corpus_tfidf = tfidf[corpus]
#for doc in corpus_tfidf:
#    print(doc)

In [134]:
# step 3 -- # initialize an LSI transformation
lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20) 
corpus_lsi = lsi[corpus_tfidf]

lsi.print_topics(20)

#for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print(doc)

2017-12-29 01:47:27,227 : INFO : using serial LSI version on this node
2017-12-29 01:47:27,228 : INFO : updating model with new documents
2017-12-29 01:47:27,232 : INFO : preparing a new chunk of documents
2017-12-29 01:47:27,234 : INFO : using 100 extra samples and 2 power iterations
2017-12-29 01:47:27,235 : INFO : 1st phase: constructing (1042, 120) action matrix
2017-12-29 01:47:27,237 : INFO : orthonormalizing (1042, 120) action matrix
2017-12-29 01:47:27,251 : INFO : 2nd phase: running dense svd on (120, 121) matrix
2017-12-29 01:47:27,262 : INFO : computing the final decomposition
2017-12-29 01:47:27,265 : INFO : keeping 20 factors (discarding 70.728% of energy spectrum)
2017-12-29 01:47:27,268 : INFO : processed documents up to #121
2017-12-29 01:47:27,271 : INFO : topic #0(1.845): 0.266*"SOCIAL" + 0.237*"RICK" + 0.233*"SECURITY" + 0.230*"PERRY" + 0.205*"IT" + 0.187*"LIE" + 0.154*"PEOPLE" + 0.151*"GENERATION" + 0.139*"HE" + 0.125*"CAMPAIGN"
2017-12-29 01:47:27,273 : INFO : topi

[(0,
  '0.266*"SOCIAL" + 0.237*"RICK" + 0.233*"SECURITY" + 0.230*"PERRY" + 0.205*"IT" + 0.187*"LIE" + 0.154*"PEOPLE" + 0.151*"GENERATION" + 0.139*"HE" + 0.125*"CAMPAIGN"'),
 (1,
  '-0.276*"SOCIAL" + -0.232*"SECURITY" + -0.213*"LIE" + -0.200*"RICK" + -0.181*"PERRY" + -0.162*"GENERATION" + 0.135*"STATE" + 0.130*"PRESIDENT" + 0.127*"ADMINISTRATION" + -0.118*"TRAIL"'),
 (2,
  '0.295*"STATE" + -0.225*"BUSH" + -0.166*"CHENEY" + -0.161*"PRESIDENT" + 0.155*"STORM" + -0.143*"ADMINISTRATION" + 0.142*"VERMONT" + 0.139*"IRENE" + -0.133*"HE" + 0.127*"COAST"'),
 (3,
  '0.426*"MEDICARE" + 0.219*"INSURANCE" + 0.195*"AL" + 0.183*"QAEDA" + 0.175*"TALIBAN" + 0.173*"SUPPLEMENT" + -0.148*"BUDGET" + 0.143*"AARP" + 0.139*"PLAN" + -0.138*"STATE"'),
 (4,
  '-0.392*"BUDGET" + -0.276*"TEXAS" + -0.261*"MEDICARE" + -0.170*"HE" + -0.126*"INSURANCE" + -0.122*"TAX" + 0.118*"BUSH" + 0.118*"LIE" + -0.116*"SHORTFALL" + -0.116*"CONSTITUTION"'),
 (5,
  '-0.336*"MEDICARE" + 0.278*"TALIBAN" + 0.265*"AL" + 0.249*"QAEDA" + 0.

In [135]:
# step 3 -- # initialize an Random Projection transformation
#rp = gensim.models.RpModel(corpus_tfidf, id2word=dictionary, num_topics=100) 
#corpus_rp = rp[corpus_tfidf]

lda = gensim.models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20) 
corpus_lda = lda[corpus_tfidf]


# hdp = gensim.models.HdpModel(corpus_tfidf, id2word=dictionary) 
# corpus_hdp = hdp[corpus_tfidf]

lda.print_topics(20)

2017-12-29 02:00:00,036 : INFO : using symmetric alpha at 0.05
2017-12-29 02:00:00,037 : INFO : using symmetric eta at 0.05
2017-12-29 02:00:00,038 : INFO : using serial LDA version on this node
2017-12-29 02:00:00,314 : INFO : running online (single-pass) LDA training, 20 topics, 1 passes over the supplied corpus of 121 documents, updating model once every 121 documents, evaluating perplexity every 121 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-29 02:00:00,832 : INFO : -81.421 per-word bound, 3238220728713074995888128.0 perplexity estimate based on a held-out corpus of 121 documents with 470 words
2017-12-29 02:00:00,833 : INFO : PROGRESS: pass 0, at document #121/121
2017-12-29 02:00:00,952 : INFO : topic #17 (0.050): 0.010*"REPUBLICAN" + 0.007*"BIPARTISANSHIP" + 0.007*"RECORD" + 0.007*"SEAMS" + 0.006*"CANDIDATES" + 0.006*"PARTY" + 0.006*"MEMOIR" + 0.005*"PRESIDENTIAL" + 0.004*"SOMETHING" + 0.004*"BUSH"
2017-12-29 02:00:00,953 : INFO : topic #5 (0.050):

[(0,
  '0.008*"MONEY" + 0.008*"VE" + 0.007*"DONE" + 0.007*"SORRY" + 0.007*"GULF" + 0.007*"TALIBAN" + 0.007*"REPORTER" + 0.006*"AMERICANS" + 0.006*"WATER" + 0.006*"HELPING"'),
 (1,
  '0.009*"ECONOMIST" + 0.008*"MEDICARE" + 0.008*"WHO" + 0.006*"BUSH" + 0.006*"THINKER" + 0.006*"POSITION" + 0.006*"WHOSE" + 0.005*"ANNIVERSARY" + 0.005*"REPUBLICANS" + 0.005*"WORD"'),
 (2,
  '0.009*"GIFT" + 0.007*"INCREDIBLY" + 0.007*"NEW" + 0.006*"LL" + 0.006*"TONIGHT" + 0.005*"DON" + 0.005*"HE" + 0.005*"RICH" + 0.005*"TAXES" + 0.005*"WEALTH"'),
 (3,
  '0.010*"GOOLSBEE" + 0.008*"TERROR" + 0.005*"WATERBOARDING" + 0.005*"INTERROGATION" + 0.005*"STOUTEST" + 0.005*"ENHANCED" + 0.005*"VIEW" + 0.005*"SUSPECTS" + 0.005*"WRITTEN" + 0.005*"PHRASE"'),
 (4,
  '0.008*"DISASTER" + 0.007*"STATE" + 0.006*"DULL" + 0.006*"RELIEF" + 0.006*"WAR" + 0.005*"BIN" + 0.005*"WHAT" + 0.005*"NEW" + 0.005*"EMERGENCY" + 0.005*"TIME"'),
 (5,
  '0.010*"VICTORY" + 0.009*"WATERBURY" + 0.008*"ADMINISTRATION" + 0.006*"ME" + 0.006*"WHAT" + 0.00