In [30]:
%matplotlib inline
import numpy
import pysrt
import gensim
import textacy
import logging
# Route INFO-level log records (gensim reports its training progress through
# the logging module) to the notebook output as "timestamp : LEVEL : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [124]:
# Load transcripts and preprocess the text.
#
# Results left in the notebook namespace:
#   transcript_single -- list of text chunks; each chunk is SUBS_PER_CHUNK
#                        consecutive subtitles, punctuation blanked out,
#                        upper-cased and joined with single spaces
#   transcripts       -- list of (video, text, offsets) tuples, one per
#                        processed video (video is always None here)

# Number of consecutive subtitles concatenated into one pseudo-document.
SUBS_PER_CHUNK = 11

# Characters replaced by a space before the text is handed to the tokenizer.
# Every entry is a single character, so one str.translate() pass is equivalent
# to replacing them one after another.
replace_list = ['\n', '>', '.', ',', '?', '!', '\'', '"', '-', '(', ')']
blank_out = str.maketrans({token: ' ' for token in replace_list})

transcripts = []
transcript_single = []

# `with` guarantees the list file is closed even though the loop exits early.
with open('../data/test_video_list.txt') as video_list:
    for line in video_list:
        # rstrip('\n') only removes a real newline; slicing off the last
        # character would corrupt a final line that has no trailing newline.
        srt_name = line.rstrip('\n').replace('tvnews', '../data').replace('mp4', 'cc5.srt')

        # NOTE(review): srt_name is currently unused -- a single hard-coded
        # subtitle file is loaded instead and the loop stops after the first
        # entry (see `break` below). Looks like a debugging override; confirm
        # before running over the full video list.
        #subs = pysrt.open(srt_name)
        subs = pysrt.open('../data/videos/MSNBC_20110830_080000_The_Rachel_Maddow_Show.cc5.srt')

        offsets = []
        transcript = ''
        for position, sub in enumerate(subs, start=1):
            # Character offset of this subtitle within the chunk currently
            # being built, together with its start/end timestamps.
            offsets.append((len(transcript), sub.start, sub.end))
            transcript += sub.text.translate(blank_out).upper() + ' '

            # Every SUBS_PER_CHUNK-th subtitle closes the current chunk.
            if position % SUBS_PER_CHUNK == 0:
                transcript_single.append(transcript)
                transcript = ''

        # NOTE(review): at this point `transcript` holds only the subtitles
        # left over after the last complete chunk (possibly ''), and that
        # remainder is not added to transcript_single. The offsets are
        # relative to the chunk each subtitle belonged to, not to this text.
        # Behaviour kept as-is; confirm whether the tail should be a chunk too.
        transcripts.append((None, transcript, offsets))
        break

In [125]:
# Extract PROPN and NOUN using textacy
#
# Alternative kept for reference: one document per video, carrying the
# subtitle offsets as metadata.
# docs = [textacy.doc.Doc(transcript, lang=u'en', metadata={'offsets': offsets}) for (video, transcript, offsets) in transcripts]
# corpus = textacy.Corpus(u'en', docs=docs)

# One textacy document per subtitle chunk.
docs = []
for chunk_text in transcript_single:
    docs.append(textacy.doc.Doc(chunk_text, lang=u'en'))
corpus = textacy.Corpus(u'en', docs=docs)

# For every document keep only the words tagged as proper noun or noun,
# with numbers filtered out.
transcript_tokens = []
for chunk_doc in corpus.docs:
    kept_words = textacy.extract.words(
        chunk_doc,
        filter_nums=True,
        include_pos=['PROPN', 'NOUN'],
    )
    transcript_tokens.append(list(kept_words))



In [126]:
# Build dictionary and corpora using gensim

# Turn every extracted token into a plain string and drop the ones that are
# a single character long.
token_strings = []
for tokens in transcript_tokens:
    words = [str(token) for token in tokens]
    token_strings.append([word for word in words if len(word) > 1])
transcript_tokens = token_strings
print(len(transcript_tokens))

# Token <-> integer id mapping over all documents.
dictionary = gensim.corpora.Dictionary(transcript_tokens)
print(dictionary)

# Bag-of-words vector for every document; note this rebinds `corpus`.
corpus = list(map(dictionary.doc2bow, transcript_tokens))
#print(corpus)

2017-12-29 01:38:08,524 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-29 01:38:08,530 : INFO : built Dictionary(1046 unique tokens: ['BALANCE', 'CIVIL', 'PHRASE', 'FLOODWATERS', 'BREACH']...) from 111 documents (total 2350 corpus positions)


111
Dictionary(1046 unique tokens: ['BALANCE', 'CIVIL', 'PHRASE', 'FLOODWATERS', 'BREACH']...)


In [121]:
# Topic extraction
# step 1 -- fit a TF-IDF model on the bag-of-words corpus
tfidf = gensim.models.TfidfModel(corpus=corpus)

2017-12-29 01:35:36,253 : INFO : collecting document frequencies
2017-12-29 01:35:36,254 : INFO : PROGRESS: processing document #0
2017-12-29 01:35:36,256 : INFO : calculating IDF weights for 1337 documents and 1031 features (2289 matrix non-zeros)


In [122]:
# step 2 -- re-weight the whole bag-of-words corpus with the TF-IDF model
corpus_tfidf = tfidf[corpus]
# Uncomment to inspect the weighted vectors document by document:
#for doc in corpus_tfidf:
#    print(doc)

In [123]:
# step 3 -- fit an LSI model with 100 topics on the TF-IDF corpus
lsi = gensim.models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=100,
)
# Wrap the TF-IDF corpus so that documents come out in LSI topic space.
corpus_lsi = lsi[corpus_tfidf]

# Show all 100 topics (last expression, so the cell also displays the list).
lsi.print_topics(num_topics=100)

# Iterating corpus_lsi is what actually runs the bow->tfidf and tfidf->lsi
# transformations, on the fly:
#for doc in corpus_lsi:
#    print(doc)

2017-12-29 01:35:39,456 : INFO : using serial LSI version on this node
2017-12-29 01:35:39,457 : INFO : updating model with new documents
2017-12-29 01:35:39,471 : INFO : preparing a new chunk of documents
2017-12-29 01:35:39,480 : INFO : using 100 extra samples and 2 power iterations
2017-12-29 01:35:39,481 : INFO : 1st phase: constructing (1032, 200) action matrix
2017-12-29 01:35:39,501 : INFO : orthonormalizing (1032, 200) action matrix
2017-12-29 01:35:39,540 : INFO : 2nd phase: running dense svd on (200, 1337) matrix
2017-12-29 01:35:39,592 : INFO : computing the final decomposition
2017-12-29 01:35:39,596 : INFO : keeping 100 factors (discarding 27.325% of energy spectrum)
2017-12-29 01:35:39,600 : INFO : processed documents up to #1337
2017-12-29 01:35:39,603 : INFO : topic #0(3.899): 0.971*"IT" + 0.080*"HE" + 0.077*"PEOPLE" + 0.076*"HAD" + 0.071*"LIE" + 0.064*"US" + 0.053*"TONIGHT" + 0.047*"WHO" + 0.045*"STORM" + 0.038*"SESSION"
2017-12-29 01:35:39,606 : INFO : topic #1(3.796)

2017-12-29 01:35:39,669 : INFO : topic #29(2.218): -0.721*"TIME" + -0.337*"STORM" + -0.309*"THING" + -0.228*"ME" + 0.171*"TONIGHT" + -0.154*"MEDICARE" + -0.136*"LOT" + 0.109*"VE" + -0.106*"YEARS" + 0.100*"COUNCIL"
2017-12-29 01:35:39,669 : INFO : topic #30(2.208): 0.543*"MEDICARE" + -0.495*"VE" + -0.229*"THINGS" + 0.225*"SUPPLEMENT" + -0.212*"TIME" + 0.186*"INSURANCE" + -0.175*"COUNCIL" + 0.162*"PLAN" + 0.153*"AARP" + -0.146*"CONGRESS"
2017-12-29 01:35:39,671 : INFO : topic #31(2.177): 0.619*"BUSH" + -0.423*"DON" + -0.387*"OBAMA" + -0.291*"IRAQ" + 0.134*"CHOICES" + -0.106*"COLIN" + -0.105*"POWELL" + 0.101*"WAR" + -0.101*"CHENEY" + 0.093*"RECOVERY"
2017-12-29 01:35:39,672 : INFO : topic #32(2.171): -0.875*"GOVERNOR" + -0.288*"BOOK" + -0.181*"TRAIL" + -0.147*"FACT" + 0.117*"VERMONT" + -0.102*"LIEUTENANT" + -0.098*"COLIN" + 0.081*"OBAMA" + -0.076*"HEADS" + -0.075*"CHRISTIE"
2017-12-29 01:35:39,673 : INFO : topic #33(2.131): 0.554*"LESSONS" + -0.368*"RECOVERY" + -0.317*"IRENE" + 0.310*"KAT

2017-12-29 01:35:39,702 : INFO : topic #67(1.776): 0.667*"SENATE" + 0.274*"NET" + 0.260*"SAFETY" + 0.248*"HOUSE" + -0.216*"WEEK" + -0.194*"SECURITY" + 0.169*"TODAY" + -0.134*"KIND" + 0.131*"SOCIAL" + -0.126*"POLICY"
2017-12-29 01:35:39,703 : INFO : topic #68(1.771): -0.785*"KIND" + -0.215*"PART" + 0.181*"ROLE" + -0.152*"FUNDING" + -0.150*"COVERAGE" + -0.145*"SUFFERING" + -0.144*"COMMENT" + 0.136*"CITIZENS" + 0.120*"WAY" + -0.117*"BILL"
2017-12-29 01:35:39,703 : INFO : topic #69(1.755): -0.692*"WEEK" + 0.329*"EVERYTHING" + 0.323*"JOBS" + 0.186*"ROLE" + -0.159*"SENATE" + -0.135*"CRASHES" + -0.134*"SAY" + -0.131*"VACATION" + 0.128*"ME" + 0.123*"AMERICAN"
2017-12-29 01:35:39,704 : INFO : topic #70(1.754): -0.585*"ROLE" + 0.534*"PART" + -0.228*"KIND" + 0.217*"EVERYTHING" + -0.132*"WEEK" + 0.124*"SCHEME" + 0.124*"PONZI" + -0.113*"PROVIDING" + -0.108*"SENATE" + 0.106*"PAKISTAN"
2017-12-29 01:35:39,705 : INFO : topic #71(1.753): -0.558*"PART" + -0.541*"ROLE" + 0.303*"EVERYTHING" + -0.227*"THIN

[(0,
  '0.971*"IT" + 0.080*"HE" + 0.077*"PEOPLE" + 0.076*"HAD" + 0.071*"LIE" + 0.064*"US" + 0.053*"TONIGHT" + 0.047*"WHO" + 0.045*"STORM" + 0.038*"SESSION"'),
 (1,
  '-0.970*"PEOPLE" + -0.143*"WHO" + 0.087*"IT" + -0.062*"VERMONT" + -0.056*"AMERICAN" + -0.043*"THINGS" + -0.041*"POLICY" + -0.037*"HEADS" + -0.037*"MELISSA" + -0.037*"ME"'),
 (2,
  '0.907*"PRESIDENT" + 0.215*"OBAMA" + 0.198*"BUSH" + 0.185*"WHAT" + 0.108*"VICE" + 0.096*"CHENEY" + 0.063*"PICK" + 0.062*"GOVERNMENT" + 0.059*"DICK" + 0.045*"ADMINISTRATION"'),
 (3,
  '-0.970*"GOVERNMENT" + -0.120*"AMERICANS" + -0.081*"WHO" + -0.078*"WHAT" + 0.077*"PRESIDENT" + -0.063*"TALIBAN" + -0.058*"THINGS" + -0.050*"SOMEONE" + -0.050*"PART" + -0.043*"DULL"'),
 (4,
  '-0.625*"RICK" + -0.621*"PERRY" + -0.319*"SECURITY" + -0.293*"SOCIAL" + -0.070*"FRONTRUNNER" + -0.066*"CODE" + -0.059*"WHAT" + -0.047*"FIX" + -0.043*"GOAL" + -0.043*"TRUTH"'),
 (5,
  '-0.936*"US" + -0.216*"TONIGHT" + -0.096*"WHO" + -0.094*"WHAT" + 0.078*"IT" + -0.063*"ME" + 0.058

In [112]:
# step 3 (alternative) -- fit an LDA topic model with 100 topics.
# The Random Projection and HDP variants tried earlier are kept commented out
# for comparison.
#rp = gensim.models.RpModel(corpus_tfidf, id2word=dictionary, num_topics=100) 
#corpus_rp = rp[corpus_tfidf]
# hdp = gensim.models.HdpModel(corpus_tfidf, id2word=dictionary) 
# corpus_hdp = hdp[corpus_tfidf]

# LDA training is randomised; a fixed seed makes a re-run of this cell
# reproduce the same topics.
LDA_RANDOM_STATE = 0

# NOTE(review): the model is trained on TF-IDF weights rather than on the raw
# bag-of-words counts in `corpus`. LDA is usually fed counts -- confirm this
# is intentional before relying on the topics.
lda = gensim.models.LdaModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=100,
    random_state=LDA_RANDOM_STATE,
)
# Wrap the TF-IDF corpus so that documents come out as LDA topic mixtures.
corpus_lda = lda[corpus_tfidf]

# Show all 100 topics (last expression, so the cell also displays the list).
lda.print_topics(100)

2017-12-29 01:27:52,220 : INFO : using symmetric alpha at 0.01
2017-12-29 01:27:52,221 : INFO : using symmetric eta at 0.01
2017-12-29 01:27:52,222 : INFO : using serial LDA version on this node
2017-12-29 01:27:54,785 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 1355 documents, updating model once every 1355 documents, evaluating perplexity every 1355 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-29 01:28:00,068 : INFO : -330.873 per-word bound, 4005588541707750426551806463285268661308893720092232091523026168775256042660939554551548881275977728.0 perplexity estimate based on a held-out corpus of 1355 documents with 1826 words
2017-12-29 01:28:00,069 : INFO : PROGRESS: pass 0, at document #1355/1355
2017-12-29 01:28:02,418 : INFO : topic #76 (0.010): 0.053*"TIME" + 0.036*"JUMP" + 0.027*"GAES" + 0.027*"WAYACK" + 0.025*"FEVEV" + 0.025*"DEVEVELOP" + 0.025*"SLEEPINLOFT" + 0.025*"SLEPT" + 0.024*"CONTENTS" + 

2017-12-29 01:28:02,448 : INFO : topic #28 (0.010): 0.048*"TODAY" + 0.019*"HAHAPPE" + 0.019*"MOCKERY" + 0.019*"BEE" + 0.019*"BECAUSWHAT" + 0.019*"US" + 0.018*"TE" + 0.017*"CHILDREN" + 0.016*"EVYONE" + 0.016*"HAVEN"
2017-12-29 01:28:02,449 : INFO : topic #29 (0.010): 0.034*"ING" + 0.022*"UNDS" + 0.022*"SMILE" + 0.022*"TESTING" + 0.022*"HISTORY" + 0.019*"JAJACKSON" + 0.018*"IMIMRTANT" + 0.017*"SOYOU" + 0.016*"LDLD" + 0.016*"ASTAPLES"
2017-12-29 01:28:02,450 : INFO : topic #30 (0.010): 0.045*"US" + 0.045*"FAMILY" + 0.034*"DON" + 0.021*"STRIC" + 0.021*"ERE" + 0.021*"ALREA" + 0.020*"GOTOUOUNS" + 0.020*"HARMONY" + 0.020*"JASON" + 0.019*"DEATH"
2017-12-29 01:28:02,451 : INFO : topic #31 (0.010): 0.058*"FAMILY" + 0.029*"MAN" + 0.023*"SONG" + 0.022*"POSSIBLE" + 0.022*"DON" + 0.022*"GUDIDIANIP" + 0.022*"FRND" + 0.021*"MOTHERER" + 0.021*"CAUSED" + 0.021*"ARTETE"
2017-12-29 01:28:02,451 : INFO : topic #32 (0.010): 0.028*"TOURNSIDE" + 0.025*"AE" + 0.025*"AMICS" + 0.025*"AFRICAN" + 0.021*"JACKN" + 0

2017-12-29 01:28:02,479 : INFO : topic #66 (0.010): 0.037*"CHILDREN" + 0.025*"CAT" + 0.025*"RELEASESED" + 0.022*"TONIGHT" + 0.021*"MICHAEL" + 0.018*"REHEARSAL" + 0.018*"TUESDAY" + 0.018*"SPRINT" + 0.018*"WHWHEN" + 0.018*"RERONG"
2017-12-29 01:28:02,480 : INFO : topic #67 (0.010): 0.026*"BROTHER" + 0.023*"MICHAELL" + 0.022*"THING" + 0.021*"ANDND" + 0.021*"LEGACY" + 0.019*"WT" + 0.019*"EVERYBY" + 0.019*"THATT" + 0.018*"TOO" + 0.017*"CHILDREN"
2017-12-29 01:28:02,480 : INFO : topic #68 (0.010): 0.038*"CAT" + 0.032*"TAKEN" + 0.025*"PROCLAIMED" + 0.025*"HOSPITAL" + 0.023*"IT" + 0.022*"ANYTHINGN" + 0.021*"LURE" + 0.018*"ANE" + 0.018*"PRESCRIPTION" + 0.016*"HOUSE"
2017-12-29 01:28:02,482 : INFO : topic #69 (0.010): 0.023*"TETELL" + 0.023*"NERLAND" + 0.019*"LATERER" + 0.018*"TRIAL" + 0.017*"UNUAL" + 0.016*"JACKSON" + 0.016*"TREAD" + 0.016*"DRS" + 0.016*"WEN" + 0.016*"POIBIBILY"
2017-12-29 01:28:02,482 : INFO : topic #70 (0.010): 0.033*"MOTHER" + 0.032*"HOSPITAL" + 0.032*"MILLI" + 0.032*"WESAYS

[(0,
  '0.030*"FOYEYER" + 0.030*"ACCUISS" + 0.024*"PRESCRIP" + 0.022*"MEAN" + 0.022*"HEEARING" + 0.022*"ISROBABLY" + 0.020*"BUIT" + 0.020*"WASYESTERDAY" + 0.019*"DRUGS" + 0.018*"ININSOFAR"'),
 (1,
  '0.022*"MUSIC" + 0.021*"WHAT" + 0.021*"TNGNG" + 0.021*"CHAI" + 0.021*"THIN" + 0.021*"ADMINISTRATION" + 0.017*"STEP" + 0.017*"MENTIONED" + 0.016*"JACKSON5" + 0.016*"CATALOG"'),
 (2,
  '0.024*"PUIC" + 0.024*"NEVERLAND" + 0.024*"JACKSONLOOKED" + 0.024*"ORLOOKED" + 0.021*"HAPPINESS" + 0.019*"MAK" + 0.019*"CUSTODY" + 0.018*"FOLKS" + 0.018*"ITREAL" + 0.017*"OLE"'),
 (3,
  '0.055*"MICHAEL" + 0.040*"JACKSON" + 0.026*"THIHIS" + 0.026*"TOY" + 0.026*"SONGS" + 0.026*"IMAGINETHTHAT" + 0.024*"MOMOM" + 0.021*"WHO" + 0.021*"HAPP" + 0.017*"STREET"'),
 (4,
  '0.033*"PLACE" + 0.026*"VERLAN" + 0.021*"WAS" + 0.020*"KNO" + 0.020*"NEVERLAND" + 0.020*"HEAS" + 0.019*"ANDROLLINGSTONE" + 0.019*"RELEASAS" + 0.019*"STEPS" + 0.019*"DENITELY"'),
 (5,
  '0.026*"AREA" + 0.022*"FRNDS" + 0.022*"OH" + 0.022*"TRIAL" + 0.022*"N