# Coherence Testing From Scratch

In [59]:
from __future__ import print_function
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import gensim
import pandas as pd
import numpy as np
import sys
import re
import itertools
import time
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from nltk.corpus import wordnet

In [60]:
# Toy corpus: nine small, already-tokenized documents.
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
# Vocabulary built from the tokenized documents.
dictionary = Dictionary(texts)
# Bag-of-words representation of every document, produced via the dictionary.
corpus = [dictionary.doc2bow(text) for text in texts]

In [61]:
# Build two LDA models on the same corpus and dictionary with two topics each.
# They differ only in `iterations` (50 vs. 1); the 1-iteration model serves as
# the deliberately "bad" baseline for the coherence comparison.
goodLdaModel = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)
badLdaModel = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)



In [11]:
# U_mass Coherence
# Here the coherence models are given the bag-of-words corpus (not the raw texts).
good_um = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
bad_um = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')

In [12]:
# C_V Coherence
# Here the coherence models are given the tokenized texts (not the bag-of-words corpus).
good_cv = CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
bad_cv = CoherenceModel(model=badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')

In [16]:
# Use print() calls rather than Python 2 print statements: the notebook's import
# cell enables print_function from __future__, under which the statement form
# is a SyntaxError on a fresh Restart & Run All.
print("higher score is better?")
print("good model umass: ", good_um.get_coherence())
print("bad model umass: ", bad_um.get_coherence())
print("good model cv: ", good_cv.get_coherence())
print("bad model cv: ", bad_cv.get_coherence())

higher score is better?
good model umass:  -14.3893687367
bad model umass:  -14.7260284905
good model cv:  0.549597123985
bad model cv:  0.559736419012


# Coherence Testing From Saved Corpus

In [2]:
import pickle
import gensim
import os
import pylab as plt
import numpy as np
import re
import pandas as pd
import seaborn as sb



In [3]:
# Directory from which the saved corpus, dictionary and pickled model are loaded.
_MODELS_DIR = "saved_models/"
# Dataset key; the dictionary and corpus file names are derived from it.
key="Y02E_10_20"
# Base names of the saved model files (m1 is loaded later as m1 + ".obj").
m1 ="DTM_model"
m2 ="DIM_model"
dict_file = "{}.dict".format(key)
corpus_file = "{}.mm".format(key)

In [6]:
# Show which arguments the installed CoherenceModel constructor accepts.
import inspect
inspect.getargspec(CoherenceModel.__init__)

ArgSpec(args=['self', 'model', 'texts', 'corpus', 'dictionary', 'coherence'], varargs=None, keywords=None, defaults=(None, None, None, 'c_v'))

In [5]:
# get corpus
corpus = gensim.corpora.MmCorpus(os.path.join(_MODELS_DIR, corpus_file))

# get dictionary
dictionary = gensim.corpora.Dictionary.load(os.path.join(_MODELS_DIR, dict_file))

# get DTM model
# Pickle data is binary, so the file must be opened in 'rb' mode (text mode
# fails on Python 3 and can corrupt the stream on Windows). The `with` block
# closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted model files.
with open(os.path.join(_MODELS_DIR, m1 + ".obj"), 'rb') as filehandler:
    DTM = pickle.load(filehandler)

In [None]:
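# TODO: recover the tokenized texts from the saved corpus.
# The corpus is stored in serialized bag-of-words form, so each document has to
# be expanded back into a list of tokens before c_v coherence can be computed.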

In [None]:
# Local import: `datetime` is used below but is not imported anywhere else in this notebook.
from datetime import datetime

# Compute c_v coherence once per entry of `topics` and time the whole loop.
# NOTE(review): `topics` is not defined in the visible notebook, and the
# CoherenceModel signature printed earlier in this section lists no `topics`
# argument — confirm the installed gensim version supports it.
start = datetime.now()
c_v = []
skipped = []  # (index, error) for every topic whose coherence could not be computed
for n, topic in enumerate(topics):
    print(n)  # for personal monitoring purposes. sorry for this
    try:
        cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
    except KeyError as err:
        # Still best-effort, but record and report the failure instead of
        # silently dropping it, so that len(c_v) != len(topics) is explainable.
        skipped.append((n, err))
        print("skipping topic %d: %r" % (n, err))
end = datetime.now()
print("Time taken: %s" % (end - start))

# Coherence Testing From Abstracts

In [7]:
from DTM_Pipeline import Pipeline

# read in data
# make corpus, texts etc.
# NOTE(review): Pipeline is project-local; the meaning of key / m_type / num_topics
# is not visible here — verify against DTM_Pipeline.
# The recorded output reports roughly 409 and 178 (presumably seconds) for building
# the dictionary and saving the corpus, so this cell is slow to re-run.
pl = Pipeline(key="Y02E_10_20",m_type="DIM",num_topics=10)
pl.make_corpus()

Making dictionary
408.761168003
saving dictionary
0.0442712306976
Saving corpus
177.700211048


In [48]:
def stream_corp(path,approved_ids=None):
    '''Yield one cleaned, lemmatized token list per selected row of a CSV file.

    The CSV at ``path`` is read one row at a time. For every selected row the
    ``appln_abstract`` value is stripped of HTML-like tags, every character
    other than ASCII letters and spaces is replaced by a space, the text is
    lower-cased and split on whitespace, and the tokens are passed through
    ``stem_doc``.

    path         -- path of the comma-separated input file.
    approved_ids -- optional container of ``appln_id`` values; when given, only
                    rows whose ``appln_id`` is ``in`` it are yielded. ``None``
                    (the default) selects every row.
    '''
    # regex for sanitizing the abstracts
    html = re.compile(r'\<[^\>]*\>')
    nonan = re.compile(r'[^a-zA-Z ]')

    # read file one line at a time and sanitize
    for row in pd.read_csv(path,sep=',', chunksize=1):
        # Identity test, not `== None`: comparing a numpy array or pandas object
        # to None with `==` is element-wise and has no single truth value.
        # NOTE(review): get_time_seq returns approved_ids as a list of arrays;
        # `in` on such a nested list does not test membership of a scalar id —
        # confirm callers pass a flat collection of ids.
        if approved_ids is not None and row["appln_id"].values[0] not in approved_ids:
            continue
        abstract = str(row["appln_abstract"].values[0])
        tokens = nonan.sub(' ',html.sub('',abstract)).lower().split()
        yield stem_doc(tokens) #apply lemmatization/stemming

def get_time_seq(data_file, min_slice_size=None):
    '''Compute per-year time-slice sizes from a CSV of applications.

    Rows are grouped by the calendar year of ``appln_filing_date``.

    data_file      -- path of a CSV with at least the columns
                      ``appln_filing_date`` and ``appln_id``.
    min_slice_size -- ``None`` (default): use every row of every year.
                      Otherwise: keep only years with at least this many rows
                      and take the first ``min_slice_size`` ids of each.

    Returns ``(time_seq, approved_ids)``:
      * ``min_slice_size is None`` -> ``time_seq`` is the list of row counts per
        year in chronological order, ``approved_ids`` is ``None``.
      * otherwise -> ``approved_ids`` is a list with one array of ``appln_id``
        values per retained year (chronological order) and ``time_seq`` is
        ``[min_slice_size] * len(approved_ids)``.
    '''
    df = pd.read_csv(data_file)
    # Helper column holding the filing year. Using .dt.year avoids the "A"
    # period alias, which newer pandas releases deprecate.
    df["Y"] = pd.to_datetime(df["appln_filing_date"]).dt.year
    # groupby sorts its keys, so iteration and size() are in chronological order.
    groups = df.groupby("Y")

    if min_slice_size is None:
        # Count per year, kept paired with its year. The previous
        # np.sort(..., axis=0) sorted the count column independently of the
        # year column and therefore returned the counts in ascending order
        # instead of chronological order.
        time_seq = [int(count) for count in groups.size()]
        return time_seq, None

    # Iterating the GroupBy (rather than dict.iteritems(), which is Python 2
    # only and unordered) keeps the retained slices in chronological order.
    approved_ids = [
        members["appln_id"].values[:min_slice_size]
        for _, members in groups
        if len(members) >= min_slice_size
    ]
    time_seq = [min_slice_size]*len(approved_ids)
    return time_seq, approved_ids

def get_wordnet_pos(treebank_tag):
    '''Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is examined: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb; anything else (including an empty tag)
    falls back to noun.
    '''
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    return wordnet.NOUN

def stem_doc(doc):
    '''Lemmatize every token of ``doc`` using its part-of-speech tag.

    ``doc`` is POS-tagged with ``nltk.pos_tag``; each token is then
    lemmatized with the WordNet POS derived from its tag by
    ``get_wordnet_pos``. Returns the list of lemmas in document order.
    '''
    lemmatizer = WordNetLemmatizer()
    tagged = nltk.pos_tag(doc)
    lemmas = []
    for token, pair in zip(doc, tagged):
        # pair[1] is the tag part of the tagger's output for this token.
        wn_pos = get_wordnet_pos(pair[1])
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return lemmas

def progress(percent):
    '''Redraw a one-line text progress bar on stdout.

    percent -- completed fraction, expected in the range 0..1 (e.g. 0.5 for
               half done). The bar is 10 characters wide and the fraction is
               shown multiplied by 100.

    The line starts with a carriage return and ends without a newline, so
    successive calls overwrite each other in place.
    '''
    width = 10
    # np.floor returns a float; a string can only be repeated by an int,
    # so convert before building the bar.
    filled = int(np.floor(percent*width))
    prog = "="*filled + ">"
    # The original if/else on `percent != 1` had two identical branches; a
    # single print produces exactly the same output for every input.
    print('\r' + 'Progress: [{}] {}%'.format(prog.ljust(width+1), str(percent*100)),end="")
    sys.stdout.flush()

In [58]:
# Dataset key; selects the CSV file under ../Data/.
key="Y02E_10_20"
data_file = '../Data/{}.csv'.format(key)
# Passed to get_time_seq: only years with at least this many rows are kept,
# and each kept year contributes its first 200 ids.
min_slice_size=200
time_seq, approved_ids = get_time_seq(data_file, min_slice_size)

In [9]:
# Time a single call to pl.corpus.get_texts(); the return value is not kept.
t0 = time.time()
pl.corpus.get_texts()
print(time.time() - t0)

191.520123005


In [22]:
# Inverse vocabulary: token id -> token string.
# .items() works on both Python 2 and Python 3; .iteritems() exists only on Python 2.
ivd = {v: k for k, v in dictionary.token2id.items()}

u'limited'

In [24]:
# Keep the first document of the pipeline corpus for inspection.
doc = pl.corpus.docs[0]

In [35]:
# Expand every bag-of-words document back into a flat token list:
# each (token_id, count) entry contributes `count` copies of its token.
texts = []
for doc in pl.corpus.docs:
    text = []
    for word in doc:
        text.extend([ivd[word[0]]] * word[1])
    texts.append(text)
len(texts)

6400

In [44]:
# create training/validation/testing splits.
# Only the first 800 documents are used; the trailing comments hold alternative
# slice bounds (presumably for a run over the full corpus — confirm).
train = pl.corpus.docs[:400]  # [:4800]
val = pl.corpus.docs[400:600] # [4800:6000]
test = pl.corpus.docs[600:800] # [6000:]

# Slice sizes handed to DtmModel together with the matching split
# (train: two slices of 200 documents; val and test: one slice of 200 each).
train_ts = [200,200]
val_ts = [200]
test_ts = [200]

# Token lists for the same document ranges as the splits above.
train_texts = texts[:400]
val_texts = texts[400:600]
test_texts = texts[600:800]

In [86]:
# English stop words from NLTK, passed to make_dict for each split.
# NOTE(review): make_dict is neither defined nor imported in the visible part of
# this notebook, so this cell fails on a fresh kernel unless it is defined elsewhere.
stoplist = set(nltk.corpus.stopwords.words("english"))
train_dict = make_dict(train, stoplist=stoplist)
val_dict = make_dict(val, stoplist=stoplist)
test_dict = make_dict(test, stoplist=stoplist)

In [39]:
# Number of topics; the same value is used for the LDA, DTM and DIM models.
nt = 10
# Static LDA
# Trained on the training split with the pipeline's dictionary as id2word.
lda = gensim.models.LdaModel(train, id2word=pl.corpus.dictionary, num_topics=nt)



In [43]:
# Locate the DTM binary: $DTM_HOME if set, otherwise the relative "dtm-master" directory.
dtm_home = os.environ.get('DTM_HOME', "dtm-master")
# NOTE(review): 'dtm-darwin64' is presumably the macOS build of the binary —
# confirm the file name when running on another platform.
dtm_path = os.path.join(dtm_home, 'bin', 'dtm-darwin64') if dtm_home else None

# DTM
# Trained on the training split with its slice sizes (train_ts).
dtm = DtmModel(dtm_path,train,train_ts,num_topics=nt,id2word=pl.corpus.dictionary,initialize_lda=True)

In [45]:
# DIM
# Same setup as the DTM model but with model="fixed".
# `train` comes from pl.corpus.docs, so its token ids belong to
# pl.corpus.dictionary; use that dictionary here as well, consistent with the
# LDA and DTM models and with the coherence models built on `dim` later
# (the bare `dictionary` variable is whatever an earlier cell last loaded).
dim = DtmModel(dtm_path,train,train_ts,num_topics=nt,model="fixed",id2word=pl.corpus.dictionary,initialize_lda=True)

In [2]:
# test coherence

In [52]:
# U_mass Coherence
# One coherence model per trained model, each given the training split as corpus.
lda_um = CoherenceModel(model=lda, corpus=train, dictionary=pl.corpus.dictionary, coherence='u_mass')
dtm_um = CoherenceModel(model=dtm, corpus=train, dictionary=pl.corpus.dictionary, coherence='u_mass')
dim_um = CoherenceModel(model=dim, corpus=train, dictionary=pl.corpus.dictionary, coherence='u_mass')

# C_V coherence for the same three models, each given the training token lists.
lda_cv = CoherenceModel(model=lda, texts=train_texts, dictionary=pl.corpus.dictionary, coherence='c_v')
dtm_cv = CoherenceModel(model=dtm, texts=train_texts, dictionary=pl.corpus.dictionary, coherence='c_v')
dim_cv = CoherenceModel(model=dim, texts=train_texts, dictionary=pl.corpus.dictionary, coherence='c_v')

In [49]:
# u_mass coherence of the LDA, DTM and DIM models, in that order.
# NOTE(review): the recorded output shows nan for the DTM and DIM models; the
# cause is not established in this notebook.
print(lda_um.get_coherence())
print(dtm_um.get_coherence())
print(dim_um.get_coherence())

-1.44316792646
nan
nan




In [54]:
#print(lda_cv.get_coherence())
#print(dtm_um.get_coherence())
#print(dim_um.get_coherence())

nan


In [55]:
# get corpus
corpus = gensim.corpora.MmCorpus(os.path.join(_MODELS_DIR, corpus_file))
# NOTE(review): this displays every document of the corpus as the cell output;
# the recorded output shows (token_id, count) pairs with float counts.
list(corpus)

[[(28, 2.0),
  (131, 1.0),
  (141, 1.0),
  (155, 1.0),
  (185, 1.0),
  (209, 1.0),
  (248, 1.0),
  (330, 1.0),
  (415, 1.0),
  (485, 1.0),
  (502, 1.0),
  (521, 1.0),
  (583, 1.0),
  (632, 1.0),
  (691, 2.0),
  (726, 2.0),
  (762, 2.0),
  (817, 1.0),
  (951, 1.0),
  (997, 1.0),
  (1031, 1.0),
  (1163, 1.0),
  (1199, 1.0),
  (1319, 1.0),
  (1328, 3.0),
  (1408, 1.0),
  (1499, 1.0),
  (1613, 1.0),
  (1618, 2.0),
  (1726, 1.0),
  (1780, 1.0),
  (1872, 9.0),
  (1966, 1.0),
  (2033, 1.0),
  (2065, 1.0),
  (2171, 5.0),
  (2173, 2.0),
  (2197, 1.0),
  (2203, 2.0),
  (2215, 1.0),
  (2265, 3.0),
  (2401, 3.0),
  (2415, 1.0),
  (2504, 2.0),
  (2645, 1.0),
  (2703, 5.0),
  (2820, 9.0)],
 [(36, 1.0),
  (55, 1.0),
  (74, 6.0),
  (141, 2.0),
  (165, 1.0),
  (212, 2.0),
  (221, 6.0),
  (283, 1.0),
  (330, 1.0),
  (350, 1.0),
  (358, 3.0),
  (437, 1.0),
  (460, 1.0),
  (521, 1.0),
  (631, 1.0),
  (761, 1.0),
  (762, 1.0),
  (779, 1.0),
  (837, 1.0),
  (868, 1.0),
  (946, 1.0),
  (970, 1.0),
  (1077, 6

In [57]:
# Rebuild token lists from the saved corpus. Its counts are floats (e.g. 2.0),
# so each count is converted to int before repeating the token.
texts = []
for doc in corpus:
    text = []
    for word in doc:
        text.extend([ivd[word[0]]] * int(word[1]))
    texts.append(text)
len(texts)

6400