In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
import sys
import gensim

sys.path.append(os.getcwd())
from utility import *

In [61]:
import smart_open
from nltk.corpus import stopwords
STOP_WORDS = stopwords.words('english')

In [3]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="utf-8") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line, min_len=3)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
train_corpus = list(read_corpus("/Volumes/White/training/bbchealth.txt"))

In [6]:
test_corpus = list(read_corpus("/Volumes/White/training/cbchealth.txt", tokens_only=True))

In [117]:
model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [118]:
model2.build_vocab(train_corpus)

In [120]:
model2.train(train_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)


In [122]:
model2.vocabulary.

<gensim.models.doc2vec.Doc2VecVocab at 0x134f44f28>

In [123]:
for word in model2.vocabulary:
    print(vocabulary)

TypeError: 'Doc2VecVocab' object is not iterable

In [12]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [13]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 299, 1: 101, 2: 79, 3: 73, 6: 52, 5: 49, 4: 47, 7: 44, 8: 43, 10: 42, 9: 40, 11: 37, 17: 37, 14: 37, 13: 33, 27: 33, 21: 32, 19: 31, 20: 30, 12: 30, 26: 29, 47: 29, 18: 28, 24: 28, 15: 27, 29: 26, 23: 26, 40: 24, 41: 24, 22: 24, 32: 23, 48: 23, 25: 22, 33: 22, 43: 22, 79: 21, 16: 21, 28: 21, 54: 20, 42: 20, 30: 19, 31: 18, 78: 18, 35: 18, 38: 18, 55: 17, 34: 17, 36: 17, 60: 17, 50: 17, 84: 17, 94: 17, 68: 17, 45: 17, 90: 16, 88: 16, 70: 16, 100: 16, 66: 16, 52: 15, 37: 15, 106: 15, 73: 15, 138: 15, 61: 15, 46: 15, 82: 14, 58: 14, 101: 14, 51: 14, 80: 14, 71: 14, 111: 14, 39: 14, 91: 14, 115: 14, 112: 14, 74: 13, 107: 13, 76: 13, 93: 13, 75: 13, 110: 13, 127: 13, 67: 13, 179: 13, 81: 13, 92: 13, 63: 13, 118: 13, 59: 13, 65: 13, 44: 12, 89: 12, 166: 12, 133: 12, 139: 12, 124: 12, 121: 12, 120: 12, 102: 12, 56: 12, 53: 12, 159: 11, 131: 11, 155: 11, 69: 11, 174: 11, 119: 11, 98: 11, 62: 11, 104: 11, 128: 11, 77: 11, 87: 11, 156: 11, 147: 10, 64: 10, 132: 10, 130: 10, 99: 10, 1

In [18]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (3928): «diff manslaughter inquiry call»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1913, 0.9228657484054565): «hospital deaths inquiry announced»

SECOND-MOST (3245, 0.9043088555335999): «trust apologises over dna failings»

MEDIAN (135, 0.8058278560638428): «testosterone boost could cut deaths»

LEAST (1849, -0.890856146812439): «audio uk mum spent to have girl»



In [19]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (18): «public back tax rises to fund nhs»

Similar Document (532, 0.7212491631507874): «video could volunteers help crisis»



In [42]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

AttributeError: 'LdaModel' object has no attribute 'infer_vector'

In [25]:
model.vocabulary

<gensim.models.doc2vec.Doc2VecVocab at 0x12a0f0390>

In [27]:
len(test_corpus)

3741

In [65]:
docs = [[token for token in doc if not token.isnumeric()] for doc in test_corpus]

In [66]:
docs = [[token for token in doc if token not in STOP_WORDS and len(token) > 1] for doc in docs]

In [67]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [68]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [69]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.3)

In [70]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [71]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 547
Number of documents: 3741


In [72]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 50
iterations = 500
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [76]:
top_topics = model.top_topics(corpus, topn=10) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -7.6332.
[([(0.03730305, 'study'),
   (0.027884427, 'health'),
   (0.02675366, 'child'),
   (0.02642206, 'risk'),
   (0.022995073, 'may'),
   (0.022529446, 'say'),
   (0.021927454, 'flu'),
   (0.020281803, 'canada'),
   (0.019442543, 'find'),
   (0.018828705, 'heart')],
  -3.898933885195516),
 ([(0.05046783, 'health'),
   (0.047932595, 'patient'),
   (0.033811655, 'care'),
   (0.027102731, 'hospital'),
   (0.026161684, 'say'),
   (0.025588393, 'help'),
   (0.020499807, 'woman'),
   (0.018278828, 'canada'),
   (0.016977787, 'cancer'),
   (0.016194953, 'get')],
  -4.29935486905076),
 ([(0.11225357, 'ebola'),
   (0.04032292, 'doctor'),
   (0.039274562, 'outbreak'),
   (0.028245462, 'say'),
   (0.022619862, 'ebola_outbreak'),
   (0.02203843, 'health'),
   (0.01969455, 'vaccine'),
   (0.0174284, 'mental'),
   (0.014844919, 'virus'),
   (0.014119089, 'case')],
  -5.8833361757886085),
 ([(0.021932602, 'life'),
   (0.021549134, 'doctor'),
   (0.019754348, 'say'),
   (0

In [80]:
top_topics[0][0][0][1]

'study'

In [86]:
for data in top_topics:
    words, coh = data
    for word in words:
        print(word[1],end=' ')
    print('%.2f.' % coh)

study health child risk may say flu canada find heart -3.90.
health patient care hospital say help woman canada cancer get -4.30.
ebola doctor outbreak say ebola_outbreak health vaccine mental virus case -5.88.
life doctor say quebec recalled concussion could teen rule safety -11.81.
cancer medical case food measles kid marijuana school man death -12.28.


In [43]:
help(LdaModel.top_topics)

Help on function top_topics in module gensim.models.ldamodel:

top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None, coherence='u_mass', topn=20, processes=-1)
    Get the topics with the highest coherence score the coherence for each topic.
    
    Parameters
    ----------
    corpus : iterable of list of (int, float), optional
        Corpus in BoW format.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
        probability estimator .
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Gensim dictionary mapping of id word to create corpus.
        If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
    window_size : int, optional
        Is the size of the window to be used for coherence measures using boolean sliding window as their
        probability estimator. For '

In [89]:
result = model.inference(corpus)

In [92]:
pprint(result[0])

array([[4.202327  , 0.14931256, 0.1811558 , 0.17285907, 0.13248737],
       [0.20737213, 0.14881411, 0.17966826, 0.17013313, 1.1322451 ],
       [0.21157056, 0.14905141, 0.18271449, 3.1623259 , 0.13253179],
       ...,
       [1.2044965 , 0.14944501, 0.18024164, 0.17137441, 0.13267583],
       [0.20907094, 0.14918493, 0.18008256, 1.1696776 , 2.1300695 ],
       [0.21486135, 0.1490978 , 0.1811795 , 2.161536  , 2.1314466 ]],
      dtype=float32)


In [96]:
result = result[0]

In [103]:
np.where(result==np.max(result,axis=1))

  """Entry point for launching an IPython kernel.


(array([], dtype=int64),)

In [102]:
np.max(result,axis=1)

array([4.202327 , 1.1322451, 3.1623259, ..., 1.2044965, 2.1300695,
       2.161536 ], dtype=float32)

In [109]:
model.log_perplexity(corpus)

-6.107740176398475

In [112]:
for i in range(num_topics):
    for x in model.show_topic(i):
        print(x[0], end=" ")
    print()

study health child risk may say flu canada find heart 
cancer medical case food measles kid marijuana school man death 
ebola doctor outbreak say ebola_outbreak health vaccine mental virus case 
health patient care hospital say help woman canada cancer get 
life doctor say quebec recalled concussion could teen rule safety 
