In [1]:
import string
import re
import os
import tempfile
import logging
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
import gensim.parsing.preprocessing as processing
numberOfTopics = 125



In [2]:
table = open("../data/paperTable.tsv","r")
entries = []
for line in table:
    entries.append(line.split('\t'))
table.close()

In [27]:
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend(["\u2019","\u03bc","bee","bees","honey","honeybee","honeybees"])
stopList = set(stopWords)
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().split("\n"))
# Lowercase each document, split it by white space and filter out stopWords
stopWords.remove('')
processing.STOPWORDS = stopWords
def removeStops(text):
    return processing.remove_stopwords(text.lower().translate(None, string.punctuation))
abstracts = [[art[0],datetime.strptime(art[4][:-1], '%Y-%m-%d'),removeStops((art[1]+art[2]+art[3])) ] for art in entries]
abstracts.sort(key=lambda q: q[1].year)

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for abst in abstracts:
    for token in abst[2].split(" "):
        frequency[token] += 1
processedCorpus = [[token for token in abst[2].split(" ") if frequency[token] > 5] for abst in abstracts]

In [96]:
# Save the dictionary of tokens
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))

2018-05-25 15:31:25,062 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-05-25 15:31:25,250 : INFO : built Dictionary(3860 unique tokens: [u'limited', u'represent', u'believed', u'alleles', u'copy']...) from 1044 documents (total 148749 corpus positions)
2018-05-25 15:31:25,252 : INFO : saving Dictionary object under d:\temp\words.dict, separately None
2018-05-25 15:31:25,256 : INFO : saved d:\temp\words.dict


In [97]:
# Create general corpus and serialize in order for it to be iterated over
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)

2018-05-25 15:31:25,430 : INFO : storing corpus in Matrix Market format to d:\temp\words.dict
2018-05-25 15:31:25,433 : INFO : saving sparse matrix to d:\temp\words.dict
2018-05-25 15:31:25,434 : INFO : PROGRESS: saving document #0
2018-05-25 15:31:25,868 : INFO : PROGRESS: saving document #1000
2018-05-25 15:31:25,888 : INFO : saved 1044x3860 matrix, density=2.473% (99651/4029840)
2018-05-25 15:31:25,888 : INFO : saving MmCorpus index to d:\temp\words.dict.index


The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [98]:
# Train the model and set number of topics
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics)

2018-05-25 15:31:25,903 : INFO : using symmetric alpha at 0.008
2018-05-25 15:31:25,904 : INFO : using symmetric eta at 0.008
2018-05-25 15:31:25,904 : INFO : using serial LDA version on this node
2018-05-25 15:31:25,960 : INFO : running online (single-pass) LDA training, 125 topics, 1 passes over the supplied corpus of 1044 documents, updating model once every 1044 documents, evaluating perplexity every 1044 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-25 15:31:28,707 : INFO : -19.735 per-word bound, 872495.8 perplexity estimate based on a held-out corpus of 1044 documents with 148749 words
2018-05-25 15:31:28,709 : INFO : PROGRESS: pass 0, at document #1044/1044
2018-05-25 15:31:30,743 : INFO : topic #8 (0.008): 0.018*"gene" + 0.012*"behavior" + 0.011*"mellifera" + 0.010*"animal" + 0.010*"species" + 0.009*"genes" + 0.008*"health" + 0.008*"apis" + 0.007*"pollen" + 0.007*"expression"
2018-05-25 15:31:30,743 : INFO : topic #73 (0.008): 0.024*"colonies" + 0.0

In [None]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topicOrganizingFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        #Sorts the word topics in decending order based on their greatest phi value
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()

        

In [None]:
topicWords = []
for i in range(0,numberOfTopics):
    t = lda.get_topic_terms(i,50)
    currentWordList = []
    for x in t:
        word = str(dictionary[x[0]])
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
topicListFile = open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+")
for i in range(0,len(topicWords)):
    topicListFile.write("Topic "+str(i)+":\n")
    for j in topicWords[i]:
        topicListFile.write(j+'\n')
    topicListFile.write('\n')
topicListFile.close()

In [None]:
#Makes the top nine terms for each document

topNineFile = open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split()) # Convert to bag of words format first
    # Get the topics and words associated with each document
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topNineFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:15]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")

In [99]:
#Saves the top 5 topics and top 3 words per topic
with open("../data/Docbow/Top5TopicsTop3WordsTop="+str(numberOfTopics)+".tsv","w") as file:
    docs = [dictionary.doc2bow(a.split()) for a in abstracts] # Convert to bag of words format first
    for d in xrange(0,len(docs)):
        topics = lda.get_document_topics(bow=docs[d])
        topics = sorted(topics,key=lambda k:k[1], reverse=True)
        file.write(yearOfAbstract[d]+":\t")
        for t in topics[0:4]:
            for word in [dictionary[x[0]] for x in lda.get_topic_terms(t[0],topn=3)]:
                file.write(word+"\t")
        file.write("\n")

[[u'comprehensive',
  u'review',
  u'world',
  u'published',
  u'world',
  u'supplement',
  u'records',
  u'given',
  u'earlier',
  u'includes',
  u'new',
  u'reports',
  u'records',
  u'included',
  u'previous',
  u'articles',
  u'series',
  u'make',
  u'useful',
  u'incorporates',
  u'records',
  u'published',
  u'original',
  u'review',
  u'reference',
  u'status',
  u'almost',
  u'every',
  u'country'],
 [u'viruses',
  u'known',
  u'long',
  u'recently',
  u'attention',
  u'scientists',
  u'towards',
  u'relationship',
  u'viruses',
  u'parasitic',
  u'mite',
  u'varroa',
  u'jacobsoni',
  u'although',
  u'clinical',
  u'symptoms',
  u'indicated',
  u'viruses',
  u'hungary',
  u'none',
  u'previously',
  u'isolated',
  u'identified',
  u'july',
  u'unusual',
  u'brood',
  u'mortality',
  u'apiary',
  u'known',
  u'infested',
  u'varroa',
  u'jacobsoni',
  u'large',
  u'amounts',
  u'acute',
  u'paralysis',
  u'virus',
  u'healthy',
  u'pupae',
  u'killed',
  u'injection',
  u'extra