In [13]:
import string
import re
import os
import tempfile
import logging
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from collections import namedtuple
import gensim.parsing.preprocessing as processing
#Number of topics the model should index
numberOfTopics = 125
#Number of passes the model should make
passes = 10
MyAbstract = namedtuple('MyAbstract',["title","date","text"])

In [6]:
table = open("../data/paperTable.tsv","r")
entries = []
for line in table:
    entries.append(line.split('\t'))
table.close()

In [7]:
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend(["\u2019","\u03bc","bee","bees","honey","honeybee","honeybees"])
stopList = set(stopWords)
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().split("\n"))
# Lowercase each document, split it by white space and filter out stopWords
processing.STOPWORDS = stopWords
def removeStops(text):
    return processing.remove_stopwords(text.lower().translate(None, string.punctuation))
# List of lists in 
abstracts = [MyAbstract._make([art[0],datetime.strptime(art[4][:-1], '%Y-%m-%d'),removeStops((art[1]+art[2]+art[3]))]) for art in entries]
abstracts.sort(key=lambda q: q.date.year)

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for abst in abstracts:
    for token in abst[2].split(" "):
        frequency[token] += 1
processedCorpus = [[token for token in abst[2].split(" ") if frequency[token] > 5] for abst in abstracts]

  return " ".join(w for w in s.split() if w not in STOPWORDS)


In [10]:
# Save the dictionary of tokens
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))

In [11]:
# Create general corpus and serialize in order for it to be iterated over
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)

The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [14]:
# Train the model and set number of topics
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics,passes=passes)

2018-06-04 19:42:34,569 : INFO : using symmetric alpha at 0.008
2018-06-04 19:42:34,572 : INFO : using symmetric eta at 0.008
2018-06-04 19:42:34,575 : INFO : using serial LDA version on this node
2018-06-04 19:42:34,683 : INFO : running online (multi-pass) LDA training, 125 topics, 10 passes over the supplied corpus of 1044 documents, updating model once every 1044 documents, evaluating perplexity every 1044 documents, iterating 50x with a convergence threshold of 0.001000
2018-06-04 19:42:37,869 : INFO : -22.686 per-word bound, 6749128.9 perplexity estimate based on a held-out corpus of 1044 documents with 116344 words
2018-06-04 19:42:37,872 : INFO : PROGRESS: pass 0, at document #1044/1044
2018-06-04 19:42:40,542 : INFO : topic #80 (0.008): 0.015*"colony" + 0.014*"pesticide" + 0.010*"imidacloprid" + 0.009*"thiamethoxam" + 0.007*"insecticide" + 0.007*"larval" + 0.007*"insecticides" + 0.006*"larvae" + 0.006*"concentration" + 0.005*"sonication"
2018-06-04 19:42:40,543 : INFO : topic #

2018-06-04 19:42:58,345 : INFO : topic #41 (0.008): 0.012*"range" + 0.012*"mg" + 0.011*"validation" + 0.010*"limits" + 0.010*"al" + 0.010*"future" + 0.010*"©" + 0.010*"001" + 0.010*"et" + 0.009*"residue"
2018-06-04 19:42:58,346 : INFO : topic #73 (0.008): 0.033*"varroa" + 0.020*"mite" + 0.018*"destructor" + 0.016*"acari" + 0.014*"interaction" + 0.013*"apidae" + 0.013*"parasites" + 0.012*"newfoundland" + 0.011*"hostparasite" + 0.010*"canada"
2018-06-04 19:42:58,351 : INFO : topic #18 (0.008): 0.024*"mites" + 0.017*"varroa" + 0.010*"reproduction" + 0.010*"male" + 0.009*"flight" + 0.008*"brood" + 0.008*"sealed" + 0.008*"©" + 0.007*"mitochondria" + 0.007*"insecticide"
2018-06-04 19:42:58,352 : INFO : topic #1 (0.008): 0.017*"nicotine" + 0.011*"neonicotinoid" + 0.010*"colony" + 0.009*"queens" + 0.009*"pesticide" + 0.008*"male" + 0.008*"coli" + 0.007*"thiamethoxam" + 0.007*"©" + 0.007*"genetic"
2018-06-04 19:42:58,357 : INFO : topic diff=inf, rho=0.377964
2018-06-04 19:43:00,611 : INFO : -8.

In [None]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topicOrganizingFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        #Sorts the word topics in decending order based on their greatest phi value
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()

        

In [None]:
topicWords = []
for i in range(0,numberOfTopics):
    t = lda.get_topic_terms(i,50)
    currentWordList = []
    for x in t:
        word = str(dictionary[x[0]])
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
topicListFile = open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+")
for i in range(0,len(topicWords)):
    topicListFile.write("Topic "+str(i)+":\n")
    for j in topicWords[i]:
        topicListFile.write(j+'\n')
    topicListFile.write('\n')
topicListFile.close()

In [None]:
#Makes the top nine terms for each document

topNineFile = open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w")
for abstr in abstracts:
    doc = dictionary.doc2bow(abstracts[2].split()) # Convert to bag of words format first
    # Get the topics and words associated with each document
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topNineFile.write(yearOfAbstract[x]+"\t"+abst+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:15]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")

In [99]:
#Saves the top 5 topics and top 3 words per topic
with open("../data/Docbow/Top5TopicsTop3WordsTop="+str(numberOfTopics)+".tsv","w") as file:
    docs = [dictionary.doc2bow(a.split()) for a in abstracts] # Convert to bag of words format first
    for d in xrange(0,len(docs)):
        topics = lda.get_document_topics(bow=docs[d])
        topics = sorted(topics,key=lambda k:k[1], reverse=True)
        file.write(yearOfAbstract[d]+":\t")
        for t in topics[0:4]:
            for word in [dictionary[x[0]] for x in lda.get_topic_terms(t[0],topn=3)]:
                file.write(word+"\t")
        file.write("\n")

In [9]:
abstracts[0]

MyAbstract(title='World bee health update 1996', date=datetime.datetime(1996, 1, 1, 0, 0), text=u'comprehensive review world published world 199310 updated 199511 supplement correct records given earlier brief update includes new reports records included previous articles series make useful incorporates records published 1995 update read original review reference status almost every country worldapis')