In [37]:
import string
import re
import os
import tempfile
import logging
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
numberOfTopics = 200

In [38]:
table = open("../data/paperTable.tsv","r")
entries = []
for line in table:
    entries.append(line.split('\t'))
table.close()

In [39]:
# Read in abstract year of publication, title of abstract, and abstract text
abstracts = []
titleOfAbstract = []
yearOfAbstract = []
for articles in entries:
    titleOfAbstract.append(articles[0])
    abstracts.append(articles[1]+articles[2]+articles[3])
    yearOfAbstract.append(articles[4][:-1])

In [40]:
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend(["\u2019","\u03bc","bee","bees","honey","honeybee","honeybees"])
stopList = set(stopWords)

# Lowercase each document, split it by white space and filter out stopWords
texts = []
for document in abstracts:
    docwords = []
    for word in document.lower().split():
        word = re.sub(r'[^\w\s]','',word)
        word = re.sub(r'\.+$','',word)
        isNumber = re.compile('^[0-9]+$')
        if isNumber.search(word):
            word = ''
        if word not in stopList and word!='':
            docwords.append(word)
    texts.append(docwords)

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
processedCorpus = [[token for token in text if frequency[token] > 5] for text in texts]

In [41]:
# Save the dictionary of tokens
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))

2018-05-21 18:57:41,970 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-05-21 18:57:42,157 : INFO : built Dictionary(3860 unique tokens: [u'limited', u'represent', u'believed', u'alleles', u'copy']...) from 1044 documents (total 148749 corpus positions)
2018-05-21 18:57:42,157 : INFO : saving Dictionary object under d:\temp\words.dict, separately None
2018-05-21 18:57:42,161 : INFO : saved d:\temp\words.dict


In [42]:
# Create general corpus and serialize in order for it to be iterated over
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)

2018-05-21 18:57:42,332 : INFO : storing corpus in Matrix Market format to d:\temp\words.dict
2018-05-21 18:57:42,335 : INFO : saving sparse matrix to d:\temp\words.dict
2018-05-21 18:57:42,338 : INFO : PROGRESS: saving document #0
2018-05-21 18:57:42,785 : INFO : PROGRESS: saving document #1000
2018-05-21 18:57:42,805 : INFO : saved 1044x3860 matrix, density=2.473% (99651/4029840)
2018-05-21 18:57:42,806 : INFO : saving MmCorpus index to d:\temp\words.dict.index


The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [43]:
# Train the model and set number of topics
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics)

2018-05-21 18:57:45,073 : INFO : using symmetric alpha at 0.005
2018-05-21 18:57:45,075 : INFO : using symmetric eta at 0.005
2018-05-21 18:57:45,078 : INFO : using serial LDA version on this node
2018-05-21 18:57:45,164 : INFO : running online (single-pass) LDA training, 200 topics, 1 passes over the supplied corpus of 1044 documents, updating model once every 1044 documents, evaluating perplexity every 1044 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-21 18:57:48,818 : INFO : -28.517 per-word bound, 384215018.5 perplexity estimate based on a held-out corpus of 1044 documents with 148749 words
2018-05-21 18:57:48,819 : INFO : PROGRESS: pass 0, at document #1044/1044
2018-05-21 18:57:51,676 : INFO : topic #171 (0.005): 0.012*"species" + 0.012*"bacterial" + 0.012*"spiroplasma" + 0.010*"pollen" + 0.009*"genome" + 0.008*"apis" + 0.007*"bacteria" + 0.007*"lactobacillus" + 0.007*"analysis" + 0.006*"food"
2018-05-21 18:57:51,677 : INFO : topic #70 (0.005): 0.016*

In [8]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topicOrganizingFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        #Sorts the word topics in decending order based on their greatest phi value
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()

        

44
mites
44
preference
44
show
88
data
88
classification
88
important
88
algorithm
88
beehive
88
sensor
88
network
88
monitoring
84
foraging
84
protein
84
sucrose
84
dietary
74
cost
74
often
74
storage
74
immune
74
bacteria
74
strains
86
expression
86
queens
86
viability
66
resins
66
bud
66
antimicrobial
106
thiacloprid
106
parts
106
colonies
106
food
106
brood
106
larval
106
jelly
106
concentrations
106
delivery
55
interactions
55
isolation
55
presence
55
colonies
55
treatment
55
subtropical
57
mercedesae
57
channel
57
two
57
compounds
57
receptor
57
apiculture
62
analytical
62
limits
62
method
107
pollen
107
mixes
107
flower
60
mays
60
diversity
60
highest
60
contribution
60
sources
60
pollen
60
neonicotinoid
60
foraging
31
effective
31
system
31
design
112
mortality
112
paralysis
112
samples
112
colony
112
health
112
hive
63
colonies
63
possible
63
described
63
collapse
63
pesticides
63
studies
81
abdomen
81
differentially
81
show
81
physiological
81
enzyme
81
effects
92
selenium
92

In [44]:
topicWords = []
for i in range(0,numberOfTopics):
    t = lda.get_topic_terms(i,50)
    currentWordList = []
    for x in t:
        word = str(dictionary[x[0]])
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
topicListFile = open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+")
for i in range(0,len(topicWords)):
    topicListFile.write("Topic "+str(i)+":\n")
    for j in topicWords[i]:
        topicListFile.write(j+'\n')
    topicListFile.write('\n')
topicListFile.close()

In [12]:
#Makes the top nine terms for each document

topNineFile = open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split()) # Convert to bag of words format first
    # Get the topics and words associated with each document
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topNineFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:9]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")

In [45]:
with open('../data/Docbow/TopFifteenFor'+str(numberOfTopics)+'.tsv','w') as top15:
    for i in range(0,5):
        [top15.write(word + ',') for word in topicWords[i][0:3]]