In [1]:
import string
import re
import os
import tempfile
import logging
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
numberOfTopics = 25

In [2]:
table = open("../data/paperTable.tsv","r")
entries = []
for line in table:
    entries.append(line.split('\t'))
table.close()

In [3]:
# Read in abstract year of publication, title of abstract, and abstract text
abstracts = []
titleOfAbstract = []
yearOfAbstract = []
for articles in entries:
    titleOfAbstract.append(articles[0])
    abstracts.append(articles[1]+articles[2]+articles[3])
    yearOfAbstract.append(articles[4][:-1])

In [4]:
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend(["\u2019","\u03bc","bee","bees","honey","honeybee","honeybees"])
stopList = set(stopWords)

# Lowercase each document, split it by white space and filter out stopWords
texts = []
for document in abstracts:
    docwords = []
    for word in document.lower().split():
        word = re.sub(r'[^\w\s]','',word)
        word = re.sub(r'\.+$','',word)
        isNumber = re.compile('^[0-9]+$')
        if isNumber.search(word):
            word = ''
        if word not in stopList and word!='':
            docwords.append(word)
    texts.append(docwords)

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
processedCorpus = [[token for token in text if frequency[token] > 5] for text in texts]

In [5]:
# Save the dictionary of tokens
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))

In [6]:
# Create general corpus and serialize in order for it to be iterated over
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)

The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [7]:
# Train the model and set number of topics
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics)

2018-05-02 09:05:38,592 : INFO : using symmetric alpha at 0.04
2018-05-02 09:05:38,593 : INFO : using symmetric eta at 0.04
2018-05-02 09:05:38,595 : INFO : using serial LDA version on this node
2018-05-02 09:05:38,610 : INFO : running online (single-pass) LDA training, 25 topics, 1 passes over the supplied corpus of 1044 documents, updating model once every 1044 documents, evaluating perplexity every 1044 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-02 09:05:39,777 : INFO : -10.155 per-word bound, 1140.1 perplexity estimate based on a held-out corpus of 1044 documents with 148749 words
2018-05-02 09:05:39,779 : INFO : PROGRESS: pass 0, at document #1044/1044
2018-05-02 09:05:40,576 : INFO : topic #8 (0.040): 0.012*"pollen" + 0.010*"pesticides" + 0.008*"colony" + 0.007*"pesticide" + 0.007*"mellifera" + 0.006*"colonies" + 0.006*"exposure" + 0.006*"apis" + 0.006*"effects" + 0.005*"study"
2018-05-02 09:05:40,578 : INFO : topic #9 (0.040): 0.011*"virus" + 0.009

In [9]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topicOrganizingFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        #Sorts the word topics in decending order based on their greatest phi value
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()

        

44
mites
44
nurses
44
show
88
data
88
algorithm
88
beehive
84
protein
84
sucrose
84
survival
84
facilitated
74
bacteria
74
immune
74
environmental
86
expression
86
doses
86
queens
66
resins
66
bud
66
antimicrobial
106
food
106
brood
106
larval
55
colonies
55
virus
55
treatment
55
climate
55
subtropical
55
temperate
57
mercedesae
57
major
57
arthropod
57
channel
57
two
57
activated
62
analytical
62
method
62
products
107
pollen
107
species
107
seed
60
pollen
60
neonicotinoid
60
foraging
31
support
31
system
31
design
112
samples
112
monitoring
112
environmental
112
mortality
112
land
112
positively
112
colony
112
losses
112
presence
63
collapse
63
mainly
63
causes
81
physiological
81
enzyme
81
effects
92
sublethal
92
selenium
92
exposure
92
consuming
92
healthy
92
provides
126
gene
126
genes
126
expression
27
populations
27
destructor
27
mite
95
virus
95
mites
95
effects
43
paralysis
43
disease
43
virus
43
destructor
43
prevalence
43
stationary
58
variability
58
environmental
58
conditi

86
caste
70
viruses
70
use
70
application
70
availability
70
particles
70
virions
61
placed
61
repellent
61
treatment
67
goals
67
pesticides
67
nest
67
propolis
67
immune
67
also
67
research
67
use
67
architecture
37
maize
37
events
37
seed
82
genetic
82
managed
82
feral
69
colony
69
agricultural
69
weight
71
infection
71
bacteria
71
used
80
ranging
80
fungicides
80
almond
80
forager
80
face
80
utilized
80
completely
45
relies
45
minimum
45
production
45
acaricide
45
ppm
45
contact
65
infestation
65
colonies
65
destructor
56
efficacy
56
used
56
control
76
winter
76
pesticides
76
food
76
overwintered
92
infestation
92
ectoparasitic
92
correlate
92
mite
92
grooming
92
adult
47
forager
47
model
47
hive
83
inbreeding
83
microsatellite
83
compare
83
genetic
83
brood
83
population
74
wildflower
74
orchard
74
crop
74
flowering
74
visits
74
addressed
74
wild
74
pollinator
30
g
30
insecticides
30
cypermethrin
67
phase
67
induction
67
enzyme
67
worker
67
scientific
67
vitro
54
ppm
54
boscalid
54

94
health
94
gut
94
strains
49
obtained
49
rates
49
adult
83
host
83
establishment
83
ileum
83
diet
83
diets
83
development
83
corresponding
83
age
87
usually
87
flumethrin
87
situated
87
rape
87
oilseed
87
hives
82
analyse
82
series
82
suitability
82
defence
82
pesticide
82
effects
82
pathogen
103
revealing
103
reflect
103
cloning
103
genetic
103
ceranae
103
recombination
66
method
66
analytical
66
pollen
66
formic
66
acid
66
neonicotinoid
85
source
85
best
85
open
85
reproduction
53
mortality
53
mite
53
air
53
exposure
53
moderate
53
control
53
oils
53
oil
53
caused
73
leading
73
pesticide
73
elicit
73
light
73
acetylcholine
73
concerning
73
alterations
38
sensor
38
different
38
networks
63
pesticide
63
colony
63
wax
81
commercial
81
apiary
81
pollinator
40
genus
40
parasitoid
40
identified
40
infestation
40
condition
40
serious
57
survey
57
wax
57
east
57
invertebrate
57
percentage
57
inform
57
remote
57
population
57
virus
85
pollen
85
variations
85
nutritional
81
cerana
81
general

77
acute
48
appear
48
ecosystem
48
services
131
belonging
131
producing
131
compromise
131
locomotor
131
observed
131
sublethal
57
winter
57
crops
57
pollinator
75
arable
75
exposure
75
neonicotinoids
75
environmental
75
found
75
pollen
83
land
83
colony
83
cover
83
plots
43
males
43
sensory
43
female
71
conditions
71
mortality
71
role
71
effects
71
stressors
71
laboratory
57
worker
57
median
57
concentrations
79
colony
79
produce
79
turn
79
propolis
79
immune
79
envelope
64
mites
64
inoculated
64
mite
66
immune
66
effects
66
response
94
pathways
94
conditioned
94
changes
94
learning
94
memory
94
sublethal
57
virus
57
prevalence
57
distribution
57
detected
57
showing
57
pathogen
56
assigned
56
treatment
56
colonies
56
treated
56
infestation
56
decreases
38
pollen
38
acid
38
acids
29
used
29
colonies
29
commercial
29
select
29
destructor
52
colony
52
system
52
insecticides
39
continued
39
trading
39
winter
39
worker
39
commercial
39
colony
39
effect
39
spore
39
syrup
83
differences
83
s

82
giant
82
exposure
82
past
82
step
82
predicted
97
knowledge
97
marine
97
effects
80
latter
80
means
80
wild
80
professional
80
distance
80
reveal
80
colony
80
mellificae
80
strain
58
detection
58
method
58
capillary
53
pollen
53
liquid
53
wax
45
habitat
45
mite
45
mites
45
information
45
populations
45
pathogens
69
lines
69
available
69
protein
69
virus
69
capsids
105
opportunity
105
relationship
105
rates
105
virus
105
infestation
105
titres
86
seed
86
production
86
levels
86
adjacent
83
quantity
83
presence
83
controlled
83
feral
83
managed
83
wing
83
colonies
83
paired
83
measures
122
colony
122
queen
122
genetic
122
neonicotinoid
122
adverse
122
overwintered
98
considering
98
weight
98
logistic
98
destructor
98
mite
98
infestation
51
food
51
human
51
population
51
pest
51
integrated
51
management
98
load
98
visits
98
aims
98
unexplained
65
expression
65
gene
65
genes
65
involved
65
differences
65
included
65
diets
65
lipid
65
nectar
43
risk
43
assessment
43
use
47
exposure
47
pe

45
exposure
45
water
45
fewer
64
wax
64
majority
64
originating
64
pathogens
64
mellifera
64
parasites
85
synergistically
85
uptake
85
larval
85
plants
85
wild
85
ceranae
85
parasite
85
summer
85
winter
85
pathogen
85
colony
85
presence
57
surfactants
57
trisiloxane
57
liquid
90
pollen
90
physiology
90
nurse
90
ceranae
66
exclusion
66
results
66
effects
77
mite
77
mites
77
test
77
brood
77
detecting
77
apiary
95
colony
95
tool
95
policy
95
foraging
95
built
95
testing
99
pesticides
99
pollen
99
fungicide
99
exposed
99
found
99
high
40
oxalic
40
variations
40
usage
40
acid
40
causes
40
body
54
et
54
parasitized
54
parasitization
54
lead
54
declines
54
mechanisms
84
acts
84
collective
96
establishing
96
ileum
96
rapidly
96
immune
96
infections
96
gene
84
genes
84
innate
84
pathway
84
selective
84
selection
84
social
106
residues
106
environmental
106
pollen
106
veterinary
106
least
106
weakening
106
cultivated
106
pollens
106
unprecedented
50
entomology
50
paper
50
systems
50
multifactor

65
samples
65
larvae
65
locations
37
resistance
37
destructor
37
regions
69
sunflower
69
apiaries
69
residues
69
fipronil
69
field
69
pathogens
42
work
42
science
42
ways
52
land
52
cover
52
maps
52
services
52
efficiency
52
pollination
71
phase
71
propolis
71
gas
71
successfully
71
raw
71
attempts
85
cytochrome
85
detoxification
85
pyrethroid
85
devastating
85
polymerase
85
reverse
73
mites
73
sealed
73
mature
73
trait
73
developmental
73
phoretic
46
virus
46
paralysis
46
wing
46
colonies
46
destructor
46
pathogens
76
inbreeding
76
quantify
76
infestation
76
mite
76
rates
57
sugar
57
dusting
57
degree
57
infestation
93
designed
93
chemicals
93
ceranae
93
uninfected
93
infected
93
days
93
exposed
93
fipronil
69
current
69
silent
69
state
69
sodium
69
channels
69
tetramethrin
40
give
40
differentially
40
traits
40
sociality
50
brood
50
efficiently
50
candidate
50
comparing
50
care
50
social
50
genes
50
highly
56
fumagillin
56
method
56
residues
45
contamination
45
analytical
45
malathio

In [12]:
topicWords = []
for i in range(0,numberOfTopics):
    t = lda.get_topic_terms(i,50)
    currentWordList = []
    for x in t:
        word = str(dictionary[x[0]])
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
topicListFile = open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w")
for i in range(0,len(topicWords)):
    topicListFile.write("Topic "+str(i)+":\n")
    for j in topicWords[i]:
        topicListFile.write(j+'\n')
    topicListFile.write('\n')
topicListFile.close()

In [42]:
#Makes the top nine terms for each document

topNineFile = open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split()) # Convert to bag of words format first
    # Get the topics and words associated with each document
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topNineFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:9]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")