In [1]:
import string
import re
import os
import tempfile
import logging
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
numberOfTopics = 25

In [1]:
table = open("../data/paperTable.tsv","r")
entries = []
for line in table:
    entries.append(line.split('\t'))
table.close()

In [2]:
# Read in abstract year of publication, title of abstract, and abstract text
abstracts = []
titleOfAbstract = []
yearofabstract = []
for articles in entries:
    titleOfAbstract.append(articles[0])
    abstracts.append(articles[1]+articles[2]+articles[3])
    yearofabstract.append(articles[4][:-1])

In [61]:
# Create a set of frequent words
stopFile = open("../data/stopWords.txt","r")
stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend(["\u2019","\u03bc","bee","bees","honey","honeybee","honeybees"])
stopList = set(stopWords)

# Lowercase each document, split it by white space and filter out stopWords
texts = []
for document in abstracts:
    docwords = []
    for word in document.lower().split():
        word = re.sub(r'[^\w\s]','',word)
        word = re.sub(r'\.+$','',word)
        isNumber = re.compile('^[0-9]+$')
        if isNumber.search(word):
            word = ''
        if word not in stopList and word!='':
            docwords.append(word)
    texts.append(docwords)

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
processedCorpus = [[token for token in text if frequency[token] > 5] for text in texts]





[['varroa',
  'mite',
  'varroa',
  'destructor',
  'ectoparasite',
  'apis',
  'mellifera',
  'pest',
  'apis',
  'mellifera',
  'yet',
  'reproductive',
  'biology',
  'host',
  'well',
  'understood',
  'particular',
  'significance',
  'phoretic',
  'stage',
  'mites',
  'feed',
  'adult',
  'days',
  'clear',
  'addition',
  'clear',
  'whether',
  'preference',
  'mites',
  'nurses',
  'observed',
  'laboratory',
  'also',
  'inside',
  'real',
  'colonies',
  'show',
  'varroa',
  'mites',
  'prefer',
  'nurses',
  'newly',
  'emerged',
  'colony',
  'setting',
  'determined',
  'mechanism',
  'behind',
  'preference',
  'show',
  'preference',
  'varroa',
  'fitness',
  'although',
  'due',
  'fact',
  'mite',
  'must',
  'find',
  'second',
  'host',
  'pupa',
  'reproduce',
  'fitness',
  'benefit',
  'mites',
  'immediate',
  'delayed',
  'results',
  'suggest',
  'varroa',
  'mite',
  'highly',
  'adapted',
  'parasite',
  'human',
  'experiment',
  'nurse',
  'parasite',
 

In [62]:
# Save the dictionary of tokens
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))

2018-04-02 12:35:42,458 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-04-02 12:35:42,748 : INFO : built Dictionary(3860 unique tokens: [u'limited', u'represent', u'believed', u'alleles', u'copy']...) from 1044 documents (total 148749 corpus positions)
2018-04-02 12:35:42,750 : INFO : saving Dictionary object under /tmp/words.dict, separately None
2018-04-02 12:35:42,754 : INFO : saved /tmp/words.dict


Dictionary(3860 unique tokens: [u'limited', u'represent', u'believed', u'alleles', u'copy']...)


In [63]:
# Create general corpus and serialize in order for it to be iterated over
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)

[(112, 1), (333, 1)]


2018-04-02 12:35:42,963 : INFO : storing corpus in Matrix Market format to /tmp/words.dict
2018-04-02 12:35:42,965 : INFO : saving sparse matrix to /tmp/words.dict
2018-04-02 12:35:42,967 : INFO : PROGRESS: saving document #0
2018-04-02 12:35:43,283 : INFO : PROGRESS: saving document #1000
2018-04-02 12:35:43,310 : INFO : saved 1044x3860 matrix, density=2.473% (99651/4029840)
2018-04-02 12:35:43,311 : INFO : saving MmCorpus index to /tmp/words.dict.index


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 3), (33, 4), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 6), (58, 1), (59, 1), (60, 1)]
[(3, 1), (5, 1), (9, 1), (11, 1), (26, 1), (31, 1), (38, 1), (49, 1), (58, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 5), (69, 1), (70, 4), (71, 2), (72, 1), (73, 1), (74, 2), (75, 4), (76, 2), (77, 1), (78, 1), (79, 1), (80, 4), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1), (90, 7), (91, 4), (92, 1), (93, 1), (94, 1), (95, 1), (96, 2), (97, 1), (98, 1), (99, 1), (100, 1), (101, 3), (102, 2), 

[(5, 3), (10, 4), (25, 1), (31, 2), (39, 2), (99, 1), (144, 1), (182, 2), (183, 1), (184, 1), (205, 1), (214, 1), (264, 1), (273, 1), (297, 1), (351, 1), (400, 1), (405, 1), (453, 1), (454, 1), (466, 2), (467, 2), (469, 1), (499, 1), (519, 1), (633, 1), (638, 1), (672, 1), (815, 1), (816, 1), (817, 1), (820, 1), (854, 1), (882, 2), (903, 1), (930, 3), (943, 1), (952, 7), (970, 2), (987, 1), (1172, 1), (1218, 1), (1243, 1), (1269, 1), (1426, 1), (1457, 1), (1462, 1), (1478, 2), (1509, 3), (1571, 1), (1581, 1), (1584, 2), (1662, 1), (1836, 1), (1850, 1), (1860, 1), (1987, 1), (2083, 1), (2219, 1), (2416, 1), (2417, 2), (3067, 2), (3127, 1), (3576, 5), (3672, 2)]
[(3, 1), (10, 1), (12, 2), (18, 1), (35, 1), (59, 2), (63, 1), (72, 1), (87, 2), (103, 1), (121, 1), (174, 1), (180, 2), (194, 1), (195, 2), (216, 1), (221, 1), (232, 1), (245, 1), (268, 1), (269, 5), (282, 1), (283, 4), (286, 3), (295, 1), (301, 1), (317, 1), (319, 1), (365, 1), (379, 1), (381, 1), (400, 1), (401, 2), (409, 1), 

The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [64]:
# Train the model and set number of topics
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics)

2018-04-02 12:35:43,554 : INFO : using symmetric alpha at 0.04
2018-04-02 12:35:43,556 : INFO : using symmetric eta at 0.04
2018-04-02 12:35:43,558 : INFO : using serial LDA version on this node
2018-04-02 12:35:43,578 : INFO : running online (single-pass) LDA training, 25 topics, 1 passes over the supplied corpus of 1044 documents, updating model once every 1044 documents, evaluating perplexity every 1044 documents, iterating 50x with a convergence threshold of 0.001000
2018-04-02 12:35:44,944 : INFO : -10.156 per-word bound, 1140.7 perplexity estimate based on a held-out corpus of 1044 documents with 148749 words
2018-04-02 12:35:44,945 : INFO : PROGRESS: pass 0, at document #1044/1044
2018-04-02 12:35:45,771 : INFO : topic #8 (0.040): 0.011*"colonies" + 0.009*"pesticide" + 0.009*"varroa" + 0.009*"nosema" + 0.009*"apis" + 0.007*"mellifera" + 0.007*"effects" + 0.006*"mite" + 0.006*"control" + 0.006*"animal"
2018-04-02 12:35:45,772 : INFO : topic #14 (0.040): 0.012*"virus" + 0.010*"col

In [65]:
# Get the topics and words associated with each document
doc = dictionary.doc2bow(abstracts[10].split()) # convert to bag of words format first
docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_wordTopics=True)

"[(u'gene', 0.008710126), (u'expression', 0.008555521), (u'mellifera', 0.0062374487), (u'animal', 0.0056878133), (u'pollination', 0.005526203), (u'pollinators', 0.0053344658), (u'genes', 0.0051210797), (u'effects', 0.005039662), (u'apis', 0.0046610953), (u'study', 0.0045568193)]"

In [66]:
#Sorts the word topics in decending order based on their greatest phi value

temp = phiValues
for x in xrange(0,len(temp)):
    temp[x][1].sort(key=lambda x:x[1],reverse=True)
temp.sort(key=lambda x:x[1][0][1],reverse=True)

[(798,
  [(13, 3.5792558),
   (21, 0.7304153),
   (17, 0.5553321),
   (19, 0.12317756),
   (2, 0.011819055)]),
 (357,
  [(13, 2.8713355),
   (21, 0.8630693),
   (17, 0.6806548),
   (19, 0.5367225),
   (2, 0.048218135)]),
 (795, [(13, 2.800594), (21, 0.66940325), (19, 0.35286748), (17, 0.1703013)]),
 (780,
  [(13, 2.7781792),
   (21, 1.2586436),
   (17, 0.5262415),
   (19, 0.39751536),
   (2, 0.03942025)]),
 (428,
  [(17, 2.386383),
   (13, 2.1154346),
   (21, 0.91762453),
   (19, 0.5465713),
   (2, 0.033986263)]),
 (590,
  [(13, 2.2190588),
   (21, 0.6724572),
   (17, 0.57146436),
   (19, 0.5110538),
   (2, 0.02596591)]),
 (812,
  [(17, 2.1186976),
   (13, 2.0307899),
   (21, 0.51730955),
   (19, 0.2520657),
   (2, 0.081137285)]),
 (818,
  [(13, 2.0758586),
   (21, 1.2314767),
   (19, 0.86490333),
   (17, 0.8038792),
   (2, 0.02388222)]),
 (595,
  [(13, 2.0228827),
   (17, 0.401571),
   (21, 0.34241518),
   (19, 0.20630702),
   (2, 0.026824152)]),
 (802,
  [(13, 1.5448701),
   (17, 0.9

In [68]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_wordTopics=True)
    topicOrganizingFile.write(yearofabstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()

        

44
mites
44
host
44
nurses
88
data
88
algorithm
88
sensor
84
protein
84
sucrose
84
food
74
bacteria
74
immune
74
potential
86
expression
86
queens
86
doses
66
resins
66
bud
66
antimicrobial
106
brood
106
food
106
neonicotinoids
55
colonies
55
virus
55
climate
57
mercedesae
57
channel
57
activated
57
fifth
62
analytical
62
limits
62
products
62
method
62
solution
62
cultivation
107
mixes
107
species
107
meadows
107
pollen
107
resource
107
design
107
included
60
plants
60
growing
60
pollinator
60
neonicotinoid
60
levels
60
pollution
60
pollen
60
foraging
60
pyrethroid
31
relevant
31
support
31
system
112
colony
112
mortality
112
health
63
collapse
63
colonies
63
pesticides
81
strain
81
strains
81
dietary
81
fipronil
81
environmental
81
focused
81
physiological
81
enzyme
81
effects
92
selenium
92
sublethal
92
memory
126
genes
126
expression
126
gene
126
stable
126
samples
126
pest
126
quality
126
chain
126
polymerase
27
discusses
27
survival
27
explanations
27
responsible
27
populations
2

80
fungicides
80
effects
80
label
80
forager
80
constant
80
simulated
45
ppm
45
toxicity
45
contact
65
stronger
65
predominant
65
infestation
65
colonies
65
destructor
65
host
56
none
56
previously
56
extracts
56
oxalic
56
efficacy
56
control
76
beeswax
76
fungicides
76
destructor
76
winter
76
food
76
colony
92
grooming
92
mite
92
survival
92
infestation
92
scutellata
92
origin
92
brood
92
hygienic
92
assessed
47
forager
47
model
47
hive
83
genetic
83
brood
83
mites
74
spatially
74
expressed
74
engineering
74
shifts
74
wildflower
74
visitation
74
almond
30
g
30
insecticides
30
indoxacarb
30
application
30
toxicity
30
conventional
67
formetanate
67
depends
67
larvae
67
concentration
67
oxidative
67
food
54
ppm
54
larvae
54
pollen
54
boscalid
54
double
54
immature
63
wild
63
data
63
mixes
63
summarize
63
managed
63
native
109
beeswax
109
beekeeping
109
risk
109
organic
109
sector
109
regulate
52
chlorpyrifos
52
mellifera
52
acute
58
pheromones
58
destructor
58
effect
89
residue
89
chemic

46
oil
46
oregano
46
destructor
35
pollens
35
pesticide
35
method
35
crude
110
concentrations
110
temperature
110
site
110
colonies
110
fed
110
brood
88
bacterium
88
bacterial
88
found
88
sequence
88
previously
88
genetic
88
pollen
88
collected
88
terrestris
90
regions
90
minimal
90
pg
90
concentrations
90
concentration
103
gene
103
expression
103
mites
58
activity
58
responses
58
odors
58
organ
58
space
58
restricted
58
nurse
58
ether
58
locate
44
beeswax
44
residues
44
exposure
44
veterinary
44
acceptable
44
theoretical
75
flight
75
behaviour
75
foraging
75
immune
75
traits
75
influence
63
practices
63
different
63
colony
63
associate
63
constructed
63
explained
63
winter
63
losses
63
management
39
associated
39
highlight
39
results
48
national
48
health
48
developed
63
pesticide
63
residues
63
polar
63
mass
63
concentration
63
using
55
protein
55
pollen
55
diet
58
chain
58
polymerase
58
mitochondrial
63
colony
63
reported
63
losses
114
together
114
details
114
bumble
114
neonicotino

74
learning
74
memory
74
threaten
74
detected
74
chlorpyrifos
74
pesticide
118
may
118
combinations
118
ivermectin
118
vivo
118
head
118
domain
44
indigenous
44
species
44
native
76
acaricide
76
citrus
76
mortality
76
front
76
entrance
92
agricultural
92
pesticide
92
detected
48
risks
48
environmental
48
pesticide
77
employed
77
sex
77
issues
77
targets
77
show
77
gene
77
enrichment
89
assay
89
studied
89
correlation
89
reactions
89
reliable
89
capillary
54
pollen
54
levels
54
young
54
sublethal
54
increasing
90
mortality
90
risk
90
final
53
detection
53
method
53
spores
53
many
53
pathogens
53
products
53
standardized
53
multiplex
53
objective
46
effect
46
chemicals
46
storage
46
mellonella
46
initial
46
population
46
mortality
46
affected
20
veterinary
20
medicinal
20
procedures
20
availability
20
member
20
control
70
oxalic
70
acid
70
effect
70
bottom
70
amount
70
successful
70
difference
70
substantial
76
label
76
blooming
76
fungicides
76
fields
76
treated
76
set
76
brood
76
colon

95
gene
95
genes
95
antiviral
95
old
56
virus
56
detected
56
mite
56
never
56
explanation
95
water
95
samples
95
soil
95
drift
95
unintended
95
move
95
fresh
54
sperm
54
drone
54
genetic
34
research
34
light
34
composite
34
genome
34
microbiota
34
improved
34
diversity
67
losses
67
agricultural
67
health
67
interfering
67
small
67
antiviral
67
combat
67
highlight
67
affected
50
produced
50
producers
50
market
50
media
50
specialty
50
regarding
50
consumers
50
opportunity
50
niche
99
transgenic
99
scientific
99
rely
99
use
99
failure
99
human
99
kunkeei
99
gut
99
candidate
52
sampling
52
sticky
52
using
47
rates
47
infestation
47
destructor
47
royal
47
colonies
47
producing
71
proliferation
71
failure
71
remained
71
factors
71
management
71
colonies
80
classification
80
samples
80
pesticides
80
residue
72
social
72
time
72
animal
77
displayed
77
unknown
77
propolis
77
resins
77
antimicrobial
45
oral
45
toxicity
45
acaricide
49
pollen
49
determine
49
species
49
fatty
49
acids
49
traps
48

23
pollination
23
network
23
parameters
48
pathogens
48
colony
48
diseases
62
infected
62
acquisition
62
transmission
62
virus
62
using
62
declined
62
pupae
62
mite
62
rare
64
conserved
64
geographical
64
directed
64
reference
64
shared
64
variations
64
genotypes
64
virus
64
complete
55
incidents
55
data
55
major
72
insecticide
72
thiamethoxam
72
used
72
recovery
72
electron
72
continuous
74
colonies
74
invasion
74
mites
74
rates
74
neighboring
74
record
68
two
68
seen
68
climate
68
genetic
68
isolates
68
field
68
zone
32
virus
32
paralysis
32
wing
32
aimed
32
chronic
32
reasons
38
damage
38
global
38
long
38
defining
38
success
38
utilized
81
vivo
81
pirimicarb
81
data
51
eliminate
51
employing
51
incidents
51
samples
51
pesticides
51
method
44
viral
44
viruses
44
also
95
supply
95
demand
95
pollination
95
pollinators
95
relation
95
visitation
95
mapped
95
service
40
mellifera
40
toxicity
40
insecticides
67
nicotine
67
resist
67
body
67
stores
67
development
61
ceranae
61
protein
61
w

59
colonies
59
mite
59
adult
120
rape
120
pollen
120
nectar
120
neonicotinoids
120
proximity
120
intensively
114
chapter
114
owing
114
presence
114
different
114
detected
114
resistant
114
except
114
wing
67
colony
67
equipped
67
methodologies
67
sowing
67
effects
67
maize
69
food
69
health
69
antibiotic
69
contamination
69
antibiotics
69
might
69
challenging
69
medicinal
69
pollutants
60
guts
60
bacterial
60
adult
60
surveyed
60
abundances
60
possibility
89
gene
89
genetic
89
mites
89
limits
89
map
103
chromosome
103
genotypes
103
receptor
103
map
103
sensitive
103
trait
103
genes
103
previously
103
worldwide
43
colonies
43
mite
43
varroa
95
ecosystem
95
services
95
bumblebees
95
worker
95
exposure
95
colony
47
traits
47
resistance
47
observed
78
virus
78
viruses
78
viral
61
upregulation
61
significant
61
injection
61
losses
61
genes
61
larvae
70
ceranae
70
apis
70
hives
61
poor
61
effects
61
nutrition
61
vulnerable
61
enhance
61
nutritionally
72
pest
72
research
72
many
72
mite
72
de

In [69]:
#Makes the top nine terms for each document

topNineFile = open("../data/topnineterms.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_wordTopics=True)
    topNineFile.write(yearofabstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:9]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")