In [1]:
import string
import re
import os
import tempfile
import logging
import csv
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from collections import namedtuple
import gensim.parsing.preprocessing as processing
from os import listdir
import CustomApi as api
# ---- Notebook-wide configuration ----

# Root data directory: the input CSV exports are read from docs+"lt2006/" and
# the Gephi output folders are created under docs+'GephiFiles/'.
docs = "../data/documents/"

# Number of topics every LDA model is trained with (passed as num_topics).
numberOfTopics = 125

# Number of training passes every LDA model makes (passed as passes).
passes = 10

# ---- End of configuration ----



In [17]:
# Names of the CSV exports to load. listdir returns entries in arbitrary order,
# so sort them to make the load order (and everything derived from it, such as
# the order of abstracts within a year) repeatable between runs.
files = sorted(listdir(docs+"lt2006/"))
# Rows retained from those files; filled by the loading loop in the next cell.
entries = []

In [18]:
# Load every CSV export and keep the rows that carry a usable abstract.
# Column 15 is the abstract text; rows whose first column contains "Authors"
# are skipped (presumably the header row of each export -- TODO confirm).
for csvName in files:
    with open(docs+"lt2006/"+csvName,"rb") as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for blank lines and short lists for
            # truncated rows; skip those instead of failing with IndexError.
            if len(row) <= 15:
                continue
            if "Authors" not in row[0] and row[15] !='[No abstract available]':
                entries.append(row)
        

In [19]:
# Sanity check: column 1 of the first retained row, i.e. the title of an abstract.
entries[0][1]


'Evidence of the Juvenile Hormone Methyl(2E,6E)-10,11-epoxy-3,7,11-trimethyl-2,6-dodecadienoate(JH-3) in Insects of Four Orders'

In [20]:
# Build the stop-word list that removeStops filters with.
with open("../data/stopwords.txt","r") as stopFile:
    stopWords = stopFile.read().splitlines()
# Corpus-specific additions. u"\xa9" is the copyright symbol: it shows up in
# every abstract and should not be a part of the corpus.
stopWords.extend([u"\u2019",u"\u03bc","bee","bees","honey","honeybee","honeybees",u"\xa9",u"\xc2"])
# NOTE(review): stopList is a snapshot taken before the extra stop words below
# are added, exactly as before; nothing visible in this notebook reads it.
stopList = set(stopWords)
with open("../data/extraStopWords.txt","r") as extraStopFile:
    # splitlines (as for the main list) avoids a trailing empty entry and
    # stray "\r" characters that split("\n") would keep.
    stopWords.extend(extraStopFile.read().splitlines())
# Install the custom list as the one gensim's remove_stopwords consults.
# A frozenset keeps the per-token membership test constant-time; stopWords
# itself stays a list for the cells that inspect it.
processing.STOPWORDS = frozenset(stopWords)
def removeStops(text):
    """Return `text` lower-cased, stripped of punctuation, without stop words, and stemmed.

    The steps run in that order: str.lower, removal of every character in
    string.punctuation, processing.remove_stopwords, processing.stem_text.
    """
    lowered = text.lower()
    # Two-argument translate with a None table deletes the listed characters.
    unpunctuated = lowered.translate(None, string.punctuation)
    withoutStops = processing.remove_stopwords(unpunctuated)
    return processing.stem_text(withoutStops)
# Turn every retained CSV row into a MyAbstract record with
# 'title' (column 1), 'date' (column 2 parsed as a '%Y' year -> datetime.datetime)
# and 'text' (column 15 after removeStops).
abstracts = []
for art in entries:
    title = art[1]
    published = datetime.strptime(art[2], '%Y')
    text = removeStops(art[15])
    abstracts.append(api.MyAbstract._make([title, published, text]))
abstracts.sort(key=lambda q: q.date.year)
# The raw rows are no longer needed; drop the reference.
entries = None

# Cut the year-sorted abstracts into consecutive slices, one per year from
# 1957 through 2006 for which api.binarySearch reports a hit.
# NOTE(review): the slicing assumes binarySearch returns the index of the LAST
# abstract of the requested year (and -1 when there is none) -- confirm in CustomApi.
access = lambda x: x.date.year
selections = []
lastIndex = 0
for year in range(1957,2007):
    index = api.binarySearch(abstracts,year,access)
    if index == -1:
        continue
    selections.append(abstracts[lastIndex:index+1])
    lastIndex = index+1
        

In [21]:
from collections import defaultdict
def createCorpus(selection):
    """Build the bag-of-words corpus and token dictionary for one selection.

    Token frequencies are counted over the notebook-wide `abstracts` list (not
    only over `selection`); a token is kept when it occurs more than 5 times
    in that overall count.

    selection -- iterable of abstracts, each exposing a `text` string whose
        tokens are separated by single spaces.

    Returns api.MyCorpora._make([corpus, dictionary]) where `corpus` is the
    list of doc2bow vectors (one per abstract in `selection`) and `dictionary`
    is the gensim Dictionary built from the filtered tokens.

    Side effects: saves the dictionary as 'words.dict' and the serialized
    corpus as 'words.mm' in the system temp directory.
    """
    frequency = defaultdict(int)
    for abst in abstracts:
        for token in abst.text.split(" "):
            frequency[token] += 1
    processedCorpus = [[token for token in abst.text.split(" ") if frequency[token] > 5] for abst in selection]

    tempFolder = tempfile.gettempdir()
    dictionary = corpora.Dictionary(processedCorpus)
    dictionary.save(os.path.join(tempFolder,'words.dict'))
    # Create general corpus and serialize in order for it to be iterated over.
    corpus = [dictionary.doc2bow(text) for text in processedCorpus]
    # The corpus used to be serialized to 'words.dict' as well, overwriting the
    # dictionary saved just above; it now gets its own file.
    corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.mm'), corpus)
    return api.MyCorpora._make([corpus,dictionary])
    


# Save the dictionary of tokens
# def createModel():
#     tempFolder = tempfile.gettempdir()
#     dictionary = corpora.Dictionary(processedCorpusFor(selection))
#     dictionary.save(os.path.join(tempFolder,'words.dict'))
#     # Create general corpus and serialize in order for it to be iterated over
#     corpus = [dictionary.doc2bow(text) for text in processedCorpus]
#     corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)
    # Train the model and set number of topics


In [22]:


def exportResults(path, sortedVals):
    """Write the word graph as two CSV files, `path`+"nodes.csv" and `path`+"edges.csv".

    path -- string prefix; the file names are appended to it verbatim.
    sortedVals -- sequence of node dicts providing "tag", "word",
        "occurences" and "words" (the list of neighbouring tags).

    nodes.csv holds one "Id,Label,Weight" line per node; edges.csv holds one
    "Source,Target" line per (node, neighbour) pair.
    """
    with open(path+"nodes.csv","w") as nodesOut:
        nodesOut.write("Id,Label,Weight\n")
        for entry in sortedVals:
            fields = [str(entry["tag"]), entry["word"].encode('utf8'), str(entry['occurences'])]
            nodesOut.write(",".join(fields)+'\n')
    with open(path+'edges.csv',"w") as edgesOut:
        edgesOut.write("Source,Target\n")
        for entry in sortedVals:
            for target in entry["words"]:
                edgesOut.write(",".join([str(entry["tag"]), str(target)])+"\n")

In [23]:
import math as m
def getResults(topics):
    """Turn a list of topics into a word graph keyed by word.

    topics -- sequence of topics; topic[0] must be a sequence of
        (probability, word) pairs. Any further fields of a topic are ignored.

    Returns a dict mapping every distinct word to a node dict with the keys
        "word"       -- the word itself,
        "tag"        -- integer id, assigned from 0 in order of first appearance,
        "prob"       -- the highest probability the word has in any topic,
        "occurences" -- how many times the word was encountered,
        "words"      -- tags of the other words of the topic in which the
                        word first appeared.

    An empty `topics` yields an empty dict.
    """
    occurences = {}
    # Pass 1: register every word; neighbours are kept as words for now because
    # some of them may not have been given a tag yet.
    for topic in topics:
        words = topic[0]
        topicWords = [token[1] for token in words]
        for w in words:
            prob, word = w[0], w[1]
            if word in occurences:
                node = occurences[word]
                node["occurences"] += 1
                node["prob"] = max(node["prob"], prob)
                continue
            neighbours = list(topicWords)
            neighbours.remove(word)
            occurences[word] = {"prob":prob,"occurences":1,"words":neighbours, "tag":len(occurences), "word":word}
    # Pass 2: every word now has a tag, so translate the neighbour lists.
    for node in occurences.values():
        node["words"] = [occurences[other]["tag"] for other in node["words"]]
    return occurences


In [None]:
%%capture
# Train one LDA model per time window and export its word graph for Gephi.
# A window is a run of consecutive per-year selections that together hold at
# least 50 abstracts; its output folder is named after the year of its first abstract.
# NOTE(review): selections left over at the end that never reach 50 abstracts
# are not modelled at all -- confirm this is intended.
size = 0
start = 0
occurences = None
# basicConfig only needs to be called once, so it is done before the loop.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
for i in range(0,len(selections)):
    size += len(selections[i])
    if size < 50:
        continue
    selection = api.flaten(selections[start:i+1])
    corpus = createCorpus(selection)
    path = docs+'GephiFiles/'+str(selection[0].date.year)+'/'
    # Create the output folder only when it is missing; any other failure
    # (permissions, bad path) now surfaces here instead of being swallowed.
    if not os.path.isdir(path):
        os.makedirs(path)
        print("made dirs")

    lda = models.ldamodel.LdaModel(corpus.corpus,id2word=corpus.id2word,num_topics=numberOfTopics,passes=passes)
    topics = lda.top_topics(corpus=corpus.corpus,dictionary=corpus.id2word,topn=10)
    # Each topic is a (terms, score) pair; put the highest score first.
    topics.sort(key=lambda k:k[1],reverse=True)
    occurences = getResults(topics)
    sortedVals = sorted(occurences.values(),key=lambda k:k["tag"])
    #####Release some of this memory please and thank you
    selection = None
    occurences = None
    #####
    exportResults(path,sortedVals)

    # Start accumulating the next window.
    size = 0
    start = i+1

The above corpus shows the number of times each word used in the documents occurs in every individual document. Every word is represented by a token ID; the list of token IDs can be found in "words.dict".

In [None]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
#
# For every abstract one TSV line is written: year, title, then for each of the
# first (up to) three entries of its topic distribution the topic's terms
# followed by up to three of the document's words whose strongest topic it is.
#
# NOTE(review): this uses the `lda` model left by the training loop (i.e. the
# model of the last window) together with the id->word mapping it was trained
# with. The names dictionary, yearOfAbstract and titleOfAbstract used here
# before are not defined by any cell of this notebook; the year and title are
# now read from the abstract records themselves.
id2word = lda.id2word
with open("../data/topicorganization.tsv","w") as topicOrganizingFile:
    for abst in abstracts:
        # Abstracts are records; the preprocessed tokens live in .text.
        doc = id2word.doc2bow(abst.text.split())
        docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
        topicOrganizingFile.write(str(abst.date.year)+"\t"+abst.title+"\t")
        # Sorts the word topics in descending order based on their greatest phi
        # value. Done once per document (the result does not depend on the
        # topic); words without any phi value are dropped so the sort key is
        # always defined.
        phiValues = [entry for entry in phiValues if entry[1]]
        for entry in phiValues:
            entry[1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        for y in range(0,min(3,len(docTopics))):
            topicnumber = docTopics[y][0]
            topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
            curindex=0
            topwords = ""
            for z in range(0,3):
                # Advance to the next word whose strongest topic is this one.
                while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                    curindex+=1
                if curindex>=len(phiValues):
                    break
                topwords+=str(id2word[phiValues[curindex][0]].encode('utf-8').strip())+" "
                curindex+=1
            topicOrganizingFile.write(topwords+"\t")
        topicOrganizingFile.write("\n")

        

In [None]:
# Write the 50 highest-weighted terms of every topic of `lda` to a text file,
# one "Topic N:" section per topic.
# NOTE(review): `lda` is the model left by the training loop (last window); its
# own id->word mapping is used because no `dictionary` variable is defined by
# any cell of this notebook.
id2word = lda.id2word
topicWords = []
for i in range(0,numberOfTopics):
    currentWordList = []
    for termId, weight in lda.get_topic_terms(i,50):
        # Encode explicitly: str() on a non-ASCII unicode token raises.
        word = id2word[termId].encode('utf-8')
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
with open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+") as topicListFile:
    for i in range(0,len(topicWords)):
        topicListFile.write("Topic "+str(i)+":\n")
        for j in topicWords[i]:
            topicListFile.write(j+'\n')
        topicListFile.write('\n')

In [None]:
#Makes the top terms for each document
#
# One TSV line per abstract: year, title, then the document's words ordered by
# their greatest phi value.
# NOTE(review): the file and variable names say "nine" but 15 words are written
# per document; the count is kept at 15 -- confirm which one is intended.
# NOTE(review): `lda` is the model left by the training loop (last window) and
# its own id->word mapping is used; dictionary and yearOfAbstract are not
# defined by any cell of this notebook.
id2word = lda.id2word
with open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w") as topNineFile:
    for abstr in abstracts:
        # Convert the current abstract (not a fixed one) to bag of words format first
        doc = id2word.doc2bow(abstr.text.split())
        # Get the topics and words associated with each document
        docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
        topNineFile.write(str(abstr.date.year)+"\t"+abstr.title+"\t")
        # Words without any phi value are dropped so the sort key is always defined.
        phiValues = [entry for entry in phiValues if entry[1]]
        for entry in phiValues:
            entry[1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        nineWords = ""
        for entry in phiValues[:15]:
            nineWords+= id2word[entry[0]] + " "
        topNineFile.write(nineWords.encode('utf-8')+"\n")

In [None]:
# Scratch check of api.binarySearch on the flattened first entry of yearlyCorpora.
# NOTE(review): yearlyCorpora is not defined by any cell visible in this
# notebook -- confirm where it comes from before running this cell.
first = api.flaten(yearlyCorpora[0])
# Order the items by their first element, the same key the search below reads.
first.sort(key=lambda x: x[0])
print api.binarySearch(first,0,lambda x: x[0])
print first

In [35]:
# Show the node with the highest 'occurences' count.
vals = sorted(occurences.values(), key=lambda node: node['occurences'], reverse=True)
print(vals[0])

{'term': u'venom', 'words': [u'3', u'dai', u'white', u'prior', u'mice', u'period', u'death', u'inject', u'protein'], 'prob': 0.06478571, 'occurences': 47}


In [65]:
# Nodes ordered by their integer "tag"; same expression as in the training loop.
sortedVals = sorted(occurences.values(),key=lambda k:k["tag"])


In [14]:
"\xa9" in stopWords


True