In [1]:
import string
import re
import os
import tempfile
import logging
import csv
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from collections import namedtuple
import gensim.parsing.preprocessing as processing
from os import listdir
import CustomApi as api
#Start Global Variables and Types

#document dir
docs = "../data/documents/"

#Number of topics the model should index
numberOfTopics = 225

#Number of passes the model should make
passes = 75

#Keyphrase tracker
keyphraseTracker = api.KeyWordTracker()

#End Global Variables and Types



In [2]:
files = listdir(docs+"lt2006/")
entries = []

In [3]:
for file in files:
    with open(docs+"lt2006/"+file,"rb") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if "Authors" not in row[0] and row[15] !='[No abstract available]':
                entries.append(row)
        

In [4]:
#title of an abstract
entries[0][1]


'Evidence of the Juvenile Hormone Methyl(2E,6E)-10,11-epoxy-3,7,11-trimethyl-2,6-dodecadienoate(JH-3) in Insects of Four Orders'

In [5]:
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines() #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend([u"\u2019",u"\u03bc","bee","bees","honey","honeybee","honeybees",u"\xa9",u"\xc2"])
# for asc in range(97,123):
#     stopWords.extend([chr(asc)])
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().split("\n"))
# Lowercase each document, split it by white space and filter out stopWords
stopList = set(stopWords)
processing.STOPWORDS = stopList
ps = PorterStemmer()
def removeStops(text):
    stopsRemoved = processing.remove_stopwords(text.lower().translate(None, string.punctuation))
    words = stopsRemoved.split(" ")
    stemmedWords = []
    for w in words:
        if len(ps.stem(w)) > 2:
            stemmedWords.append(ps.stem(w))
    return ' '.join(stemmedWords)
# Each abstract has a 'title':String, 'date':datetime.datetime, 'text':String, and 'keywords':String
abstracts = [api.MyAbstract._make([art[1],datetime.strptime(art[2], '%Y'),removeStops(art[15]), art[16]]) for art in entries]
abstracts.sort(key=lambda q: q.date.year)
entries = None
# Count word frequencies
selections = []

access = lambda x: x.date.year
lastIndex = 0
for i in range(1957,2007):
    index = api.binarySearch(abstracts,i,access)
    if  index != -1:
        selections.append(abstracts[lastIndex:index+1])
        lastIndex = index+1
        

In [6]:
from collections import defaultdict
def createCorpus(selection):
    frequency = defaultdict(int)
    for abst in abstracts:
        for token in abst.text.split(" "):
            frequency[token] += 1
    tempFolder = tempfile.gettempdir()
    processedCorpus = [[token for token in abst.text.split(" ") if frequency[token] > 5] for abst in selection]
    dictionary = corpora.Dictionary(processedCorpus)
    dictionary.save(os.path.join(tempFolder,'words.dict'))
    # Create general corpus and serialize in order for it to be iterated over
    corpus = [dictionary.doc2bow(text) for text in processedCorpus]
    corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)
    return api.MyCorpora._make([corpus,dictionary])
    


# Save the dictionary of tokens
# def createModel():
#     tempFolder = tempfile.gettempdir()
#     dictionary = corpora.Dictionary(processedCorpusFor(selection))
#     dictionary.save(os.path.join(tempFolder,'words.dict'))
#     # Create general corpus and serialize in order for it to be iterated over
#     corpus = [dictionary.doc2bow(text) for text in processedCorpus]
#     corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)
    # Train the model and set number of topics


In [7]:
numTopicsExport = 50

def exportResults(path, sortedVals,wordDict):
    seg = sortedVals[0][0:numTopicsExport]
    #Edges and their weights [(a,b):weight]
    edges = sortedVals[1]
    # set of (a,b)
    exportEdges = set({})
    exportNodes = set({})
    for node in seg:
        exportNodes.add((node["tag"],node["word"],node["occurences"]))
        for e in node["edges"]:
            a = wordDict[e[0]]
            b = wordDict[e[1]]
            if a in seg and b in seg:
                exportEdges.add(e)
                exportNodes.add((a["tag"],a["word"],a["occurences"]))
                exportNodes.add((b["tag"],b["word"],b["occurences"]))
            
    with open(path+"nodes.csv","w") as file:
        file.write("Id,Label,Weight\n")
        for val in exportNodes:
            file.write(str(val[0])+","+val[1].encode('utf8')+','+str(val[2])+'\n')
    with open(path+'edges.csv',"w") as file:
        file.write("Source,Target,Weight\n")
        #We dont want to have duplicate edges written, so we want to make an aedge buffer and then write the buffer
        for e in exportEdges:
            file.write(str(e[0])+","+str(e[1])+","+str(edges[e])+"\n")
#         edgesToWrite = set([])
#         for val in seg:
#             #Edges a node has
#             for edge in val["edges"]:
#                 #Check if (a,b) is an edge else if (b,a) is an edge. The graph is undirected so we dont want duplicates
#                 if edge in edges.keys():
#                     #Edge wasnt in the buffer, add it
#                     if edge not in edgesToWrite:
#                         edgesToWrite.add(edge)
#                 #(b,a) in edge list?
#                 elif (edge[1],edge[0]) in edges.keys():
#                     #edge wasnt in the buffer
#                     edgesToWrite.add(edge)
#         for edge in edgesToWrite:
#             file.write(str(edge[0])+","+str(edge[1])+","+str(edges[edge])+"\n")

In [8]:
import math as m
def getResults(topics):
    mapper = api.WordMapper()
    #Edges [(a,b):weight]
    edges = {}
    def updateEdgeFreq(vec1):
        if vec1[0] == vec1[1]:
            return None
        altVec = [vec1[1],vec1[0]]
        if vec1 in edges.keys():
            edges[vec1] += 1
            return vec1
        elif altVec in edges.keys():
            edges[altVec] += 1
            return altVec
        else:
            edges[vec1] = 1
            return vec1
    occurences = {}
    topics[len(topics)-1]
    for topic in topics:
        words = topic[0]
        #make all the edges for this topic
        wordEdges = []
        wid = mapper.mapWord(words[0][1])
        for t in words:
            edge = updateEdgeFreq((wid,mapper.mapWord(t[1])))
            if edge != None:
                wordEdges.append(edge)
        #add the edges to each word and and update the number of occurences per word
        for w in words:
            word = w[1]
            wid = mapper.mapWord(word)
            if wid not in occurences.keys():
                occurences[wid] = {"prob":w[0],"occurences":1,"edges":wordEdges, "tag" : wid, "word":word}
            else:
                occurences[wid]["occurences"] += 1
                occurences[wid]["prob"] = max(occurences[wid]["prob"], w[0])
    return [occurences,edges]


In [23]:
%%capture
from difflib import SequenceMatcher
size = 0
start = 0
import os
occurences = None
try:
    for i in range(0,len(selections)):
        #keyphraseTracker = api.KeyWordTracker()
        size += len(selections[i])
        if size >=50:
            selection = api.flaten(selections[start:i+1])
            for abstr in selection:
                phrases = [k.strip() for k in  abstr.keywords.split(";")]
                keyPhrases = []
                for phr in phrases:
                    words = [ps.stem(w) if all(ord(c) < 128 for c in w) else None for w in phr.split(" ")]
                    string = ""
                    for w in words:
                        if w != None:
                            string = string +" "+ w
                    string = string.strip()
                    if len(string) > 2:
                        keyPhrases.append(string)
                for kph in keyPhrases:    
                    keyphraseTracker.track(kph,abstr.date.year)
                    keyphraseTracker.registerCoOccurrences(kph,keyPhrases)
            #continue to next section
            corpus = createCorpus(selection)
            path = docs+'GephiFiles/'+str(selection[0].date.year)+'/'
            try:
                os.makedirs(path)
                print "made dirs"
            except:
                #Already exists
                pass
            logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
            lda = models.ldamodel.LdaModel(corpus.corpus,id2word=corpus.id2word,num_topics=numberOfTopics,passes=passes)
            topics = lda.top_topics(corpus=corpus.corpus,dictionary=corpus.id2word,topn=3)
            
            topics.sort(key=lambda k:k[1],reverse=True)
            occurences = getResults(topics)
            sortedVals = [sorted(occurences[0].values(),key=lambda k:k["occurences"],reverse=True),occurences[1]]
            #####Release some of this memory please and thank you
            selection = None
            #occurences = None
            #####
            exportResults(path,sortedVals,occurences[0])
            size = 0
            start = i+1
    os.system("emailme \"Finished successfully!\"")
except Exception as e:
    os.system("emailme \"Failed come check it out: "+str(e)+" \"")
    

KeyboardInterrupt: 

The above corpus shows the amount of times every word used in the documents is used in every indevidual document. Every word is represented by a token ID, the list of which can be found in "words.dict"

In [10]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
topicOrganizingFile = open("../data/topicorganization.tsv","w")
for x in xrange(0,len(abstracts)):
    doc = dictionary.doc2bow(abstracts[x].split())
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topicOrganizingFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t")
    for y in xrange(0,min(3,len(docTopics))):
        topicnumber = docTopics[y][0]
        topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
        #Sorts the word topics in decending order based on their greatest phi value
        for z in xrange(0,len(phiValues)):
            phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
        phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
        curindex=0
        topwords = ""
        for z in xrange(0,3):
            while curindex<len(phiValues) and phiValues[curindex][1][0][0]!=topicnumber:
                curindex+=1
            if(curindex>=len(phiValues)):break
            print len(phiValues)
            print dictionary[phiValues[curindex][0]]
            topwords+=str(dictionary[phiValues[curindex][0]].encode('utf-8').strip())+" "
            curindex+=1
        filter(lambda a:a[0]!=topicnumber,phiValues)
        topicOrganizingFile.write(topwords+"\t")
    topicOrganizingFile.write("\n")
topicOrganizingFile.close()
        

NameError: name 'dictionary' is not defined

In [None]:
topicWords = []
for i in range(0,numberOfTopics):
    t = lda.get_topic_terms(i,50)
    currentWordList = []
    for x in t:
        word = str(dictionary[x[0]])
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
topicListFile = open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+")
for i in range(0,len(topicWords)):
    topicListFile.write("Topic "+str(i)+":\n")
    for j in topicWords[i]:
        topicListFile.write(j+'\n')
    topicListFile.write('\n')
topicListFile.close()

In [None]:
#Makes the top nine terms for each document

topNineFile = open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w")
for abstr in abstracts:
    doc = dictionary.doc2bow(abstracts[2].split()) # Convert to bag of words format first
    # Get the topics and words associated with each document
    docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
    topNineFile.write(yearOfAbstract[x]+"\t"+abst+"\t")
    for z in xrange(0,len(phiValues)):
        phiValues[z][1].sort(key=lambda q:q[1],reverse=True)
    phiValues.sort(key=lambda q:q[1][0][1],reverse=True)
    nineWords = ""
    for x in phiValues[:15]:
        nineWords+= dictionary[x[0]] + " "
    topNineFile.write(nineWords.encode('utf-8')+"\n")

In [None]:
first = api.flaten(yearlyCorpora[0])
first.sort(key=lambda x: x[0])
print api.binarySearch(first,0,lambda x: x[0])
print first

In [None]:
vals = occurences.values()
vals.sort(key=lambda x:x['occurences'],reverse=True)
print vals[0]

In [None]:
sortedVals = sorted(occurences.values(),key=lambda k:k["tag"])


In [23]:
top50 = keyphraseTracker.topN()
keyphraseNodes = {}
for yo in top50:
    keyphraseNodes[yo.getPhrase()] = yo.sum()
keyphraseEdges = []
for yo in top50:
    for (phr,occur) in yo.cooccurringPhrases.iteritems():
        #rint (phr,occur)
        if phr in keyphraseNodes.keys():
            keyphraseEdges.append((yo.getPhrase(),phr,occur))



(u'api mellifera', 'nectar', 10)


In [32]:
keyphraseMapper = api.WordMapper()
with open("../data/documents/GephiFiles/KeyWords/nodes.csv","w") as nodesFile:
    nodesFile.write("Id,Label,Weight\n")
    for (phr,occur) in keyphraseNodes.iteritems():
        id = keyphraseMapper.mapWord(phr)
        nodesFile.write(str(id)+","+phr+","+str(occur)+"\n")
mappedEdges = [(keyphraseMapper.mapWord(t[0]),keyphraseMapper.mapWord(t[1]),t[2]) for t in keyphraseEdges]

In [33]:
with open("../data/documents/GephiFiles/KeyWords/edges.csv","w") as edgeFile:
    edgeFile.write("Source,Destination,Weight\n")
    for (id1,id2,w) in mappedEdges:
        edgeFile.write(str(id1)+","+str(id2)+","+str(w)+"\n")

In [21]:
selection[0]

TypeError: 'NoneType' object has no attribute '__getitem__'

In [19]:
w = keyphraseTracker.topN()
for wr in w:
    print wr.years

{2006: 96}
{2006: 51}
{2006: 40}
{2006: 22}
{2006: 18}
{2006: 16}
{2006: 11}
{2006: 10}
{2006: 9}
{2006: 8}
{2006: 7}
{2006: 7}
{2006: 7}
{2006: 7}
{2006: 7}
{2006: 7}
{2006: 7}
{2006: 6}
{2006: 6}
{2006: 6}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 5}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 4}
{2006: 3}
