In [1]:
import string
import re
import os
import tempfile
import logging
import csv
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from collections import namedtuple
import gensim.parsing.preprocessing as processing
from os import listdir
import CustomApi as api
#Start Global Variables and Types

# Root data directory. Later cells read the input CSV files from its
# "lt2006/" and "gt2006/" sub-folders and write their output below "GephiFiles/".
docs = "../data/documents/"

# Number of topics requested from the LDA model (passed as num_topics).
numberOfTopics = 225

# Number of training passes requested from the LDA model (passed as passes).
passes = 75

# Keyphrase tracker (KeyWordTracker from CustomApi). The LDA cell feeds it stemmed
# keyphrases; a later cell replaces it with a fresh tracker built from the raw keywords.
keyphraseTracker = api.KeyWordTracker()

#End Global Variables and Types
"""
Python 2.7 compatible
Commentary and portions of code Created by Andrew Strickland July 2018

Notice:
- Many modifications were made to this workbook between 06/10/2018 and 08/15/2018.
- The goal of this program is to lexically analyze research abstracts in order to form connections and statistical intuitions
  concerning the subject of honeybee colony collapse.

Mentions:
- This project is designed to export files that are interoperable with Gephi, and the gephi companion "GraphBuilder.jar" designed by Andrew Strickland
- For detailed instruction on how to use the gephi companion, see the README.txt file under /GraphBuilder/src/
- For any cell that has my name, Andrew Strickland, in its header, I am available to give reasonable aid to 
- a reasonable misunderstanding. Refer limited questions to apstrick@uncg.edu
"""



In [2]:
"""
Andrew Strickland
These are the names of the csv files containing all of our data.
"""
# listdir() returns names in arbitrary order; sorting makes the order in which
# the CSV files are loaded (and so the order of rows in `entries`) reproducible.
files = sorted(listdir(docs+"lt2006/"))
files1 = sorted(listdir(docs+"gt2006/"))
# Accumulates the raw CSV rows accepted by the loading cell.
entries = []

In [3]:
"""
Andrew Strickland
Reads each csv file in and adds its rows to our list of data, so long as the row has an abstract and isn't the legend row (aka Authors, Date, Abstract text... the very first row).
"""
# Load every CSV file from both input folders into `entries`.
# A row is kept when column 0 does not contain the header text "Authors" and
# column 15 is not the '[No abstract available]' placeholder.
# NOTE: this appends to `entries`, so re-running the cell without resetting
# `entries` first duplicates every row.
for (folder, names) in ((docs+"lt2006/", files), (docs+"gt2006/", files1)):
    for name in names:
        with open(folder+name,"rb") as csvfile:
            for row in csv.reader(csvfile):
                # Blank or truncated lines have no column 15; skip them
                # instead of failing with an IndexError.
                if len(row) <= 15:
                    continue
                #If its not the first row and it has an abstract, read it in.
                if "Authors" not in row[0] and row[15] !='[No abstract available]':
                    entries.append(row)
        

In [5]:
"""
Andrew Strickland
Stems the text of the abstract and removes stop words. The abstracts are then grouped into "selections" where each selection
is just a grouping of no less than 50 abstracts. These selections allow us to run a model over a small selection (usually results in being 1 pub. year)
of abstracts and compare years and perform analysis on the trends and changes between years.
"""
# Build the stop-word list: the base file, a few corpus-specific tokens and
# the extra stop-word file.
with open("../data/stopwords.txt","r") as stopFile:
    stopWords = stopFile.read().splitlines()
# Corpus-specific additions:
#   u"\u2019" right single quotation mark, u"\u03bc" Greek small letter mu,
#   u"\xa9"   the copyright symbol, which shows up in every abstract and should
#             not be a part of the corpus,
#   u"\xc2"   Latin capital A with circumflex,
#   plus the bee/honey words.
stopWords.extend([u"\u2019",u"\u03bc","bee","bees","honey","honeybee","honeybees",u"\xa9",u"\xc2"])
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().split("\n"))
stopList = set(stopWords)
# Replace the stop-word set of gensim's preprocessing module with ours.
# NOTE(review): presumably this is what makes processing.remove_stopwords use
# this list - whether it does depends on the installed gensim version; confirm.
processing.STOPWORDS = stopList
# Shared Porter stemmer instance used by removeStops and by later cells.
ps = PorterStemmer()
def removeStops(text):
    """
    Normalise one abstract for the topic model.

    The text is lower-cased, stripped of ASCII punctuation, passed through
    processing.remove_stopwords, split on single spaces and Porter-stemmed.
    Stems of two characters or fewer are dropped.

    text: the abstract as a Python 2 byte string (str.translate(None, ...) is
          the byte-string form of translate).
    Returns the surviving stems joined by single spaces.
    """
    stopsRemoved = processing.remove_stopwords(text.lower().translate(None, string.punctuation))
    stemmedWords = []
    for w in stopsRemoved.split(" "):
        # Stem once and reuse the result for both the length test and the output.
        stem = ps.stem(w)
        if len(stem) > 2:
            stemmedWords.append(stem)
    return ' '.join(stemmedWords)
# Each abstract has a 'title':String, 'date':datetime.datetime, 'text':String, and 'keywords':String
# They are built from CSV columns 1 (title), 2 (parsed as a '%Y' year), 15 (text, run
# through removeStops) and 16 (keywords).
abstracts = [api.MyAbstract._make([art[1],datetime.strptime(art[2], '%Y'),removeStops(art[15]), art[16]]) for art in entries]
# Order the abstracts by publication year so each year forms a contiguous run.
abstracts.sort(key=lambda q: q.date.year)
# Drop the reference to the raw rows. After this, re-running this cell fails
# (entries is None) until the loading cells are run again.
entries = None
# selections: consecutive slices of `abstracts`, one per year found by the search below.
selections = []

# Key function handed to api.binarySearch: the publication year of an abstract.
access = lambda x: x.date.year
lastIndex = 0
# Years 1957..2017 (range() excludes 2018).
# NOTE(review): abstracts dated 2018 or later are never searched for and so never
# land in a selection - confirm this is intended.
for i in range(1957,2018):
    index = api.binarySearch(abstracts,i,access)
    # -1 is treated as "no abstract for this year".
    if  index != -1:
        # Slice from the end of the previous selection through `index` inclusive.
        # NOTE(review): this assumes binarySearch returns the LAST index holding
        # year i; otherwise abstracts of one year spill into the next slice - confirm.
        selections.append(abstracts[lastIndex:index+1])
        lastIndex = index+1
        

In [55]:
"""
Andrew Strickland
"""
from collections import defaultdict
def createCorpus(selection):
    """
    Build the gensim dictionary and bag-of-words corpus for one selection.

    selection: iterable of abstracts (objects with a space-separated `.text`).

    Token frequencies are counted over the global `abstracts` list (the whole
    corpus, not just `selection`); only tokens occurring more than 5 times
    there are kept. The dictionary is saved to <tmp>/words.dict and the corpus
    is serialized in Matrix Market format to <tmp>/words.mm.

    Returns api.MyCorpora built from [corpus, dictionary].
    """
    # NOTE(review): frequencies come from the global `abstracts`, presumably so the
    # cut-off is corpus-wide; this is recomputed on every call - confirm intent.
    frequency = defaultdict(int)
    for abst in abstracts:
        for token in abst.text.split(" "):
            frequency[token] += 1
    tempFolder = tempfile.gettempdir()
    processedCorpus = [[token for token in abst.text.split(" ") if frequency[token] > 5] for abst in selection]
    dictionary = corpora.Dictionary(processedCorpus)
    dictionary.save(os.path.join(tempFolder,'words.dict'))
    # Create general corpus and serialize in order for it to be iterated over
    corpus = [dictionary.doc2bow(text) for text in processedCorpus]
    # Write the corpus to its own file: serializing to 'words.dict' overwrote the
    # dictionary that was saved just above.
    corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.mm'), corpus)
    return api.MyCorpora._make([corpus,dictionary])
    

In [56]:
"""
Andrew Strickland
"""
numTopicsExport = 50
def exportResults(path, sortedVals,wordDict):
    """
    Write the node and edge tables for one selection.

    path:       output prefix; "nodes.csv" and "edges.csv" are appended to it
                directly, so it has to end with a path separator.
    sortedVals: [list of node dicts, {(a, b): weight}]. Only the first
                numTopicsExport node dicts are exported.
    wordDict:   maps an edge endpoint to its node dict.

    A node dict carries the keys "tag", "word", "occurences" and "edges".
    An edge is exported only when both of its endpoints are among the
    exported nodes.
    """
    def asRow(entry):
        # (Id, Label, Weight) triple for the node table.
        return (entry["tag"],entry["word"],entry["occurences"])

    topNodes = sortedVals[0][:numTopicsExport]
    #Edges and their weights [(a,b):weight]
    edgeWeights = sortedVals[1]
    nodeRows = set()
    keptEdges = set()
    for entry in topNodes:
        nodeRows.add(asRow(entry))
        for pair in entry["edges"]:
            head = wordDict[pair[0]]
            tail = wordDict[pair[1]]
            if head not in topNodes or tail not in topNodes:
                continue
            keptEdges.add(pair)
            nodeRows.update([asRow(head), asRow(tail)])

    with open(path+"nodes.csv","w") as nodeOut:
        nodeOut.write("Id,Label,Weight\n")
        for row in nodeRows:
            nodeOut.write(",".join([str(row[0]), row[1].encode('utf8'), str(row[2])])+'\n')
    with open(path+'edges.csv',"w") as edgeOut:
        edgeOut.write("Source,Target,Weight\n")
        # keptEdges is a set, so every edge is written exactly once.
        for pair in keptEdges:
            edgeOut.write(",".join([str(pair[0]), str(pair[1]), str(edgeWeights[pair])])+"\n")

In [57]:
"""
Andrew Strickland
"""
import math as m
def getResults(topics, mapper=None):
    """
    Turn a list of topics into word nodes and weighted word-to-word edges.

    topics: iterable whose items expose the topic's word list at index 0;
            each word entry is (probability, word).
    mapper: object with a mapWord(word) method returning the word's id.
            Defaults to a fresh api.WordMapper().

    For every topic an edge is counted between the topic's first word and each
    of its other words. Edges are undirected: (a, b) and (b, a) share a counter.

    Returns [occurences, edges] where
      occurences: {id: {"prob": highest probability seen, "occurences": number of
                        topic entries for the word, "edges": edges of the topics
                        the word appeared in, "tag": id, "word": word}}
      edges:      {(a, b): number of times the pair was seen}
    """
    if mapper is None:
        mapper = api.WordMapper()
    #Edges [(a,b):weight]
    edges = {}
    def updateEdgeFreq(vec1):
        # Self-loops are not edges.
        if vec1[0] == vec1[1]:
            return None
        # The reversed pair must be a tuple as well: a list never equals a
        # tuple key, so the reverse direction was never found before.
        altVec = (vec1[1],vec1[0])
        if vec1 in edges:
            edges[vec1] += 1
            return vec1
        if altVec in edges:
            edges[altVec] += 1
            return altVec
        edges[vec1] = 1
        return vec1
    occurences = {}
    for topic in topics:
        words = topic[0]
        # A topic without words contributes nothing.
        if not words:
            continue
        #make all the edges for this topic
        wordEdges = []
        wid = mapper.mapWord(words[0][1])
        for t in words:
            edge = updateEdgeFreq((wid,mapper.mapWord(t[1])))
            if edge is not None:
                wordEdges.append(edge)
        #add the edges to each word and update the number of occurences per word
        for w in words:
            word = w[1]
            wid = mapper.mapWord(word)
            if wid not in occurences:
                # Copy the list so words do not share (and mutate) one edge list.
                occurences[wid] = {"prob":w[0],"occurences":1,"edges":list(wordEdges), "tag" : wid, "word":word}
            else:
                entry = occurences[wid]
                entry["occurences"] += 1
                entry["prob"] = max(entry["prob"], w[0])
                # Words seen in an earlier topic also receive this topic's edges.
                entry["edges"].extend(e for e in wordEdges if e not in entry["edges"])
    return [occurences,edges]


In [58]:
%%capture
"""
Andrew Strickland
"""
from difflib import SequenceMatcher
import os
import subprocess


def _notify(message):
    """Send `message` through the external `emailme` command (best effort)."""
    try:
        # Argument list instead of a shell string: quotes or shell
        # metacharacters in the message can no longer break the command.
        subprocess.call(["emailme", message])
    except OSError:
        logging.warning("Could not run emailme; message was: %s", message)


# Walk the per-year selections, merging consecutive ones until a group holds at
# least 50 abstracts. For every group: track its stemmed keyphrases, train an
# LDA model and export the Gephi node/edge files for it.
# NOTE: keyphrases accumulate in the global keyphraseTracker, so re-running this
# cell without re-creating the tracker counts them again.
size = 0
start = 0
occurences = None
# Configure logging once rather than on every iteration.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
try:
    for i in range(0,len(selections)):
        #keyphraseTracker = api.KeyWordTracker()
        size += len(selections[i])
        if size >=50:
            selection = api.flaten(selections[start:i+1])
            for abstr in selection:
                phrases = [k.strip() for k in  abstr.keywords.split(";")]
                keyPhrases = []
                for phr in phrases:
                    # Stem the pure-ASCII words of the phrase; other words are dropped.
                    stems = [ps.stem(w) for w in phr.split(" ") if all(ord(c) < 128 for c in w)]
                    # Not named `string`: that rebinds the imported `string`
                    # module and breaks removeStops afterwards.
                    stemmedPhrase = " ".join(stems).strip()
                    if len(stemmedPhrase) > 2:
                        keyPhrases.append(stemmedPhrase)
                for kph in keyPhrases:
                    keyphraseTracker.track(kph,abstr.date.year)
                    keyphraseTracker.registerCoOccurrences(kph,keyPhrases)
            #continue to next section
            corpus = createCorpus(selection)
            # Output folder named after the year of the group's first abstract.
            path = docs+'GephiFiles/'+str(selection[0].date.year)+'/'
            # Create the folder only when missing; any real failure (permissions,
            # bad path) now surfaces instead of being swallowed.
            if not os.path.isdir(path):
                os.makedirs(path)
                print("made dirs")
            lda = models.ldamodel.LdaModel(corpus.corpus,id2word=corpus.id2word,num_topics=numberOfTopics,passes=passes)
            topics = lda.top_topics(corpus=corpus.corpus,dictionary=corpus.id2word,topn=3)

            # Highest-scoring topics first (sorted on the second item of each topic).
            topics.sort(key=lambda k:k[1],reverse=True)
            occurences = getResults(topics)
            sortedVals = [sorted(occurences[0].values(),key=lambda k:k["occurences"],reverse=True),occurences[1]]
            #####Release some of this memory please and thank you
            selection = None
            #occurences = None
            #####
            exportResults(path,sortedVals,occurences[0])
            # Start collecting the next group.
            size = 0
            start = i+1
    #this just sends me an email when the process is done!
    _notify("Finished successfully!")
except Exception as e:
    # Keep the traceback in the log; the e-mail only carries the message.
    logging.exception("Topic-model export failed")
    #this just sends me an email when the process fails.
    _notify("Failed come check it out: "+str(e)+" ")

In [8]:
"""
Andrew Strickland
Tracks the usage of words over the entire corpus. For detailed functionality see the KeyWordTracker class in 
the CustomApi.py file. Registers the co-occurrence of each phrase with the other phrases in the abstract description.
"""
# Start from a fresh tracker so this cell can be re-run without double counting.
keyphraseTracker = api.KeyWordTracker()
for abstr in abstracts:
    # Keywords are ';'-separated. An empty field splits into [""], so blank
    # entries are dropped instead of being tracked as a keyphrase.
    phrases = [k.strip() for k in abstr.keywords.split(";") if k.strip()]
    for kph in phrases:
        keyphraseTracker.track(kph,abstr.date.year)
        keyphraseTracker.registerCoOccurrences(kph,phrases)

In [59]:
"""
Andrew Strickland
Partitions the top 50 keywords in the entire corpus to be exported. These are not new keywords, just those that
appear the most over the course of all the documents.
"""

top50 = keyphraseTracker.topN()
# phrase -> total count, for every keyphrase returned by topN().
keyphraseNodes = dict((entry.getPhrase(), entry.sum()) for entry in top50)
# (phrase, co-occurring phrase, count) for pairs whose both ends are in keyphraseNodes.
keyphraseEdges = [
    (entry.getPhrase(), phr, occur)
    for entry in top50
    for (phr, occur) in entry.cooccurringPhrases.iteritems()
    if phr in keyphraseNodes
]


In [61]:
# Edge table for the top keyphrases.
# `mappedEdges` is created by the cell that writes KeyWords/nodes.csv, which
# appears further down in this file; that cell has to be run first.
with open("../data/documents/GephiFiles/KeyWords/edges.csv","w") as edgeFile:
    edgeFile.write("Source,Destination,Weight\n")
    edgeFile.writelines(
        ",".join([str(id1), str(id2), str(w)]) + "\n"
        for (id1, id2, w) in mappedEdges
    )

In [14]:
"""
Andrew Strickland
This cell splits the keywords from each abstract into three groups. 1974-1989,1990-2004,2005-2018
"""
# Bucket every tracked keyphrase by the year it first occurs:
# up to 1989, 1990-2004, and after 2004.
preq89, preq04, post04 = [], [], []
for word in keyphraseTracker.words.values():
    firstYear = word.firstOccurence()
    if firstYear <= 1989:
        bucket = preq89
    elif firstYear <= 2004:
        bucket = preq04
    else:
        bucket = post04
    bucket.append(word)
# Most frequent keyphrases first within each bucket.
for group in (preq89, preq04, post04):
    group.sort(key=lambda x:x.sum(),reverse=True)

In [17]:
len(post04)

13997

In [19]:
"""
Andrew Strickland
This cell builds the node and edge file for the keywords split up into three groups.
1974-1989, 1990-2004, 2005-2018
"""
# Shared WordMapper: writeNodes, writeEdges and the KeyWords export cell all call
# its mapWord() to turn a keyphrase into the id written to the CSV files.
keyphraseMapper = api.WordMapper()
def writeNodes(path, arr, limit=50, mapper=None):
    """
    Write the node table (Id,Label,Weight) for the first `limit` keyphrases.

    path:   output CSV file.
    arr:    sequence of keyphrase objects exposing `.phrase` and `.sum()`.
    limit:  number of leading entries of `arr` to export (default 50).
    mapper: object with mapWord(phrase) -> id; defaults to the global
            keyphraseMapper.

    Rows go through csv.writer so that a phrase containing a comma or a quote
    stays in a single, properly quoted column.
    """
    if mapper is None:
        mapper = keyphraseMapper
    with open(path,"w") as nodeFile:
        nodeFile.write("Id,Label,Weight\n")
        # lineterminator keeps the '\n' line endings the file always had.
        writer = csv.writer(nodeFile, lineterminator="\n")
        for w in arr[0:limit]:
            writer.writerow([str(mapper.mapWord(w.phrase)), w.phrase, str(w.sum())])

# One node file per period bucket.
for (nodePath, group) in (
    ("../data/documents/GephiFiles/new89/nodes.csv", preq89),
    ("../data/documents/GephiFiles/new04/nodes.csv", preq04),
    ("../data/documents/GephiFiles/new18/nodes.csv", post04),
):
    writeNodes(nodePath, group)
def writeEdges(path,arr, limit=50, mapper=None, tracker=None):
    """
    Write the edge table (Source,Destination,Weight) for the first `limit` keyphrases.

    path:    output CSV file.
    arr:     sequence of keyphrase objects exposing `.cooccurringPhrases`
             (phrase -> count) and `.getPhrase()`.
    limit:   number of leading entries of `arr` to consider (default 50).
    mapper:  object with mapWord(phrase) -> id; defaults to the global keyphraseMapper.
    tracker: object whose `.words` maps a phrase to its keyphrase object;
             defaults to the global keyphraseTracker.

    An edge is written for every co-occurrence whose other end is also one of
    the first `limit` entries of `arr`.
    """
    if mapper is None:
        mapper = keyphraseMapper
    if tracker is None:
        tracker = keyphraseTracker
    # Slice once instead of once per co-occurring phrase.
    top = arr[0:limit]
    keyphraseEdges1 = []
    for yo in top:
        # items() instead of the Python 2-only iteritems(); same pairs.
        for (phr,occur) in yo.cooccurringPhrases.items():
            edgeWord = tracker.words[phr]
            if edgeWord in top:
                sourceId = mapper.mapWord(yo.getPhrase())
                targetId = mapper.mapWord(phr)
                keyphraseEdges1.append((sourceId,targetId,occur))
    with open(path,"w") as edgeFile:
        # NOTE(review): exportResults writes "Source,Target,Weight"; confirm which
        # header the consumer of this file expects.
        edgeFile.write("Source,Destination,Weight\n")
        for (id1,id2,w) in keyphraseEdges1:
            edgeFile.write(str(id1)+","+str(id2)+","+str(w)+"\n")
# One edge file per period bucket.
for (edgePath, group) in (
    ("../data/documents/GephiFiles/new89/edges.csv", preq89),
    ("../data/documents/GephiFiles/new04/edges.csv", preq04),
    ("../data/documents/GephiFiles/new18/edges.csv", post04),
):
    writeEdges(edgePath, group)

In [60]:
"""
Andrew Strickland
"""
# Node table for the top keyphrases: Id comes from the shared keyphraseMapper,
# Label is the phrase and Weight its total count.
with open("../data/documents/GephiFiles/KeyWords/nodes.csv","w") as nodesFile:
    nodesFile.write("Id,Label,Weight\n")
    # csv.writer quotes a phrase that contains a comma or a quote, so it cannot
    # spill into extra columns; lineterminator keeps the '\n' line endings.
    nodeWriter = csv.writer(nodesFile, lineterminator="\n")
    for (phr,occur) in keyphraseNodes.items():
        nodeWriter.writerow([str(keyphraseMapper.mapWord(phr)), phr, str(occur)])
# Same edges as keyphraseEdges with both phrases replaced by their mapper ids;
# the KeyWords/edges.csv cell writes this list out.
mappedEdges = [(keyphraseMapper.mapWord(t[0]),keyphraseMapper.mapWord(t[1]),t[2]) for t in keyphraseEdges]