In [1]:
import string
import re
import os
import tempfile
import logging
import csv
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from collections import namedtuple
import gensim.parsing.preprocessing as processing
from os import listdir
import CustomApi as api
from scipy import spatial
#Start Global Variables and Types

#document dir
docs = "../data/documents/"

#Number of topics the model should index
numberOfTopics = 150

#Number of passes the model should make
passes = 75

#Keyphrase tracker
keyphraseTracker = api.KeyWordTracker()

#End Global Variables and Types
"""
Python 2.7 compatible
Commentary and portions of code Created by Andrew Strickland July 2018

Notice:
- A plenty of modifications have been made to this workbook between the dates of 06/10/2018-08/15/2018
- The goal of this program is to lexically analyze research abstracts to form connections and statistical intuitions concerning the subject of
- honeybee colony colapse.

Mentions:
- This project is designed to export files that are interoperable with Gephi, and the gephi companion "GraphBuilder.jar" designed by Andrew Strickland
- For detailed instruction on how to use the gephi companion, see the README.txt file under /GraphBuilder/src/
- For any cell that has my name, Andrew Strickland, in its header, I am available to give reasonable aid to 
- a reasonable misunderstanding. Refer limited questions to apstrick@uncg.edu
"""

'\nPython 2.7 compatible\nCommentary and portions of code Created by Andrew Strickland July 2018\n\nNotice:\n- A plenty of modifications have been made to this workbook between the dates of 06/10/2018-08/15/2018\n- The goal of this program is to lexically analyze research abstracts to form connections and statistical intuitions concerning the subject of\n- honeybee colony colapse.\n\nMentions:\n- This project is designed to export files that are interoperable with Gephi, and the gephi companion "GraphBuilder.jar" designed by Andrew Strickland\n- For detailed instruction on how to use the gephi companion, see the README.txt file under /GraphBuilder/src/\n- For any cell that has my name, Andrew Strickland, in its header, I am available to give reasonable aid to \n- a reasonable misunderstanding. Refer limited questions to apstrick@uncg.edu\n'

In [2]:
"""
Andrew Strickland
Theses are the names of the csv files containing all of our data.
"""
files = listdir(docs+"lt2006/")
files1 = listdir(docs+"gt2006/")
entries = []
"""
Andrew Strickland
Reads each csv file in and adds it to our list of data so long as it has and abstract title and isnt the legend row (aka Authors, Date, Abstract text... the very first row)
"""
for file in files:
    with open(docs+"lt2006/"+file,"rb") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            #If its not the first row and it has an abstract, read it in.
            if "Authors" not in row[0] and row[15] !='[No abstract available]':
                entries.append(row)
for file in files1:
    with open(docs+"gt2006/"+file,"rb") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if "Authors" not in row[0] and row[15] !='[No abstract available]':
                entries.append(row)
        

In [3]:
"""
Josh Moore
Use synonyms to replace words. These sysnonyms are listed under the csv file cleanSynonym.csv
"""

syns = {}
syn_topics = []
with open("../data/cleanSynonyms.csv","r") as syn_file:
    syn_file.readline()
    for line in syn_file:
        if len(line.split(",")[1])>1:
            syn_topics.append(line.split(",")[1].strip())
            syns[line.split(",")[0].strip()]=line.split(",")[1].strip()
        else:
            syn_topics.append(line.split(",")[0].strip())
            
def syn_replace(string):
    temp = string
    for word in syns:
        temp = temp.replace(word,syns[word])
    return temp

In [4]:
"""
Andrew Strickland
Stems the text of the abstract and removes stop words. The abstracts are then grouped into "selections" where each selection
is just a grouping of no less than 50 abstracts. These selections allow us to run a model over a small selection (usually results in being 1 pub. year)
of abstracts and compare years and perform analysis on the trends and changes between years.
"""
# Create a set of frequent words
stopFile = open("../data/stopwords.txt","r")
stopWords = stopFile.read().splitlines() #This is the copyright symbol, this shows up in every abstract and should not be apart of the corpus
stopWords.extend([u"\u2019",u"\u03bc","bee","bees","honey","honeybee","honeybees",u"\xa9",u"\xc2"])
# for asc in range(97,123):
#     stopWords.extend([chr(asc)])
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().split("\n"))
# Lowercase each document, split it by white space and filter out stopWords
stopList = set(stopWords)
processing.STOPWORDS = stopList
ps = PorterStemmer()
def removeStops(text):
    stopsRemoved = processing.remove_stopwords(text.lower().translate(None, string.punctuation))
    words = stopsRemoved.split(" ")
    stemmedWords = []
    for w in words:
        if len(ps.stem(w)) > 2:
            stemmedWords.append(ps.stem(w))
    return ' '.join(stemmedWords)



# Each abstract has a 'title':String, 'date':datetime.datetime, 'text':String, and 'keywords':String
abstracts = [api.MyAbstract._make([art[1],datetime.strptime(art[2], '%Y'),syn_replace(removeStops(art[15])), syn_replace(art[16])]) for art in entries]
abstracts.sort(key=lambda q: q.date.year)
entries = None
# Count word frequencies
selections = []

access = lambda x: x.date.year
lastIndex = 0
bettersel = {}
for i in range(1957,2018):
    index = api.binarySearch(abstracts,i,access)
    if  index != -1:
        selections.append(abstracts[lastIndex:index+1])
        lastIndex = index+1

        

In [5]:
"""
Josh Moore
Creates the topic vectors for the corpus and 
"""

x = ""
for i in selections:
    x+=i[0][2]
d = corpora.Dictionary([x.split(" ")])
common_corpus = [d.doc2bow(x.split(" ")) for text in x]

lda = models.ldamodel.LdaModel(common_corpus, num_topics=numberOfTopics, minimum_probability=0.0)

corpi = {}
count = 0
for x in selections:
    corpi[count] = ""
    for y in x:
        corpi[count]+=y[2]+" "
    count+=1

# Divides the corpus into three groups based on year

split = []
temp = ""
for x in range(len(corpi)-6,len(corpi)):
    temp+=corpi[x]
split.append(temp)

temp = ""
for x in range(len(corpi)-14,len(corpi)-6):
    temp+=corpi[x]
split.append(temp)

temp = ""
for x in range(0,len(corpi)-14):
    temp+=corpi[x]
split.append(temp)

vectors = []
for key in split:
    cur_corpus = [[d.doc2bow(key.split(" "))]]
    for x in lda[cur_corpus[0]]:
        vectors.append(x)

just_nums = [[x[1] for x in vectors[c]]for c in range(0,len(vectors))]

for x in range(0,len(just_nums)-1):
    print (1 - spatial.distance.cosine(just_nums[x], just_nums[x+1]))

0.9745582342147827
0.9734410047531128


In [6]:
"""
Andrew Strickland
"""
from collections import defaultdict
def createCorpus(selection):
    frequency = defaultdict(int)
    for abst in abstracts:
        for token in abst.text.split(" "):
            frequency[token] += 1
    tempFolder = tempfile.gettempdir()
    processedCorpus = [[token for token in abst.text.split(" ") if frequency[token] > 5] for abst in selection]
    dictionary = corpora.Dictionary(processedCorpus)
    dictionary.save(os.path.join(tempFolder,'words.dict'))
    # Create general corpus and serialize in order for it to be iterated over
    corpus = [dictionary.doc2bow(text) for text in processedCorpus]
    corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.dict'), corpus)
    return api.MyCorpora._make([corpus,dictionary])
    

In [7]:
"""
Andrew Strickland
"""
numTopicsExport = 50
def exportResults(path, sortedVals,wordDict):
    seg = sortedVals[0][0:numTopicsExport]
    #Edges and their weights [(a,b):weight]
    edges = sortedVals[1]
    # set of (a,b)
    exportEdges = set({})
    exportNodes = set({})
    for node in seg:
        exportNodes.add((node["tag"],node["word"],node["occurences"]))
        for e in node["edges"]:
            a = wordDict[e[0]]
            b = wordDict[e[1]]
            if a in seg and b in seg:
                exportEdges.add(e)
                exportNodes.add((a["tag"],a["word"],a["occurences"]))
                exportNodes.add((b["tag"],b["word"],b["occurences"]))
            
    with open(path+"nodes.csv","w") as file:
        file.write("Id,Label,Weight\n")
        for val in exportNodes:
            file.write(str(val[0])+","+val[1].encode('utf8')+','+str(val[2])+'\n')
    with open(path+'edges.csv',"w") as file:
        file.write("Source,Target,Weight\n")
        #We dont want to have duplicate edges written, so we want to make an aedge buffer and then write the buffer
        for e in exportEdges:
            file.write(str(e[0])+","+str(e[1])+","+str(edges[e])+"\n")

In [8]:
"""
Andrew Strickland
"""
import math as m
def getResults(topics):
    mapper = api.WordMapper()
    #Edges [(a,b):weight]
    edges = {}
    def updateEdgeFreq(vec1):
        if vec1[0] == vec1[1]:
            return None
        altVec = [vec1[1],vec1[0]]
        if vec1 in edges.keys():
            edges[vec1] += 1
            return vec1
        elif altVec in edges.keys():
            edges[altVec] += 1
            return altVec
        else:
            edges[vec1] = 1
            return vec1
    occurences = {}
    topics[len(topics)-1]
    for topic in topics:
        words = topic[0]
        #make all the edges for this topic
        wordEdges = []
        wid = mapper.mapWord(words[0][1])
        for t in words:
            edge = updateEdgeFreq((wid,mapper.mapWord(t[1])))
            if edge != None:
                wordEdges.append(edge)
        #add the edges to each word and and update the number of occurences per word
        for w in words:
            word = w[1]
            wid = mapper.mapWord(word)
            if wid not in occurences.keys():
                occurences[wid] = {"prob":w[0],"occurences":1,"edges":wordEdges, "tag" : wid, "word":word}
            else:
                occurences[wid]["occurences"] += 1
                occurences[wid]["prob"] = max(occurences[wid]["prob"], w[0])
    return [occurences,edges]


In [9]:
%%capture
"""
Andrew Strickland
"""
from difflib import SequenceMatcher
size = 0
start = 0
import os
occurences = None
try:
    for i in range(0,len(selections)):
        #keyphraseTracker = api.KeyWordTracker()
        size += len(selections[i])
        if size >=50:
            selection = api.flaten(selections[start:i+1])
            for abstr in selection:
                phrases = [k.strip() for k in  abstr.keywords.split(";")]
                keyPhrases = []
                for phr in phrases:
                    words = [ps.stem(w) if all(ord(c) < 128 for c in w) else None for w in phr.split(" ")]
                    string = ""
                    for w in words:
                        if w != None:
                            string = string +" "+ w
                    string = string.strip()
                    if len(string) > 2:
                        keyPhrases.append(string)
                for kph in keyPhrases:    
                    keyphraseTracker.track(kph,abstr.date.year)
                    keyphraseTracker.registerCoOccurrences(kph,keyPhrases)
            #continue to next section
            corpus = createCorpus(selection)
            path = docs+'GephiFiles/'+str(selection[0].date.year)+'/'
            
            # Make this TFIDF
            tfidf = TfidfModel(corpus.corpus,id2word=corpus.id2word)
            
            
            logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
            
            
#             lda = models.ldamodel.LdaModel(corpus.corpus,id2word=corpus.id2word,num_topics=numberOfTopics,passes=passes)
            lda = models.ldamodel.LdaModel(tfidf[corpus.corpus],id2word=corpus.id2word,num_topics=numberOfTopics,passes=passes)
            
            topics = lda.top_topics(corpus=corpus.corpus,dictionary=corpus.id2word,topn=3)
            
            topics.sort(key=lambda k:k[1],reverse=True)
            occurences = getResults(topics)
            sortedVals = [sorted(occurences[0].values(),key=lambda k:k["occurences"],reverse=True),occurences[1]]
            #####Release some of this memory please and thank you
            selection = None
            #occurences = None
            #####
            exportResults(path,sortedVals,occurences[0])
            size = 0
            start = i+1
except Exception as e:
    #this just sends me an email when the process fails.
    os.system("emailme \"Failed come check it out: "+str(e)+" \"")

In [10]:
"""
Andrew Strickland
Tracks the usage of words over the entire corpus. For detailed functionality see the KeyWordTracker class in 
the CustomApi.py file. Registers the co-occurrence of each phrase with the other phrases in the abstract description.
"""
keyphraseTracker = api.KeyWordTracker()
for abstr in abstracts:
    phrases = [k.strip() for k in  abstr.keywords.split(";")]
    for kph in phrases:    
        keyphraseTracker.track(kph,abstr.date.year)
        keyphraseTracker.registerCoOccurrences(kph,phrases)

In [11]:
"""
Andrew Strickland
Partitions the top 50 keywords in the entire corpus to be exported. These are not new keywords, just those that
appear the most over the course of all the documents.
"""

top50 = keyphraseTracker.topN()
keyphraseNodes = {}
for yo in top50:
    keyphraseNodes[yo.getPhrase()] = yo.sum()
keyphraseEdges = []
for yo in top50:
    for (phr,occur) in yo.cooccurringPhrases.iteritems():
        #rint (phr,occur)
        if phr in keyphraseNodes.keys():
            keyphraseEdges.append((yo.getPhrase(),phr,occur))


In [12]:
print len(keyphraseNodes)

150


In [14]:
"""
Andrew Strickland
This cell splits the keywords from each abstract into three groups. 1974-2004,2005-2012,2013-2018
"""
pre04 = []
post04 = []
post12 = []
notfirst89 = []
notfirst04 = []
notfirst18 = []
yearly = {}
allKeys = {}
for word in keyphraseTracker.words.values():
    allKeys[word.phrase.decode('utf-8')]=word.sum()
    for key in word.years:
        if key <= 2004 and word not in notfirst89:
            notfirst89.append(word)
        elif key <= 2012 and key > 2004 and word not in notfirst04:
            notfirst04.append(word)
        elif key > 2012 and word not in notfirst18:
            notfirst18.append(word)
    if any(a < 2004 for a in word.years):
        pre04.append(word)
    if any(a <= 2012 and a>=2004 for a in word.years):
        post04.append(word)
    if any(a > 2012 for a in word.years):
        post12.append(word)
pre04.sort(key=lambda x:x.sum(),reverse=True)
post04.sort(key=lambda x:x.sum(),reverse=True)
post12.sort(key=lambda x:x.sum(),reverse=True)
notfirst89.sort(key=lambda x:x.sum(),reverse=True)
notfirst04.sort(key=lambda x:x.sum(),reverse=True)
notfirst18.sort(key=lambda x:x.sum(),reverse=True)

In [15]:
with open("../data/KeywordOccurance.tsv","w") as f:
    f.write("Key\tValue\n")
    for key in allKeys:
        f.write(key.encode("utf-16")[2:]+"\t"+str(allKeys[key])+"\n")


In [22]:
"""
Andrew Strickland
This cell builds the node and edge file for the keywords split up into three groups.
1974-2004,2005-2012,2013-2018
"""
keyphraseMapper = api.WordMapper()
def writeNodes(path, arr):
    with open(path,"w") as nodeFile:
        nodeFile.write("Id,Label,Weight\n")
        for w in arr[0:50]:
            id = keyphraseMapper.mapWord(w.phrase)
            nodeFile.write(str(id)+","+w.phrase+","+str(w.sum())+"\n")
writeNodes("../data/documents/GephiFiles/new89/nodes.csv",pre04)
writeNodes("../data/documents/GephiFiles/new04/nodes.csv",post04)
writeNodes("../data/documents/GephiFiles/new18/nodes.csv",post12)
def writeEdges(path,arr):
    keyphraseEdges1 = []
    for yo in arr[0:50]:
        for (phr,occur) in yo.cooccurringPhrases.iteritems():
            edgeWord = keyphraseTracker.words[phr]
            if edgeWord in arr[0:50]:
                id = keyphraseMapper.mapWord(yo.getPhrase())
                id2 = keyphraseMapper.mapWord(phr)
                keyphraseEdges1.append((id,id2,occur))
    with open(path,"w") as edgeFile:
        edgeFile.write("Source,Destination,Weight\n")
        for (id1,id2,w) in keyphraseEdges1:
            edgeFile.write(str(id1)+","+str(id2)+","+str(w)+"\n")
writeEdges("../data/documents/GephiFiles/new89/edges.csv",pre04)
writeEdges("../data/documents/GephiFiles/new04/edges.csv",post04)
writeEdges("../data/documents/GephiFiles/new18/edges.csv",post12)  

In [23]:
"""
Andrew Strickland
"""
with open("../data/documents/GephiFiles/KeyWords/nodes.csv","w") as nodesFile:
    nodesFile.write("Id,Label,Weight\n")
    for (phr,occur) in keyphraseNodes.iteritems():
        id = keyphraseMapper.mapWord(phr)
        nodesFile.write(str(id)+","+phr+","+str(occur)+"\n")
mappedEdges = [(keyphraseMapper.mapWord(t[0]),keyphraseMapper.mapWord(t[1]),t[2]) for t in keyphraseEdges]

In [24]:
"""
Josh Moore
This function returns the edges for a given set of topics
"""
def give_Edges(arr):
    translate89 = {}
    for phrase in arr[:150]:
        for p in phrase.cooccurringPhrases.iteritems():
            translate89[str(keyphraseMapper.mapWord(p[0]))]=p[0]
    #         print p[0]+" "+str(keyphraseMapper.mapWord(p[0]))
    edges89 = []
    for node in arr[:150]:
        for (phr,occur) in node.cooccurringPhrases.iteritems():
                edgeWord = keyphraseTracker.words[phr]
                if edgeWord in arr[:150]:
                    id = node.getPhrase()
                    id2 = translate89[str(keyphraseMapper.mapWord(phr))]
                    edges89.append((id,id2,occur))
    return edges89

In [78]:
"""
Josh Moore
This returns the network figures that connects nodes based on how many appereances the keywords make in the corpus
"""

import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
plt.figure(figsize=(15,15))
current_set = post12

nodes = []
weights = []
f = open("../data/post12top10.csv","w")
count = 0
while len(nodes)<11:
    phrase = current_set[count]
    if not any(x.lower() in phrase.getPhrase().lower() for x in syns):
        if u'\u03b1' not in phrase.phrase.decode('utf-8').strip():
            f.write(phrase.phrase.decode('utf-8').strip()+","+str(phrase.sum())+"\n")
            nodes.append(phrase.phrase.decode('utf-8').strip())
            weights.append(phrase.sum()*10)
    count+=1
f.close()
g = nx.Graph()

g.add_nodes_from(nodes)
pos = nx.spring_layout(g)
temp = give_Edges(current_set)
toDel = []
for x in range(0,len(temp)):
#     print temp[x]
    if not temp[x][0] in nodes or not temp[x][1] in nodes :
        toDel.append(x)
for x in reversed(toDel):
    del temp[x]
temp.sort(key=lambda x: x[2], reverse=True)

g.add_weighted_edges_from(temp)

d = nx.degree(g)
labels = {x:x for x in nodes}

nx.draw_networkx_labels(g,pos,labels,font_size=14)
plt.axis('off')
nx.draw(g,pos,node_size=weights,edge_color='blue')
plt.close()

In [71]:
"""
Josh Moore
Creates files for first occurence per year figures
"""
arr = sorted(keyphraseTracker.words.values(),key=lambda k:k.sum(),reverse=True)
first_year = {}
for a in arr:
    if a.firstOccurence() not in first_year:
        first_year[a.firstOccurence()] = []
    first_year[a.firstOccurence()].append(a)
for x in first_year:
    first_year[x].sort(key=lambda k:k.sum())
for x in first_year:
    with open("../figures/"+str(x)+"firstOccurence.csv","w") as f:
        for y in first_year[x][:25]:
            f.write(y.getPhrase()+","+str(y.sum())+"\n")