In [None]:
import string
import re
import os
import tempfile
import logging
from datetime import datetime
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from collections import namedtuple
import gensim.parsing.preprocessing as processing
import CustomApi as api
# ---- Global configuration and shared types ----
# How many topics the LDA model is asked to index.
numberOfTopics = 125
# How many passes over the corpus the model should make.
passes = 10
# Record describing one paper:
#   MyAbstract(title:String, date:datetime.datetime, text:String)
MyAbstract = namedtuple(
    'MyAbstract',
    ["title", "date", "text"],
)
# ---- End of global configuration ----

In [None]:
# Load the tab-separated paper table. Each row becomes a list of column
# strings; the last column keeps its trailing newline because the lines are
# not stripped before splitting.
entries = []
with open("../data/paperTable.tsv","r") as table:
    for line in table:
        # Skip blank lines: they would yield a one-field row that cannot be
        # indexed by column later on.
        if not line.strip():
            continue
        entries.append(line.split('\t'))

In [None]:
# Build the complete stop-word collection before handing it to gensim.
with open("../data/stopwords.txt","r") as stopFile:
    stopWords = stopFile.read().splitlines()
# Copyright sign (it shows up in every abstract and should not be a part of
# the corpus), right single quotation mark and Greek mu. These are unicode
# literals because the cleaned text produced below is unicode; byte-string
# literals would never compare equal to those tokens.
stopWords.extend([u"\u00a9", u"\u2019", u"\u03bc"])
# Domain words that occur in nearly every paper of this collection.
stopWords.extend(["bee","bees","honey","honeybee","honeybees"])
with open("../data/extraStopWords.txt","r") as extraStopFile:
    stopWords.extend(extraStopFile.read().splitlines())
# Frozen set of every non-empty stop word: constant-time membership tests,
# and it now includes the extra stop words as well.
stopList = frozenset(word for word in stopWords if word)
# remove_stopwords() filters against this module-level collection.
processing.STOPWORDS = stopList

def removeStops(text):
    """Lowercase `text`, delete ASCII punctuation and drop stop words.

    text: byte string (Python 2 `str`); the two-argument form of
          `translate` used here only exists on that type.
    Returns the remaining words as produced by gensim's remove_stopwords.
    """
    return processing.remove_stopwords(text.lower().translate(None, string.punctuation))

# Each abstract has a 'title':String, 'date':datetime.datetime, and 'text':String
abstracts = []
for art in entries:
    # strip() removes the newline kept on the last column without chopping a
    # digit when the final row of the file has no trailing newline.
    published = datetime.strptime(art[4].strip(), '%Y-%m-%d')
    # Join the text columns with a space so the last word of one column is
    # not glued onto the first word of the next.
    cleaned = removeStops(" ".join([art[1], art[2], art[3]]))
    abstracts.append(MyAbstract(title=art[0], date=published, text=cleaned))
# Order by publication year (stable: rows of the same year keep file order).
abstracts.sort(key=lambda q: q.date.year)

# Count word frequencies across the whole collection
from collections import defaultdict
frequency = defaultdict(int)
for abst in abstracts:
    # split() without an argument never yields an empty-string token, even
    # for an abstract whose text is empty.
    for token in abst.text.split():
        frequency[token] += 1
# Keep only tokens that occur more than five times in the collection.
processedCorpus = [[token for token in abst.text.split() if frequency[token] > 5] for abst in abstracts]

In [None]:
# Build the token dictionary from the filtered corpus and persist it in the
# system temporary directory as 'words.dict'.
dictionary = corpora.Dictionary(processedCorpus)
tempFolder = tempfile.gettempdir()
dictionary.save(
    os.path.join(tempFolder, 'words.dict')
)

In [None]:
# Create the general bag-of-words corpus and serialize it so it can be
# iterated over from disk.
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
# Written to its own file: the dictionary is saved as 'words.dict' in the
# same folder, and serializing to that name would overwrite it.
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'corpus.mm'), corpus)

The corpus above records how many times each word appears in each individual document. Every word is represented by a token ID; the mapping between token IDs and words is stored in "words.dict".

In [None]:
# Train the model with the configured number of topics and training passes.
# INFO-level logging is enabled so training progress is visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# `passes` is the notebook-level setting for how many passes the model should
# make over the corpus; it has to be handed to the model to take effect.
lda = models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=numberOfTopics,passes=passes)

In [None]:
# Sort the most interesting words per topic per document
# This cell does not need to be run if only trying to create Top Nine terms per paper
#
# Output row: year, title, then for each of the first three document topics
# the topic's term list followed by up to three words of the document whose
# strongest phi value belongs to that topic.
with open("../data/topicorganization.tsv","w") as topicOrganizingFile:
    for abstract in abstracts:
        doc = dictionary.doc2bow(abstract.text.split())
        docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
        topicOrganizingFile.write(str(abstract.date.year)+"\t"+abstract.title+"\t")
        # Rank the document's words once, in descending order of their
        # greatest phi value, remembering which topic that value belongs to.
        rankedWords = []
        for wordId, topicPhis in phiValues:
            if not topicPhis:
                # A word without any phi entry cannot be ranked.
                continue
            bestTopic, bestPhi = max(topicPhis, key=lambda q: q[1])
            rankedWords.append((wordId, bestTopic, bestPhi))
        rankedWords.sort(key=lambda q: q[2], reverse=True)
        # NOTE(review): this takes the first three entries of docTopics as
        # returned by the model, without ordering them by probability —
        # confirm whether the three most probable topics were intended.
        for topicEntry in docTopics[:3]:
            topicnumber = topicEntry[0]
            topicOrganizingFile.write(str(lda.show_topic(topicnumber))+"\t")
            topwords = ""
            taken = 0
            for wordId, bestTopic, bestPhi in rankedWords:
                if bestTopic != topicnumber:
                    continue
                topwords += dictionary[wordId].encode('utf-8').strip()+" "
                taken += 1
                if taken == 3:
                    break
            topicOrganizingFile.write(topwords+"\t")
        topicOrganizingFile.write("\n")

        

In [None]:
# Collect up to 50 terms for every topic and write them to a text file,
# one "Topic N:" section per topic with one word per line.
topicWords = []
for i in range(0,numberOfTopics):
    currentWordList = []
    for termId, weight in lda.get_topic_terms(i,50):
        # Encode explicitly, as the other cells do: str() on a token
        # containing non-ASCII characters raises UnicodeEncodeError.
        word = dictionary[termId].encode('utf-8')
        if word not in currentWordList:
            currentWordList.append(word)
    topicWords.append(currentWordList)
# The with-block closes the file even if a write fails.
with open("../data/TopicWords/List-"+str(numberOfTopics)+".txt","w+") as topicListFile:
    for i, words in enumerate(topicWords):
        topicListFile.write("Topic "+str(i)+":\n")
        for word in words:
            topicListFile.write(word+'\n')
        topicListFile.write('\n')

In [None]:
#Makes the top nine terms for each document
#
# Output row: year, title, then the nine words of the document with the
# greatest phi values, separated by spaces.
with open("../data/Docbow/TopNineTerms-"+str(numberOfTopics)+".tsv","w") as topNineFile:
    for abstr in abstracts:
        # Convert this abstract's text to bag of words format first
        doc = dictionary.doc2bow(abstr.text.split())
        # Get the topics and words associated with the document
        docTopics, wordTopics, phiValues = lda.get_document_topics(doc, per_word_topics=True)
        topNineFile.write(str(abstr.date.year)+"\t"+abstr.title+"\t")
        # Rank words by their greatest phi value; words without any phi
        # entry are skipped because they cannot be ranked.
        rankedWords = []
        for wordId, topicPhis in phiValues:
            if topicPhis:
                rankedWords.append((wordId, max(phi for topicId, phi in topicPhis)))
        rankedWords.sort(key=lambda q: q[1], reverse=True)
        # Nine terms, matching the file name and the purpose of this cell.
        nineWords = u"".join(dictionary[wordId] + u" " for wordId, phi in rankedWords[:9])
        topNineFile.write(nineWords.encode('utf-8')+"\n")

In [None]:
#Saves the top 5 topics and top 3 words per topic
#
# Output row: "<year>:" followed by three words for each of the document's
# five most probable topics, tab separated.
docs = [dictionary.doc2bow(a.text.split()) for a in abstracts] # Convert to bag of words format first
with open("../data/Docbow/Top5TopicsTop3WordsTop="+str(numberOfTopics)+".tsv","w") as outFile:
    for abstract, bow in zip(abstracts, docs):
        topics = lda.get_document_topics(bow=bow)
        # Most probable topics first.
        topics = sorted(topics,key=lambda k:k[1], reverse=True)
        outFile.write(str(abstract.date.year)+":\t")
        # [:5] selects five topics; the previous [0:4] slice only gave four.
        for topicId, probability in topics[:5]:
            for termId, weight in lda.get_topic_terms(topicId,topn=3):
                outFile.write(dictionary[termId].encode('utf-8')+"\t")
        outFile.write("\n")

In [None]:
# Scratch check of a binary-search helper on a small list of pairs.
# NOTE(review): `algos` is not imported in any cell visible here (only
# `CustomApi as api` is), so on a fresh kernel this call raises NameError —
# confirm where BinarySearch lives, or delete this cell.
alist = [(1,1),(2,2),(3,3),(3,3)]
# Accessor handed to the search: selects the second item of each pair.
access = lambda x: x[1]
algos.BinarySearch(alist,term=3,start=0,stop=len(alist)-1,accessElement=access)
#alist[0]

In [7]:
# Display the final record of the sorted abstracts list.
abstracts[-1]

MyAbstract(title='Pharmacogenomic Biomarkers for Improved Drug Therapy\xe2\x80\x94Recent Progress and Future Developments', date=datetime.datetime(2018, 1, 1, 0, 0), text=u'much interindividual variability efficacy adverse reactions due polymorphisms genes encoding proteins involved pharmacokinetics pharmacodynamics immunological responses pharmacogenetic identified multitude genedrug associations resulted genetically guided dosing decisions yield success rate pharmacological rapid technological developments genetic analyses reveal genetic variants importance action much previously thought true personalized prediction requires attention millions rare mutations review evolutionary background genetic polymorphisms drugmetabolizing enzymes provide examples current pharmacogenomic biomarkers give update germline somatic genome biomarkers clinical practice discuss current technology emphasis complex genetic loci review current initiatives validation pharmacogenomic biomarkers scenarios futu