In [46]:
import re
import os
import tempfile
from collections import defaultdict
from gensim import corpora
from gensim import models
from gensim.models import TfidfModel

In [47]:
# Load the paper table: one tab-separated record per line, split into fields.
# The trailing newline is intentionally left on the last field of each row;
# the cell that extracts the year column is responsible for removing it.
# `with` guarantees the handle is closed even if reading raises.
with open("../data/paperTable.tsv","r") as table:
    # Skip blank lines (e.g. a trailing empty line) so they do not produce
    # one-field rows that break the column indexing in the next cell.
    entries = [line.split('\t') for line in table if line.strip()]

In [48]:
# Read in abstract year, title, and text as three parallel lists
# (index i in each list refers to the same row of `entries`).
abstracts = []
titleOfAbstract = []
yearOfAbstract = []
for articles in entries:
    titleOfAbstract.append(articles[0])
    # Join the three text columns with a space: plain concatenation glued the
    # last word of one column to the first word of the next, producing a bogus
    # token once the text is split on whitespace.
    abstracts.append(" ".join((articles[1], articles[2], articles[3])))
    # Strip only line terminators instead of blindly dropping the last
    # character, which would eat a digit of the year when the final line of
    # the file has no trailing newline (and would leave '\r' on CRLF files).
    yearOfAbstract.append(articles[4].rstrip("\r\n"))

In [49]:
# Create a set of frequent words (stop words), one per line in the file.
# `with` closes the handle, which the original code never did.
with open("../data/stopWords.txt","r") as stopFile:
    stopWords = stopFile.read().splitlines()
stopWords.append("\xc2\xa9") #This is the copyright symbol, this shows up in every abstract and should not be a part of the corpus
stopWords.extend(["\u2019","\u03bc"])

stopList = set(stopWords)
# NOTE(review): punctuation is stripped from each word *before* the stop-word
# lookup below, so stop-list entries that contain punctuation characters can
# never match a token -- confirm the stop-word file accounts for this.

# Compiled once up front rather than once per word inside the loop.
nonWordChars = re.compile(r'[^\w\s]')
isNumber = re.compile('^[0-9]+$')

# Lowercase each document, split it by white space and filter out stopWords
texts = []
for document in abstracts:
    docwords = []
    for word in document.lower().split():
        # Drop everything that is neither a word character nor whitespace.
        # This already removes every '.', so the former second substitution
        # for trailing periods was a no-op and has been removed.
        word = nonWordChars.sub('', word)
        # Discard tokens that are empty, purely numeric, or stop words.
        if not word or isNumber.search(word) or word in stopList:
            continue
        docwords.append(word)
    texts.append(docwords)

# Count word frequencies across the whole collection
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur more than 5 times in total.
processedCorpus = [[token for token in text if frequency[token] > 5] for text in texts]

In [63]:
# Create the gensim dictionary and bag-of-words corpus from the processed
# token lists, persisting both to the system temp directory.
tempFolder = tempfile.gettempdir()
dictionary = corpora.Dictionary(processedCorpus)
dictionary.save(os.path.join(tempFolder,'words.dict'))
corpus = [dictionary.doc2bow(text) for text in processedCorpus]
# Serialize the corpus to its own file. It was previously written to
# 'words.dict', which overwrote the dictionary saved two lines above.
corpora.MmCorpus.serialize(os.path.join(tempFolder, 'words.mm'), corpus)

In [64]:
# Fit TF-IDF weights on the bag-of-words corpus.
tfidf = TfidfModel(corpus=corpus, id2word=dictionary)
# Build a 25-topic LSI model on top of the TF-IDF-weighted corpus.
lsi = models.LsiModel(corpus=tfidf[corpus], num_topics=25, id2word=dictionary)

In [65]:
# For every document, collect its nine highest-weighted TF-IDF terms.
# topNine[i] is the list of terms for document i, best first.
topNine = []
for doc in tfidf[corpus]:
    # Map each term string to its TF-IDF score for this document.
    temp = {}
    for idd,score in doc:
        temp[str(dictionary[idd])] = score
    # Rank by score, breaking ties by term, both descending. `items()` and an
    # indexing lambda replace `iteritems()` and the tuple-parameter lambda,
    # which only exist in Python 2; this form behaves the same on 2 and 3.
    ranked = sorted(temp.items(), key=lambda item: (item[1], item[0]), reverse=True)[:9]
    topNine.append([item[0] for item in ranked])

In [61]:
# Write one line per document: year <TAB> title <TAB> top terms <TAB>.
# NOTE(review): topNine is derived from the TF-IDF weights, not from the LSI
# model, so the "-25" in the file name does not appear to affect the content
# -- confirm the naming is intended.
# `with` guarantees the file is flushed and closed even if a write fails.
with open("../data/TFIDF/TopNineTerms-25.tsv","w") as outputFile: #Change number for number of topics
    for x, terms in enumerate(topNine):
        # Every term is followed by a single space (including the last one),
        # matching the format this cell has always produced.
        writeLine = "".join(term + " " for term in terms)
        outputFile.write(yearOfAbstract[x]+"\t"+titleOfAbstract[x]+"\t"+writeLine+"\t\n")