In [36]:
import pandas as pd
from nltk.corpus import stopwords
import re

def retrieve_sentences_for_modeling(inFile, fid):
    tsv = pd.read_csv(inFile, sep='\t')
    sentences = []

    sw = stopwords.words('english')
    regex1 = re.compile(r"[\(\)\{\}\[\]\;\.\'\"\,\/\_\*]", re.IGNORECASE)
    regex2 = re.compile(r"\s+", re.IGNORECASE)

    allHits = 0
    hits = 0
    j = 0
    for i, row in tsv.iterrows():
        sid = row['SentenceId']
        codeStr = row['Codes']
        paragraph = row['Paragraph']
        text = row['Sentence Text']
        heading = row['Headings']
        floatingBox = row['FloatingBox?']
        discourse = row['Discourse Type']
        reachData = row['friesEventsTypes']

        j += 1
        if (reachData == reachData):
            allHits += 1

        if (heading != heading):
            heading = ""

        if (floatingBox):
            continue

        if (('implication' not in discourse) and
                    'result' not in discourse):
            continue

        if 'exLink' in codeStr:
            continue

        if ('methods' in str(heading).lower()):
            continue

        r = 'X'
        if (reachData != reachData):
            r = '0'

        if (reachData == reachData):
            hits += 1

        # print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text )

        text = re.sub(regex1, "", text)
        sent = regex2.split(text)
        sent = [w for w in sent if w not in sw and len(w)>0]
        tup = (fid, sid, sent)
        sentences.append(tup)

    return sentences

In [40]:
import os
import sys
from ipywidgets import FloatProgress
from IPython.display import display

inDir = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4/'
sent_tup_list = []

f = FloatProgress(min=0, max=100)
display(f)

sent_list = []
for fn in os.listdir(inDir):
    infile = inDir + "/" + fn
    if (os.path.isfile(infile) and fn.endswith('.tsv')):
        fid = fn.replace(".tsv", "")
        f.value += 1
        for tup in retrieve_sentences_for_modeling(infile, fid):
            sent_list.append(tup[2]);

In [41]:
from gensim import corpora

dictionary = corpora.Dictionary(sent_list)
#dictionary.save('/tmp/deerwester.dict')


In [42]:
corpus = [dictionary.doc2bow(sent) for sent in sent_list]

In [45]:
dictionary.save(inDir + '/sent.dict')
corpora.MmCorpus.serialize(inDir + '/sent.mm', corpus)

In [48]:
mm = corpora.MmCorpus(inDir + '/sent.mm')

In [50]:
print(mm)

MmCorpus(88353 documents, 72629 features, 1287192 non-zero entries)


In [54]:
from gensim.models.ldamodel import LdaModel
lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)

In [56]:
lda.print_topics(100)

[(0,
  u'0.062*"family" + 0.045*"defects" + 0.043*"members" + 0.027*"act" + 0.022*"proteins" + 0.017*"neurite" + 0.016*"subset" + 0.014*"chaperone" + 0.013*"percentage" + 0.013*"pronounced"'),
 (1,
  u'0.058*"even" + 0.042*"detect" + 0.039*"possibility" + 0.027*"partners" + 0.022*"abundance" + 0.022*"proteins" + 0.020*"interaction" + 0.020*"though" + 0.019*"We" + 0.019*"physical"'),
 (2,
  u'0.111*"residues" + 0.076*"Figures" + 0.054*"site" + 0.052*"conserved" + 0.042*"highly" + 0.034*"binding" + 0.022*"The" + 0.018*"critical" + 0.017*"region" + 0.016*"peptide"'),
 (3,
  u'0.058*"subunits" + 0.047*"subunit" + 0.036*"catalytic" + 0.033*"vesicles" + 0.024*"marker" + 0.023*"infected" + 0.018*"myostatin" + 0.017*"added" + 0.017*"module" + 0.016*"latent"'),
 (4,
  u'0.110*"Figure" + 0.081*"S1" + 0.056*"1A" + 0.024*"weak" + 0.021*"2E" + 0.019*"embryos" + 0.018*"potentially" + 0.018*"alanine" + 0.018*"dephosphorylation" + 0.015*"upregulated"'),
 (5,
  u'0.078*"cells" + 0.059*"endogenous" + 0.