# Data Processing for Topic Model Test

Download the data from the repository. Skip these commands if you already have the data!

!apt-get -y install curl

!curl -o BioMedSent/BioMedSentences.tar.zip http://i.stanford.edu/hazy/opendata/bmc/bmc_full_dddb_20150927_9651bf4a468cefcea30911050c2ca6db.tar.bzip2

http://i.stanford.edu/hazy/opendata/pmc/pmc_dddb_full_20150927_3b20db570e2cb90ab81c5c6f63babc91.tar.bzip2


# Import Data

This section defines the Sentence object used when importing and saving the data. Grab the files in a directory and process a subset of them.


In [1]:
#Import Statements
import string
import os
import pickle
from collections import defaultdict
from multiprocessing import Pool

#Sentence object definition for data import and processing
class Sentence:
    """One tokenized sentence of a document.

    Attributes:
        document: identifier of the document the sentence belongs to.
        sentenceNumber: the sentence's number within its document, stored
            exactly as given by the caller.
        wordList: list of the sentence's word tokens.
        lemmaList: list of the sentence's lemma tokens.
        posList: part-of-speech tags, stored exactly as given.
        sentence: the word tokens joined by single spaces, with punctuation
            tokens left out.
        lemmaSent: the lemma tokens joined the same way.
    """

    # Individual punctuation characters. A token is treated as punctuation
    # when every one of its characters is in this set.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, document, sentenceNumber, wordList, lemmaList, posList):
        self.document = document
        self.sentenceNumber = sentenceNumber
        # Copy the tokens into lists so that a lazy iterable (for example a
        # Python 3 filter object) is not exhausted by the joins below and
        # still supports len() and indexing afterwards.
        self.wordList = list(wordList)
        self.lemmaList = list(lemmaList)
        self.posList = posList
        self.sentence = self._joinWithoutPunctuation(self.wordList)
        self.lemmaSent = self._joinWithoutPunctuation(self.lemmaList)

    def _joinWithoutPunctuation(self, tokens):
        """Return the tokens joined by spaces, skipping punctuation tokens.

        A plain ``token not in string.punctuation`` is a substring test and
        lets multi-character punctuation such as "..." or "--" through, so
        each character is checked instead. Empty tokens are skipped as well.
        """
        punctuation = self._PUNCTUATION
        return " ".join(
            token for token in tokens
            if not all(ch in punctuation for ch in token)
        )

#Get the files we want to process and put them into a list of lists called sentList
dataDir = "../PubMed/pmc_dddb_full"
fileList = sorted(os.listdir(dataDir))
sentList = []
# Only the file at position 27 of the sorted directory listing is read;
# widen the range to process more files.
for n in range(27, 28):
    # The with-block closes the file even if reading fails part-way.
    with open(os.path.join(dataDir, fileList[n]), 'r') as f:
        for line in f:
            # Each line is one sentence with tab-separated columns. Drop the
            # line terminator first so it does not stay attached to the
            # last column.
            sentList.append(line.rstrip('\n').split('\t'))

len(sentList)

1837864


Now that all of the sentences are in a list of lists, take the first element of each sentence's list (the document id) and append it to docList. Converting docList to a set then gives the number of unique documents.


In [2]:
# Column 0 of every sentence row is its document id.
docList = [row[0] for row in sentList]

# Number of distinct documents among the loaded sentences.
len(set(docList))

9981

# Process Data

Define the processSent function for use by the multiprocessing part of the code. This function strips the array markup from the word and lemma fields (removing the `{`, `}`, and `"` characters) and builds a Sentence object from the relevant fields.

We then create a Pool of 16 worker processes and apply the processSent function to every sentence.

In [None]:
sentObjList = []

def _parseTokenArray(field):
    """Split a brace-wrapped, comma-separated field into its non-empty tokens.

    Double quotes are removed, leading "{" and trailing "}" characters are
    stripped, and the remainder is split on commas. A real list is returned
    (not a lazy filter object) so callers can take len() of it and index it
    on both Python 2 and Python 3.
    """
    stripped = field.replace('"', "").lstrip("{").rstrip("}")
    return [token for token in stripped.split(",") if token]

def processSent(item):
    """Build a Sentence from one row of tab-separated columns.

    Args:
        item: list of column strings for one sentence. Column 0 is passed on
            as the document id and column 1 as the sentence number; columns
            3, 4 and 6 are parsed as the word, part-of-speech and lemma
            arrays.

    Returns:
        The Sentence built from those columns.

    Raises:
        IndexError: if the row has fewer than seven columns.
    """
    wordList = _parseTokenArray(item[3])
    # NOTE(review): the POS column is only split on commas; unlike the word
    # and lemma columns, its quotes and braces are kept. Confirm whether
    # that is intended before relying on posList.
    posList = item[4].split(",")
    lemmaList = _parseTokenArray(item[6])
    return Sentence(item[0], item[1], wordList, lemmaList, posList)

# Parse every sentence in parallel on 16 worker processes. Pool.map sends
# the rows to the workers in batches and returns the results in input
# order, which avoids creating one task and one result handle per sentence.
po = Pool(16)
try:
    output = po.map(processSent, sentList)
finally:
    # Always shut the workers down, even if a row failed to parse.
    po.close()
    po.join()
sentObjList = output
sentObjList[7].lemmaSent


Now that the sentences are processed, we need to find which section each sentence should be attributed to. For most of these papers, section headers are one-word sentences. We look for common section headers and save the sentence number at which each section starts in each document.


In [None]:
# document id -> {section key -> sentence number of that section's header}
headingsDict = defaultdict(dict)

# Upper-cased one-word header -> section key it marks. BACKGROUND is treated
# as another name for the introduction.
sectionByHeading = {
    'INTRODUCTION': "introduction",
    'BACKGROUND': "introduction",
    'METHODS': "methods",
    'RESULTS': "results",
    'DISCUSSION': "discussion",
    'CONCLUSION': "conclusion",
    'REFERENCES': "references",
}

for sent in sentObjList:
    # Section headers are looked for among one-word sentences only.
    if len(sent.wordList) != 1:
        continue
    # Use the str method rather than string.upper(), which only exists in
    # Python 2's string module.
    word = sent.wordList[0].upper().strip()
    section = sectionByHeading.get(word)
    if section is not None:
        # A later header of the same kind overwrites an earlier one.
        headingsDict[sent.document][section] = sent.sentenceNumber


headingsDict.keys()


Now the sentences need to be tagged to their appropriate section and concatenated into one string per section per document.

Each sentence is assigned to the section whose header is closest before it, i.e. the header with the largest sentence number that is still less than the sentence's own number. For example, if the introduction header had sentence number 5 and the methods header had sentence number 25, sentence number 20 would be assigned to introduction.

This is done for each sentence in each document, and the sentences are joined by spaces into one string per section per document. Finally, only the documents that contain an introduction, discussion, and conclusion are kept and put into the validDocsDict dictionary.

In [None]:
# document id -> list of its Sentence objects
documentDict = defaultdict(list)
# document id -> section label -> list of Sentence objects in that section
docPartsDict = defaultdict(lambda : defaultdict(list))
# document id -> section label -> section text as one string
docPartsCombinedDict = defaultdict(dict)

# Section keys in the order they are considered; when two headers share a
# sentence number, the earlier key in this list wins.
sectionOrder = ["introduction", "methods", "results", "discussion", "conclusion", "references"]
# A sentence whose whole text is one of these is a header and is left out
# of the section text. "background" is included because a BACKGROUND header
# is recorded as the start of the introduction.
headingTexts = set(sectionOrder + ["background"])

for item in sentObjList:
    documentDict[item.document].append(item)

for document in documentDict.keys():
    docSentList = documentDict[document]
    docHeadings = headingsDict[document]
    # (section key, header sentence number) for each header found in this
    # document; sections without a header are simply absent.
    headingPositions = [
        (name, int(docHeadings[name]))
        for name in sectionOrder
        if name in docHeadings
    ]

    for sent in docSentList:
        sentNumber = int(sent.sentenceNumber)
        label = "noSection"
        closest = None
        # Pick the header with the largest sentence number that is still
        # before this sentence. Starting from None (rather than a distance
        # measured from sentence 0) lets a header at sentence 0 be chosen.
        for name, position in headingPositions:
            if position < sentNumber and (closest is None or position > closest):
                label = name
                closest = position
        if sent.sentence.strip().lower() not in headingTexts:
            docPartsDict[document][label].append(sent)

    for x in docPartsDict[document].keys():
        # sentenceNumber is stored as text, so compare it as an integer;
        # sorting the raw strings would put "10" before "2".
        ordered = sorted(docPartsDict[document][x], key=lambda z: int(z.sentenceNumber))
        docPartsCombinedDict[document][x] = " ".join(y.sentence for y in ordered)

# Keep only the documents that have all three of these sections.
requiredSections = ('introduction', 'discussion', 'conclusion')
validDocsDict = defaultdict(dict)

for doc, parts in docPartsCombinedDict.items():
    if all(section in parts for section in requiredSections):
        validDocsDict[doc] = parts

# Document counts before and after filtering.
print(str(len(docPartsCombinedDict)))
print(str(len(validDocsDict)))


Take the valid documents in validDocsDict and write them to a pickle file. Each key has the form part_docid, where part is the section name (introduction, methods, etc.) and docid identifies the document, allowing for document tracking.


In [None]:
# Flatten the per-document sections into one dict keyed "<part>_<docid>".
outputDict = dict()
for doc, parts in validDocsDict.items():
    for part, text in parts.items():
        outputDict[part + "_" + doc] = text

# The with-block makes sure the pickle is flushed and the file is closed.
with open("TestDocsPub27.p", "wb") as outFile:
    pickle.dump(outputDict, outFile)