In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cross_validation import train_test_split

import numpy as np

import pickle

validDocsDict = pickle.load(open("TestDocs.p", "rb"))
validDocsDict.update(pickle.load(open("TestDocs2.p", "rb")))
validDocsDict.update(pickle.load(open("TestDocs3.p", "rb")))
validDocsDict.update(pickle.load(open("TestDocs4.p", "rb")))
validDocsDict.update(pickle.load(open("TestDocs5.p", "rb")))
validDocsDict.update(pickle.load(open("TestDocs6.p", "rb")))
validDocsDict.update(pickle.load(open("TestDocs7.p", "rb")))

In [2]:
n_samples = len(validDocsDict.keys())
n_features = 1000
n_topics = 2
n_top_words = 30


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [3]:
print("Loading dataset...")
t0 = time()
documents = []

labels = []
concLengthTotal = 0
discLengthTotal = 0
concCount = 0
discCount = 0

for k in validDocsDict.keys():
    if k.startswith("conclusion"):
        labels.append("conclusion")
        documents.append(validDocsDict[k])
        concCount += 1
        concLengthTotal += len(validDocsDict[k].split(' '))
    elif k.startswith("discussion"):
        labels.append("discussion")
        documents.append(validDocsDict[k])
        discCount += 1
        discLengthTotal += len(validDocsDict[k].split(' '))

print(len(documents))
print(concLengthTotal * 1.0/ concCount)
print(discLengthTotal * 1.0/ discCount)

train, test, labelsTrain, labelsTest = train_test_split(documents, labels, test_size = 0.1)

Loading dataset...
53034
621.583361617
1197.39683976


In [4]:
trainSetOne = []
trainSetTwo = []

for x in range(len(train)):
    if labelsTrain[x] == "conclusion":
        trainSetOne.append(train[x])
    else:
        trainSetTwo.append(train[x])

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = TfidfVectorizer(max_df=0.95, norm = 'l1', min_df=2, max_features=n_features, stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(train)

tfSetOne = tf_vectorizer.transform(trainSetOne)
tfSetTwo = tf_vectorizer.transform(trainSetTwo)
tfTest = tf_vectorizer.transform(test)
test = tfTest
train = tf
trainSetOne = tfSetOne
trainSetTwo = tfSetTwo

print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 65.915s.




In [5]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=157526 and n_features=1000...
done in 316.473s.

Topics in LDA model:
Topic #0:
patients health study care 1016 authors manuscript risk treatment clinical data disease research use women patient hiv medical children competing history pre cancer interests analysis publication population design quality pain
Topic #1:
background expression gene cells genes cell protein results different cancer human activity studies used species model levels proteins specific present genetic method using dna data genome role number function high


In [6]:
results = lda.transform(test)
totalConTop1 = 0
totalConTop2 = 0
totalDisTop1 = 0
totalDisTop2 = 0
for x in range(len(results)):
    val1 = results[x][0]
    val2 = results[x][1]
    total = val1 + val2
    print(str(labelsTest[x]) + " " + str(val1/total) + " " + str(val2/total))
    if val1 > val2:
        if labelsTest[x] == "conclusion":
            totalConTop1 += 1
        else:
            totalDisTop1 += 1
    else:
        if labelsTest[x] == "conclusion":
            totalConTop2 += 1
        else:
            totalDisTop2 += 1

discussion 0.345879666059 0.654120333941
conclusion 0.683724930607 0.316275069393
conclusion 0.2949000117 0.7050999883
discussion 0.575322906696 0.424677093304
discussion 0.363541076149 0.636458923851
discussion 0.618966171443 0.381033828557
discussion 0.318450347091 0.681549652909
conclusion 0.285827150464 0.714172849536
discussion 0.283690746327 0.716309253673
conclusion 0.407013704826 0.592986295174
conclusion 0.521442383229 0.478557616771
conclusion 0.597550558757 0.402449441243
conclusion 0.548869409215 0.451130590785
conclusion 0.588949131918 0.411050868082
discussion 0.537922188727 0.462077811273
discussion 0.301215311111 0.698784688889
discussion 0.31860252497 0.68139747503
discussion 0.275836653651 0.724163346349
conclusion 0.561136939778 0.438863060222
conclusion 0.678292227867 0.321707772133
discussion 0.606450472471 0.393549527529
conclusion 0.401235270072 0.598764729928
discussion 0.290360637814 0.709639362186
discussion 0.304285165696 0.695714834304
conclusion 0.653294118

In [7]:
print("Total Conclusion Topic One: " + str(totalConTop1))
print("Total Conclusion Topic Two: " + str(totalConTop2))
print("Total Discussion Topic One: " + str(totalDisTop1))
print("Total Discussion Topic Two: " + str(totalDisTop2))

Total Conclusion Topic One: 1580
Total Conclusion Topic Two: 1113
Total Discussion Topic One: 1076
Total Discussion Topic Two: 1535


In [8]:
lda.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 50.0,
 'max_doc_update_iter': 100,
 'max_iter': 100,
 'mean_change_tol': 0.001,
 'n_jobs': 1,
 'n_topics': 2,
 'perp_tol': 0.1,
 'random_state': 0,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}

In [9]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(train.toarray(), labelsTrain)

classResults = classifier.predict(test.toarray())

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.925527903469


In [21]:
from sklearn.naive_bayes import BernoulliNB

classifier = BernoulliNB()

classifier.fit(train.toarray(), labelsTrain)

classResults = classifier.predict(test.toarray())

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.946644042232


In [10]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()

classifier.fit(train, labelsTrain)

classResults = classifier.predict(test)
numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.740196078431


In [11]:
ldaSet1 = LatentDirichletAllocation(n_topics=20, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
ldaSet2 = LatentDirichletAllocation(n_topics=20, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

In [12]:
ldaSet1.fit(trainSetOne)
print_top_words(ldaSet1, tf_feature_names, n_top_words)

Topic #0:
physician gps income questionnaire conserved adolescents emergency nurses physicians fig students nursing rural healthcare perceived transcriptional illness urban p53 medication workers microarray intern complications jama policy hospitals elderly mrna signaling
Topic #1:
background gene expression genes cell cells cancer protein human data genome study results genetic analysis species proteins dna suggest novel method sequence 1016 role specific different breast new molecular using
Topic #2:
physician gps income questionnaire conserved adolescents emergency nurses physicians fig students nursing rural healthcare perceived transcriptional illness urban p53 medication workers microarray intern complications jama policy hospitals elderly mrna signaling
Topic #3:
physician gps income questionnaire conserved adolescents emergency nurses physicians fig students nursing rural healthcare perceived transcriptional illness urban p53 medication workers microarray intern complications j

In [13]:
ldaSet2.fit(trainSetTwo)
print_top_words(ldaSet2, tf_feature_names, n_top_words)

Topic #0:
income gps nursing transcriptional questionnaire physician emergency conserved jama students nurses intern policy staff rural healthcare physicians mrna p53 school adolescents 1001 urban s0140 6736 education signaling promoter services economic
Topic #1:
cells expression genes gene cell protein study proteins cancer species different data dna activity figure observed results levels sequence studies binding shown tumor used et present al human genome analysis
Topic #2:
income gps nursing transcriptional questionnaire physician emergency conserved jama students nurses intern policy staff rural healthcare physicians mrna p53 school adolescents 1001 urban s0140 6736 education signaling promoter services economic
Topic #3:
income gps nursing transcriptional questionnaire physician emergency conserved jama students nurses intern policy staff rural healthcare physicians mrna p53 school adolescents 1001 urban s0140 6736 education signaling promoter services economic
Topic #4:
income 

In [14]:
results1 = ldaSet1.transform(train)
results2 = ldaSet2.transform(train)

resultsTest1 = ldaSet1.transform(test)
resultsTest2 = ldaSet2.transform(test)

In [15]:
results = np.hstack((results1, results2))
resultsTest = np.hstack((resultsTest1, resultsTest2))

In [16]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.547134238311


In [17]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.648944193062


In [18]:
for x in range(len(results)):
    total = 0
    for y in range(len(results[x])):
        total += results[x][y]
    for y in range(len(results[x])):
        results[x][y] = results[x][y]/total
        
for x in range(len(resultsTest)):
    total = 0
    for y in range(len(resultsTest[x])):
        total += resultsTest[x][y]
    for y in range(len(resultsTest[x])):
        resultsTest[x][y] = resultsTest[x][y]/total
        


In [19]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.547134238311


In [20]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.648755656109
