In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cross_validation import train_test_split

import numpy as np

import pickle

validDocsDict = pickle.load(open("TestDocs.p", "rb"))

In [2]:
n_samples = len(validDocsDict.keys())
n_features = 1000
n_topics = 2
n_top_words = 30


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [3]:
print("Loading dataset...")
t0 = time()
documents = []

labels = []
concLengthTotal = 0
discLengthTotal = 0
concCount = 0
discCount = 0

for k in validDocsDict.keys():
    if k.startswith("conclusion"):
        labels.append("conclusion")
        documents.append(validDocsDict[k])
        concCount += 1
        concLengthTotal += len(validDocsDict[k].split(' '))
    elif k.startswith("discussion"):
        labels.append("discussion")
        documents.append(validDocsDict[k])
        discCount += 1
        discLengthTotal += len(validDocsDict[k].split(' '))

print(len(documents))
print(concLengthTotal * 1.0/ concCount)
print(discLengthTotal * 1.0/ discCount)

train, test, labelsTrain, labelsTest = train_test_split(documents, labels, test_size = 0.1)

Loading dataset...
6682
624.902424424
1211.72852439


In [4]:
trainSetOne = []
trainSetTwo = []

for x in range(len(train)):
    if labelsTrain[x] == "conclusion":
        trainSetOne.append(train[x])
    else:
        trainSetTwo.append(train[x])

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = TfidfVectorizer(max_df=0.95, norm = 'l1', min_df=2, max_features=n_features, stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(train)

tfSetOne = tf_vectorizer.transform(trainSetOne)
tfSetTwo = tf_vectorizer.transform(trainSetTwo)
tfTest = tf_vectorizer.transform(test)
test = tfTest
train = tf
trainSetOne = tfSetOne
trainSetTwo = tfSetTwo

print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 8.302s.




In [5]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=19848 and n_features=1000...
done in 44.803s.

Topics in LDA model:
Topic #0:
patients study health 1016 care authors manuscript risk treatment data clinical disease research women patient use medical background children history competing pre interests publication quality analysis hiv pain cancer design
Topic #1:
cells expression gene cell genes methods background protein results human different cancer used species present studies activity proteins model levels using genetic genome specific method dna function role analysis data


In [6]:
results = lda.transform(test)
totalConTop1 = 0
totalConTop2 = 0
totalDisTop1 = 0
totalDisTop2 = 0
for x in range(len(results)):
    val1 = results[x][0]
    val2 = results[x][1]
    total = val1 + val2
    print(str(labelsTest[x]) + " " + str(val1/total) + " " + str(val2/total))
    if val1 > val2:
        if labelsTest[x] == "conclusion":
            totalConTop1 += 1
        else:
            totalDisTop1 += 1
    else:
        if labelsTest[x] == "conclusion":
            totalConTop2 += 1
        else:
            totalDisTop2 += 1

discussion 0.326395942625 0.673604057375
conclusion 0.684622801005 0.315377198995
conclusion 0.65381595118 0.34618404882
discussion 0.323446222045 0.676553777955
conclusion 0.706387048019 0.293612951981
conclusion 0.294391088966 0.705608911034
discussion 0.538538762125 0.461461237875
conclusion 0.499644632783 0.500355367217
conclusion 0.601575216305 0.398424783695
discussion 0.295237269104 0.704762730896
conclusion 0.34005866136 0.65994133864
discussion 0.339397795028 0.660602204972
conclusion 0.425152582567 0.574847417433
discussion 0.301966959974 0.698033040026
discussion 0.449664540707 0.550335459293
discussion 0.627014892362 0.372985107638
discussion 0.470325621279 0.529674378721
conclusion 0.706079059118 0.293920940882
discussion 0.356195333636 0.643804666364
discussion 0.562989362244 0.437010637756
discussion 0.492675848807 0.507324151193
discussion 0.594069365292 0.405930634708
discussion 0.530475627641 0.469524372359
conclusion 0.438162559084 0.561837440916
conclusion 0.6627853

In [7]:
print("Total Conclusion Topic One: " + str(totalConTop1))
print("Total Conclusion Topic Two: " + str(totalConTop2))
print("Total Introduction Topic One: " + str(totalDisTop1))
print("Total Introduction Topic Two: " + str(totalDisTop2))

Total Conclusion Topic One: 199
Total Conclusion Topic Two: 152
Total Introduction Topic One: 127
Total Introduction Topic Two: 191


In [8]:
lda.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 50.0,
 'max_doc_update_iter': 100,
 'max_iter': 100,
 'mean_change_tol': 0.001,
 'n_jobs': 1,
 'n_topics': 2,
 'perp_tol': 0.1,
 'random_state': 0,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}

In [9]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(train.toarray(), labelsTrain)

classResults = classifier.predict(test.toarray())

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.941704035874


In [10]:
from sklearn.svm import SVC

classifier = SVC()

classifier.fit(train, labelsTrain)

classResults = classifier.predict(test)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.47533632287


In [11]:
ldaSet1 = LatentDirichletAllocation(n_topics=20, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
ldaSet2 = LatentDirichletAllocation(n_topics=20, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

In [12]:
ldaSet1.fit(trainSetOne)
print_top_words(ldaSet1, tf_feature_names, n_top_words)

Topic #0:
plant 1016 family alcohol data identified gene virus methods study affected health families manuscript formation analyses intern 1002 university patients current strong trial sequencing rna 01 impact interpretation author populations
Topic #1:
background health methods 1016 study manuscript authors data cancer patients gene analysis cell expression genes care cells competing interests research human publication results history clinical risk protein pre treatment disease
Topic #2:
observation world background group manuscript 1016 test authors study patients risk stress factors independent likely implications years male methods patient foundation 1007 use standard participated genes negative injury reference issues
Topic #3:
background functional metastasis influenza site genetic methods ability associated treatment analysis growth case induced pressure phase increased 2006 primary performed changes drug declare structure parameters skin resulting factor quality management
Top

In [13]:
ldaSet2.fit(trainSetTwo)
print_top_words(ldaSet2, tf_feature_names, n_top_words)

Topic #0:
patients populations infected study cells hiv observed activation levels associated trial muscle mass strong current isolates treatment clinical previously virus high formation pathway role influenza compared years explained contribute significantly
Topic #1:
patients study cells studies expression genes gene data cell reported results health et al treatment used care different risk high higher cancer use women patient protein associated based levels observed
Topic #2:
patients patient study cells treatment hiv infection studies diagnosis years associated al clinical likely follow et months prevalence children risk reported use negative symptoms showed group population 23 independent used
Topic #3:
essential snps genes quality performance bacterial data study context based expression provide gene markers non parameters associated snp selection risk environment test species figure factor methods conclusion analysis different et
Topic #4:
genes region patients expression treatm

In [14]:
results1 = ldaSet1.transform(train)
results2 = ldaSet2.transform(train)

resultsTest1 = ldaSet1.transform(test)
resultsTest2 = ldaSet2.transform(test)

In [15]:
results = np.hstack((results1, results2))
resultsTest = np.hstack((resultsTest1, resultsTest2))

In [16]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.774289985052


In [17]:
from sklearn.svm import SVC

classifier = SVC()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.47533632287


In [18]:
for x in range(len(results)):
    total = 0
    for y in range(len(results[x])):
        total += results[x][y]
    for y in range(len(results[x])):
        results[x][y] = results[x][y]/total
        
for x in range(len(resultsTest)):
    total = 0
    for y in range(len(resultsTest[x])):
        total += resultsTest[x][y]
    for y in range(len(resultsTest[x])):
        resultsTest[x][y] = resultsTest[x][y]/total
        


In [19]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.707025411061


In [20]:
from sklearn.svm import SVC

classifier = SVC()

classifier.fit(results, labelsTrain)

classResults = classifier.predict(resultsTest)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTest[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.47533632287
