In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cross_validation import train_test_split

import numpy as np

import pickle

validDocsDict = pickle.load(open("TestDocs.p", "rb"))

In [2]:
n_samples = len(validDocsDict.keys())
n_features = 10000
n_topics = 2
n_top_words = 30


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [3]:
print("Loading dataset...")
t0 = time()
documents = []
labels = []

for k in validDocsDict.keys():
    if k.startswith("conclusion"):
        labels.append("conclusion")
        documents.append(validDocsDict[k])
    elif k.startswith("introduction"):
        labels.append("introduction")
        documents.append(validDocsDict[k])

print(len(documents))        
        
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(documents)

train, test, labelsTrain, labelsTest = train_test_split(tf, labels, test_size = 0.1)

print("done in %0.3fs." % (time() - t0))
test

Loading dataset...
6682
Extracting tf features for LDA...
done in 3.684s.




<669x10000 sparse matrix of type '<type 'numpy.int64'>'
	with 128139 stored elements in Compressed Sparse Row format>

In [4]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

t0 = time()
lda.fit(train)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=19848 and n_features=10000...
done in 171.820s.

Topics in LDA model:
Topic #0:
health study patients care 1016 data authors risk manuscript research clinical treatment disease analysis medical 10 use history patient based studies pre publication competing interests factors population women cancer quality
Topic #1:
cell gene cancer cells expression genes human protein data analysis study 10 using studies methods used results dna genetic based different associated tumor growth factor high species specific 11 breast


In [5]:
results = lda.transform(test)
totalConTop1 = 0
totalConTop2 = 0
totalDisTop1 = 0
totalDisTop2 = 0
for x in range(len(results)):
    val1 = results[x][0]
    val2 = results[x][1]
    total = val1 + val2
    print(str(labelsTest[x]) + " " + str(val1/total) + " " + str(val2/total))
    if val1 > val2:
        if labelsTest[x] == "conclusion":
            totalConTop1 += 1
        else:
            totalDisTop1 += 1
    else:
        if labelsTest[x] == "conclusion":
            totalConTop2 += 1
        else:
            totalDisTop2 += 1

conclusion 0.425296698256 0.574703301744
introduction 0.992998162435 0.00700183756485
introduction 0.698763484178 0.301236515822
conclusion 0.995039566531 0.00496043346887
conclusion 0.0678665432688 0.932133456731
introduction 0.109431350062 0.890568649938
introduction 0.992122033224 0.0078779667764
introduction 0.250278751228 0.749721248772
conclusion 0.414340295264 0.585659704736
conclusion 0.971410745881 0.0285892541186
conclusion 0.714645258281 0.285354741719
conclusion 0.886624935086 0.113375064914
introduction 0.00507546194202 0.994924538058
introduction 0.196424431113 0.803575568887
introduction 0.997179823774 0.00282017622583
conclusion 0.991367909557 0.0086320904432
conclusion 0.0117757239376 0.988224276062
conclusion 0.0198137625833 0.980186237417
introduction 0.584807724602 0.415192275398
conclusion 0.90111987536 0.0988801246404
conclusion 0.0209700626207 0.979029937379
conclusion 0.999013524042 0.000986475958101
conclusion 0.0439793773177 0.956020622682
conclusion 0.9983011

In [6]:
print("Total Conclusion Topic One: " + str(totalConTop1))
print("Total Conclusion Topic Two: " + str(totalConTop2))
print("Total Introduction Topic One: " + str(totalDisTop1))
print("Total Introduction Topic Two: " + str(totalDisTop2))

Total Conclusion Topic One: 169
Total Conclusion Topic Two: 161
Total Introduction Topic One: 157
Total Introduction Topic Two: 182


In [7]:
lda.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 50.0,
 'max_doc_update_iter': 100,
 'max_iter': 100,
 'mean_change_tol': 0.001,
 'n_jobs': 1,
 'n_topics': 2,
 'perp_tol': 0.1,
 'random_state': 0,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}

In [8]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(train.toarray(), labelsTrain)

classResults = classifier.predict(test.toarray())

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTrain[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.530642750374


In [9]:
from sklearn.svm import SVC

classifier = SVC()

classifier.fit(train, labelsTrain)

classResults = classifier.predict(test)

numRight = 0

for item in range(len(classResults)):
    if classResults[item] == labelsTrain[item]:
        numRight += 1

print(str(numRight * 1.0 / len(classResults) * 1.0))

0.478325859492
