In [2]:
from sklearn import pipeline, preprocessing
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn import ensemble
from sklearn import model_selection
from sklearn.cluster import KMeans

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import math
import numpy as np
import pandas as pd
import csv
import os

In [3]:
def directory_list_generator(prime_directory):
    """Returns a list of all non-hidden directories
    based on the path directory given, it will, return
    only directories within the folder specified"""
    directories=os.listdir(prime_directory)
    dir_list = [x for x in directories if '.' not in x]
    return dir_list

In [4]:
def text_file_tabulator(dir_list):
    """Goes through all directories given as argument list
    then picks up each text file and extracts text dumping it
    to a column"""
    paper_content = dict()
    for txtDir in dir_list:
        txtDir = prime_directory + txtDir
        for txtfile in os.listdir(txtDir): #iterate through text files in directory
            if txtfile[-3:] == 'txt':
                document_path = txtDir + '/' + txtfile
                with open(document_path) as fhand:
                    content = fhand.read()
                    paper_content[txtfile] = [content]
    return paper_content              

In [5]:
def topic_importance_pipeline(compiled_documents):
    """Pipeline to convert list of documents with text
    content from papers into a sparse matrix using count
    vectoriser."""
    
    # Create numpy array of text data from input dictionary
    text_data = []
    text_data.append([v for k,v in compiled_documents.items()])
    text_data = np.array(text_data[0])
    
    steps = [
        ('vectorizer', CountVectorizer())
            ]
    reg = pipeline.Pipeline(steps)
    ng_train_vecs = reg.fit_transform(text_data[:,0])
    df = pd.DataFrame(ng_train_vecs.todense(), columns=[cv.get_feature_names()])
    #id2word = dict((v, k) for k, v in ng_train_vecs.vocabulary_.items())
    return df, ng_train_vecs.transpose()

In [6]:
def word_counter(compiled_documents):
    """Pipeline to convert list of documents with text
    content from papers into a sparse matrix using count
    vectoriser."""
    # Create numpy array of text data from input dictionary
    text_data = []
    text_data.append([v for k,v in compiled_documents.items()])
    text_data = np.array(text_data[0])
    # Create list of stop words
    my_stop_words = text.ENGLISH_STOP_WORDS.union(['cid'])
    # Create a CountVectorizer for parsing/counting words
    count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=my_stop_words, token_pattern="\\b[a-z][a-z]+\\b")
    count_vectorizer.fit(text_data[:,0])
    counts = count_vectorizer.transform(text_data[:,0]).transpose()
    id2word = dict([(v, k) for k, v in count_vectorizer.vocabulary_.items()])
    return counts, id2word,count_vectorizer,text_data

In [7]:
# Directory to parse
prime_directory = 'txt/'

In [8]:
dir_list=directory_list_generator(prime_directory)

In [9]:
compiled_documents = text_file_tabulator(dir_list)

In [10]:
counts,id2word,cv,text_data = word_counter(compiled_documents)

In [11]:
corpus = matutils.Sparse2Corpus(counts)

In [12]:
lda = models.LdaModel(corpus=corpus, num_topics=6, id2word=id2word, passes=10);

2017-08-16 21:39:34,487 : INFO : using symmetric alpha at 0.16666666666666666
2017-08-16 21:39:34,489 : INFO : using symmetric eta at 0.000200762899016
2017-08-16 21:39:34,491 : INFO : using serial LDA version on this node
2017-08-16 21:39:34,664 : INFO : running online (multi-pass) LDA training, 6 topics, 10 passes over the supplied corpus of 3 documents, updating model once every 3 documents, evaluating perplexity every 3 documents, iterating 50x with a convergence threshold of 0.001000
2017-08-16 21:39:35,511 : INFO : -9.774 per-word bound, 875.3 perplexity estimate based on a held-out corpus of 3 documents with 30345 words
2017-08-16 21:39:35,512 : INFO : PROGRESS: pass 0, at document #3/3
2017-08-16 21:39:35,606 : INFO : topic #0 (0.167): 0.011*"neural" + 0.009*"quantum" + 0.008*"dynamics" + 0.008*"network" + 0.007*"state" + 0.007*"et" + 0.006*"recurrence" + 0.004*"input" + 0.004*"neuron" + 0.004*"mean"
2017-08-16 21:39:35,607 : INFO : topic #3 (0.167): 0.015*"neural" + 0.009*"net

In [13]:
lda.print_topics(num_words=20)

2017-08-16 21:39:41,771 : INFO : topic #0 (0.167): 0.002*"neural" + 0.002*"quantum" + 0.001*"dynamics" + 0.001*"network" + 0.001*"et" + 0.001*"state" + 0.001*"recurrence" + 0.001*"input" + 0.001*"neuron" + 0.001*"mean" + 0.001*"lines" + 0.001*"neurons" + 0.001*"layer" + 0.001*"backpropagation" + 0.001*"structure" + 0.001*"quasiperiodic" + 0.001*"energy" + 0.001*"problem" + 0.001*"dynamical" + 0.001*"environment"
2017-08-16 21:39:41,772 : INFO : topic #1 (0.167): 0.001*"neural" + 0.001*"network" + 0.001*"quantum" + 0.001*"dynamics" + 0.001*"recurrence" + 0.000*"input" + 0.000*"state" + 0.000*"neurons" + 0.000*"environment" + 0.000*"et" + 0.000*"problem" + 0.000*"energy" + 0.000*"diagonal" + 0.000*"backpropagation" + 0.000*"fk" + 0.000*"given" + 0.000*"case" + 0.000*"mean" + 0.000*"learning" + 0.000*"layer"
2017-08-16 21:39:41,774 : INFO : topic #2 (0.167): 0.014*"neural" + 0.010*"quantum" + 0.010*"network" + 0.008*"dynamics" + 0.007*"state" + 0.007*"recurrence" + 0.006*"et" + 0.005*"inp

[(0,
  '0.002*"neural" + 0.002*"quantum" + 0.001*"dynamics" + 0.001*"network" + 0.001*"et" + 0.001*"state" + 0.001*"recurrence" + 0.001*"input" + 0.001*"neuron" + 0.001*"mean" + 0.001*"lines" + 0.001*"neurons" + 0.001*"layer" + 0.001*"backpropagation" + 0.001*"structure" + 0.001*"quasiperiodic" + 0.001*"energy" + 0.001*"problem" + 0.001*"dynamical" + 0.001*"environment"'),
 (1,
  '0.001*"neural" + 0.001*"network" + 0.001*"quantum" + 0.001*"dynamics" + 0.001*"recurrence" + 0.000*"input" + 0.000*"state" + 0.000*"neurons" + 0.000*"environment" + 0.000*"et" + 0.000*"problem" + 0.000*"energy" + 0.000*"diagonal" + 0.000*"backpropagation" + 0.000*"fk" + 0.000*"given" + 0.000*"case" + 0.000*"mean" + 0.000*"learning" + 0.000*"layer"'),
 (2,
  '0.014*"neural" + 0.010*"quantum" + 0.010*"network" + 0.008*"dynamics" + 0.007*"state" + 0.007*"recurrence" + 0.006*"et" + 0.005*"input" + 0.004*"neurons" + 0.004*"learning" + 0.004*"mean" + 0.004*"neuron" + 0.004*"environment" + 0.004*"energy" + 0.003*"li

In [14]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [None]:
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)

In [15]:
import pyLDAvis
from gensim.corpora import Dictionary, MmCorpus

In [16]:
pyLDAvis.enable_notebook()

In [17]:
dictionary = Dictionary(text_data)

2017-08-16 21:39:59,227 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-08-16 21:39:59,230 : INFO : built Dictionary(1 unique tokens: ['6\n1\n0\n2\n\n \n\np\ne\nS\n2\n2\n\n \n\n \n \n]\nE\nN\n.\ns\nc\n[\n \n \n\n1\nv\n5\n3\n9\n6\n0\n\n.\n\n9\n0\n6\n1\n:\nv\ni\nX\nr\na\n\nQuantum Neural Machine Learning -\n\nBackpropagation and Dynamics\n\nCarlos Pedro Gonçalves\n\nSeptember 23, 2016\n\nUniversity of Lisbon, Institute of Social and Political Sciences,\n\ncgoncalves@iscsp.ulisboa.pt\n\nAbstract\n\nThe current work addresses quantum machine learning in the con-\ntext of Quantum Artiﬁcial Neural Networks such that the networks’\nprocessing is divided in two stages: the learning stage, where the net-\nwork converges to a speciﬁc quantum circuit, and the backpropaga-\ntion stage where the network eﬀectively works as a self-programing\nquantum computing system that selects the quantum circuits to solve\ncomputing problems. The results are extended to general architectures\

In [45]:
lda.get_topic_terms(0)

[(2743, 0.0018074814255934944),
 (3474, 0.001543951850816323),
 (1175, 0.0014621555436023581),
 (2634, 0.0013483892513330596),
 (1476, 0.0011554152087687971),
 (4259, 0.001143955640522503),
 (3630, 0.0010131970345681615),
 (2060, 0.00081790473672190066),
 (2776, 0.00075398677076950898),
 (2509, 0.0006913783044516247)]

In [None]:
import pyLDAvis.gensim

In [None]:
matutils.ismatrix(corpus)

In [None]:
dictionary

In [None]:
yy = pyLDAvis.gensim.prepare(lda,corpus,dictionary)

In [None]:
doc_length = []
doc_length = np.array[len(each[0]) for each in text_data])

In [None]:
lda.get_document_topics(corpus)

In [None]:
len(corpus)

In [None]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

In [None]:
lda_docs

In [None]:
for doc in lda_corpus:
    print(doc)

In [None]:
hh = lda.get_document_topics(corpus)

In [None]:
for doc in hh:
    print (doc)

In [1]:
lda.

NameError: name 'lda' is not defined