In [1]:
from sklearn import pipeline, preprocessing
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn import ensemble
from sklearn import model_selection
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

from scipy import sparse

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pyLDAvis
import pyLDAvis.sklearn
from gensim import matutils

import math
import numpy as np
import pandas as pd
import csv
import os

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
2017-08-21 15:42:59,615 : INFO : 'pattern' package not found; tag filters are not available for English


In [2]:
def directory_list_generator(prime_directory):
    """Returns a list of all non-hidden directories
    based on the path directory given, it will, return
    only directories within the folder specified"""
    directories=os.listdir(prime_directory)
    dir_list = [x for x in directories if '.' not in x]
    return dir_list

In [3]:
def text_file_tabulator(dir_list):
    """Goes through all directories given as argument list
    then picks up each text file and extracts text dumping it
    to a column"""
    paper_content = dict()
    for txtDir in dir_list:
        txtDir = prime_directory + txtDir
        for txtfile in os.listdir(txtDir): #iterate through text files in directory
            if txtfile[-3:] == 'txt':
                document_path = txtDir + '/' + txtfile
                with open(document_path) as fhand:
                    content = fhand.read()
                    paper_content[txtfile] = [content]
    return paper_content              

In [4]:
def word_counter(compiled_documents):
    """Pipeline to convert list of documents with text
    content from papers into a sparse matrix using count
    vectoriser."""
    # Create numpy array of text data from input dictionary
    text_data = []
    text_data.append([v for k,v in compiled_documents.items()])
    text_data = np.array(text_data[0])
    # Create list of stop words
    my_stop_words = text.ENGLISH_STOP_WORDS.union(['cid'])
    # Create a CountVectorizer for parsing/counting words
    count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=my_stop_words, token_pattern="\\b[a-z][a-z]+\\b")
    counts = count_vectorizer.fit_transform(text_data[:,0])
    return counts, count_vectorizer,text_data

In [5]:
def word_counter_fromCSV(df):
    """Pipeline to convert list of documents with text
    content from papers into a sparse matrix using count
    vectoriser."""
    # Create numpy array from inputted dataframe
    text_data = df.values
    # Create list of stop words
    my_stop_words = text.ENGLISH_STOP_WORDS.union(['cid','et','et al','al', 'yes', 'method',
                                                   'results','citation','use','used','submitted','published'])
    # Create a CountVectorizer for parsing/counting words
    count_vectorizer = CountVectorizer(ngram_range=(2,3), 
                                       stop_words=my_stop_words, 
                                       token_pattern="\\b[a-z][a-z]{2,15}\\b",
                                       min_df=5,max_df=30)
    counts = count_vectorizer.fit_transform(text_data[:,0])
    return counts, count_vectorizer,text_data

In [6]:
def tfidf_vectorizer_fromCSV(df):
    """Uses tfidf algorithm to return word frequency by
    document."""

    # Create numpy array from inputted dataframe
    text_data = df.values
    # Create list of stop words
    my_stop_words = text.ENGLISH_STOP_WORDS.union(['cid','et','et al','al', 'yes', 'method',
                                                   'results','citation','use','used','submitted','published'])


    # Vectorize the text using TFIDF
    tfidf = TfidfVectorizer(ngram_range=(2,2), stop_words=my_stop_words, 
                            token_pattern="\\b[a-zA-Z][a-zA-Z]{2,15}\\b", 
                            min_df=5,max_df=30)
    tfidf_vecs = tfidf.fit_transform(text_data[:,0])
    return tfidf_vecs, tfidf,text_data

In [7]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [18]:
def print_top_grams_bydoc(result,model, feature_names,n_docs, n_top_words):
    for i in range(n_docs):
        doc_topic = np.argmax(result[i])
        message = "Document #%d, top topic #%d: " % (i,doc_topic)
        word_topic = model.components_[doc_topic]
        message += ", ".join([feature_names[i]
                     for i in word_topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [52]:
a={'jacob':{'text': 1212,'doc_type':'paper'}}

In [53]:
b={'jake':{'text': 2323,'doc_type':'video'}}

In [58]:
c={'Tom':{'text': 4343,'doc_type':'video'}}

In [71]:
from collections import defaultdict
x = defaultdict(dict)

TypeError: descriptor 'keys' of 'dict' object needs an argument

In [69]:
x['hw']['filetype'] =2

In [72]:
x['hw']

{'filetype': 2}

In [59]:
z = {**z,**c}

In [60]:
z

{'Tom': {'doc_type': 'video', 'text': 4343},
 'jacob': {'doc_type': 'paper', 'text': 1212},
 'jake': {'doc_type': 'video', 'text': 2323}}

In [19]:
df = pd.DataFrame.from_csv('text_df.csv')

In [51]:
df.head()

Unnamed: 0,0
0,6\n1\n0\n2\n\n \nr\na\n\n \n\nM\n3\n2\n\n \n \...
1,Chapter pre-print to appear in the Oxford Hand...
2,Transferability in Machine Learning: from Phen...
3,7\n1\n0\n2\n\n \nl\nu\nJ\n \n\n6\n1\n\n \n \n]...
4,4\n1\n0\n2\n\n \nt\nc\nO\n7\n2\n\n \n\n \n \n]...


## Count Vec with LDA

In [31]:
dtm_tf, tf_vectorizer,text_data = word_counter_fromCSV(df)

In [32]:
sp_dtm_tf = sparse.csr_matrix(dtm_tf)

In [41]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_topics=25, random_state=0)
lda_fitted = lda_tf.fit_transform(sp_dtm_tf)



In [42]:
print("\nTopics in NMF model:")
countvec_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tf, countvec_feature_names, 5)


Topics in NMF model:
Topic #0: scikit learn, hyper parameter, grid search, true true, learning patterns
Topic #1: feature set, feature sets, target function, error driven, set feature
Topic #2: sequence learning, meta learning, internal representation, representation space, universal learning
Topic #3: text categorization, adversarial examples, train test, naive bayesian, adversarial training
Topic #4: curriculum learning, meta learning, structured prediction, cost sensitive, meta data
Topic #5: evolutionary computation, learning path, domain adaptation, activation functions, noise level
Topic #6: multi label, new classes, extreme learning, new class, extreme learning machine
Topic #7: social network, social networks, confounding factors, action recognition, convolutional layer
Topic #8: scikit learn, low level, https www, metric learning, main features
Topic #9: quantum machine, quantum machine learning, phys rev, learning quantum, quantum computer
Topic #10: learning rule, hidden un

Document #0, top topic #0: scikit learn, hyper parameter
Document #1, top topic #9: quantum machine, quantum machine learning
Document #2, top topic #16: multi task, metric learning
Document #3, top topic #5: evolutionary computation, learning path
Document #4, top topic #22: rule based, based machine
Document #5, top topic #15: graphical models, exponential family
Document #6, top topic #0: scikit learn, hyper parameter
Document #7, top topic #17: empirical risk, matrix factorization
Document #8, top topic #17: empirical risk, matrix factorization
Document #9, top topic #10: learning rule, hidden units



## TFIDF with LDA

In [46]:
dtm_tf, tf_vectorizer,text_data = tfidf_vectorizer_fromCSV(df)

In [47]:
sp_dtm_tf = sparse.csr_matrix(dtm_tf)

In [48]:
lda_tf = LatentDirichletAllocation(n_topics=25, random_state=0)
lda_fitted = lda_tf.fit_transform(sp_dtm_tf)



In [49]:
print("\nTopics in NMF model:")
countvec_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tf, countvec_feature_names, 5)


Topics in NMF model:
Topic #0: knowledge base, epoch epoch, random samples, shalev shwartz, new classes
Topic #1: dictionary learning, sparse coding, multi label, tree structured, scikit learn
Topic #2: contextual bandit, learning rule, agent environment, natural images, perceptron learning
Topic #3: mutual information, feature sets, learning rates, learning extract, feature set
Topic #4: contextual features, context sensitive, primary features, curriculum learning, contextual information
Topic #5: phys rev, quantum machine, rev lett, quantum learning, quantum algorithm
Topic #6: learning patterns, feature map, marginal likelihood, feature maps, visual tracking
Topic #7: tensor factorization, amino acid, feature engineering, feature matrix, parameter server
Topic #8: feedback signals, boltzmann machine, hidden units, ensemble learning, target concept
Topic #9: materials science, mit edu, students learning, hidden neurons, naive bayesian
Topic #10: liu wang, risk minimization, sequence

In [50]:
print_top_grams_bydoc(lda_fitted,lda_tf, countvec_feature_names, 10,2)

Document #0, top topic #22: message passing, design matrix
Document #1, top topic #15: confusion matrix, grid search
Document #2, top topic #17: train test, hyper parameter
Document #3, top topic #24: social network, given query
Document #4, top topic #22: message passing, design matrix
Document #5, top topic #5: phys rev, quantum machine
Document #6, top topic #5: phys rev, quantum machine
Document #7, top topic #3: mutual information, feature sets
Document #8, top topic #4: contextual features, context sensitive
Document #9, top topic #15: confusion matrix, grid search



## pyLDAvis

In [31]:
pyLDAvis.enable_notebook()

In [32]:
LDA_plot = pyLDAvis.sklearn.prepare(lda_tf, sp_dtm_tf, tf_vectorizer)

In [33]:
pyLDAvis.save_html(LDA_plot,'lda_features_anlysis_20features.html')

In [40]:
lda_fitted.

<bound method BaseEstimator.get_params of LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=25, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)>