In [9]:
from __future__ import print_function


from time import time
import Utils as Utils
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from sklearn.model_selection import train_test_split

import pyLDAvis
import pyLDAvis.sklearn 
pyLDAvis.enable_notebook()
import pickle 

import Configuration 

# NOTE(review): this constant is not read by any code visible in this cell
# (the "n_samples=%d" log line in build_LDA_model reports the real corpus
# size instead) -- presumably a leftover; confirm before removing.
n_samples = 10000
def print_topics(lda_model, tf_vectorizer, num_words_per_topic, store_topics=False):
    """Print the highest-weighted words of every topic of a fitted LDA model.

    For each row of ``lda_model.components_`` the ``num_words_per_topic``
    words with the largest weight are printed, heaviest first, each followed
    by its share of the topic's total weight rounded to 4 decimals.

    Parameters
    ----------
    lda_model : object exposing ``components_`` (one weight vector per topic).
    tf_vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
        (newer scikit-learn) or ``get_feature_names()`` (older releases).
    num_words_per_topic : int, number of words listed per topic.
    store_topics : when truthy, the topic header and its space-separated
        keywords are also passed to ``Utils.write_to_file("topics.txt", ..., "a")``.
    """
    print("\nTopics in LDA model:")
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer the replacement and fall back for old installations.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(lda_model.components_):
        topic_no = "Topic #%d:" % topic_idx
        # Indices of the heaviest words, in descending weight order.
        top_word_ids = topic.argsort()[:-num_words_per_topic - 1:-1]
        # Computed once per topic instead of once per printed word.
        topic_total = topic.sum()

        print(topic_no)
        print(''.join([tf_feature_names[i] + ' ' + str(round(topic[i] / topic_total, 4)) + ' | '
                       for i in top_word_ids]))
        print()

        if store_topics:
            topic_kw = " ".join([tf_feature_names[i] for i in top_word_ids])
            Utils.write_to_file("topics.txt", topic_no + "\n", "a")
            Utils.write_to_file("topics.txt", topic_kw + "\n", "a")

def print_lda_parameters(num_topics=10, num_top_words=10, num_features=30000,heldout_data_size =0.33):
    """Print a short banner listing the topic-model settings.

    Reports the number of topics, the number of words shown per topic and
    the feature cap. ``heldout_data_size`` is accepted but not reported.
    """
    print("Running Topic Models...")
    # (label, value) pairs, printed in this order.
    settings = (
        ("          Number of Topics : ", num_topics),
        ("          Number of Words per Topics : ", num_top_words),
        ("          Maximum  Number of Features : ", num_features),
    )
    for label, value in settings:
        print(label + str(value))

def store_topics(lda,tf,corpus_docs):
    """Write each document to the results file of its most probable topic.

    ``lda.transform(tf)`` yields one row of topic weights per document; the
    column with the largest weight selects the file
    ``Results/topic_<index>_documents.txt``, to which the matching entry of
    ``corpus_docs`` is handed via ``Utils.write_to_file(..., "a")``
    (presumably append mode -- confirm against Utils).
    """
    print("Storing data to files ")
    topic_weights = lda.transform(tf)
    doc_count, _topic_count = topic_weights.shape
    for row in range(doc_count):
        # Dominant topic = column holding the largest weight in this row.
        dominant_topic = np.argmax(topic_weights[row])
        text = corpus_docs[row]
        target_path = "Results/topic_" + str(dominant_topic) + "_documents.txt"
        Utils.write_to_file(target_path, text, "a")
    print("data stored to corrensponding files")

#def visualize_topics ():
#    x = pickle.load(open("lda_model_Nulled.pkl","rb"))
#    print("LDA model loaded ")
#    lda, tf, tf_vectorizer= x[0], x[1], x[2]
#    lda.fit(tf)
#    pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
#    print("prepared...!")

############ Main method #########
def build_LDA_model(corpus_data, num_topics=10, max_n_features =30000, max_df=0.95, min_df=1, max_iter=5):
    """Vectorise a corpus into term counts and fit an online LDA model on it.

    Vocabulary filtering performed by the CountVectorizer:
      * common English stop words are removed;
      * words occurring in fewer than ``min_df`` documents are removed;
      * words occurring in more than ``max_df`` (a fraction, e.g. 0.95 = 95%)
        of the documents are removed;
      * at most ``max_n_features`` words are kept.

    Parameters
    ----------
    corpus_data : iterable of raw document strings.
    num_topics : number of topics to learn.
    max_n_features : upper bound on the vocabulary size.
    max_df, min_df : document-frequency cut-offs described above.
    max_iter : number of passes of the online LDA trainer.

    Returns
    -------
    (lda, tf, tf_vectorizer) : the fitted LatentDirichletAllocation, the
    document-term count matrix and the fitted CountVectorizer.

    Visualisation (pyLDAvis) is left to the caller: preparing it here and
    throwing the result away only cost time.
    """
    # 1. Feature extraction from raw data. LDA uses tf (raw term count) features.
    t0 = time()
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    max_features=max_n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(corpus_data)
    print("     done in %0.3fs." % (time() - t0))

    # 2. Build the model.
    lda_options = dict(max_iter=max_iter,
                       learning_method='online',
                       random_state=0)
    try:
        # `n_components` is the current name of the topic-count argument.
        lda = LatentDirichletAllocation(n_components=num_topics, **lda_options)
    except TypeError:
        # Very old scikit-learn releases only know the former name `n_topics`.
        lda = LatentDirichletAllocation(n_topics=num_topics, **lda_options)

    # 3. Train the model.
    # Report the real matrix shape rather than the requested feature cap.
    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % tf.shape)
    t0 = time()
    print("training the model")
    lda.fit(tf)
    print("   done in %0.3fs." % (time() - t0))
    return lda, tf, tf_vectorizer


def run_LDA(corpus_data, evaluate=False, evaluation_data=None, num_topics=10, n_top_words=10, max_n_features =30000, max_df=0.95, min_df=1, max_iter=5,
            show_topics=True, store=True):
    """Fit an LDA topic model on ``corpus_data`` and pickle the result.

    Prints the run settings, builds the model with ``build_LDA_model``, runs
    the pyLDAvis preparation step and dumps ``[lda, tf, tf_vectorizer]`` to
    ``lda_model_Nulled.pkl`` in the working directory.

    Parameters
    ----------
    corpus_data : iterable of raw document strings.
    num_topics, max_n_features, max_df, min_df, max_iter : forwarded to
        ``build_LDA_model``.
    n_top_words : only echoed in the settings banner.
    evaluate, evaluation_data, show_topics, store : accepted for interface
        compatibility; the current implementation does not act on them.

    Returns
    -------
    (lda, tf, tf_vectorizer) as returned by ``build_LDA_model``.
    """
    # A fresh list per call instead of one list shared through the default.
    if evaluation_data is None:
        evaluation_data = []

    print_lda_parameters(num_topics, n_top_words, max_n_features)

    # 1. Preprocessing the data

    # 2. Build the Model
    lda, tf, tf_vectorizer = build_LDA_model(corpus_data,
                                             num_topics, max_n_features,
                                             max_df, min_df, max_iter)

    # NOTE(review): the prepared visualisation object is not kept or returned;
    # the call is retained so that any error it raises still surfaces here.
    pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

    # Close the file deterministically instead of leaking the handle.
    with open("lda_model_Nulled.pkl", "wb") as model_file:
        pickle.dump([lda, tf, tf_vectorizer], model_file)
    return lda, tf, tf_vectorizer

# Project configuration object and the settings read from it.
params = Configuration.Parameters()
file_locations = params.file_locations
param = params.param

# Corpus location. Alternatives tried earlier: ["results\Security_data.txt"],
# [params.baseline_locations["Nulled_Binary"][1]]
data_folder_security = params.baseline_locations["tmp"]
docs, vocab = Utils.load(
    data_folder_security,
    clean_string=True,
    remove_stop_words=False,
    split_for_cv=True,
)
max_l = Utils.get_dataset_details(docs, vocab)

# Each loaded document is a mapping with a label under "y" and its text under "text".
labels = [doc["y"] for doc in docs]
data = [doc["text"] for doc in docs]

# Last expression of the cell, so the returned tuple is shown as its output.
run_LDA(
    data,
    evaluate=False,
    evaluation_data=[],
    num_topics=30,
    n_top_words=10,
    max_n_features=50000,
    max_df=0.75,
    min_df=1,
    max_iter=5,
    show_topics=True,
    store=False,
)

Number of empty sentences : 0
data loaded!
         number of documents: 9999
         vocab size: 129284
         max document length: 250
         avg document length: 61.095209521
         min document length: 1
         max char length: 6752
         avg char length: 396.772277228
         min char length: 1
Running Topic Models...
          Number of Topics : 30
          Number of Words per Topics : 10
          Maximum  Number of Features : 50000
Extracting tf features for LDA...
Fitting LDA models with tf features, n_samples=9999 and n_features=50000...
     done in 0.829s.
training the model
   done in 53.805s.


(LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
              evaluate_every=-1, learning_decay=0.7,
              learning_method='online', learning_offset=10.0,
              max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
              n_jobs=1, n_topics=30, perp_tol=0.1, random_state=0,
              topic_word_prior=None, total_samples=1000000.0, verbose=0),
 <9999x50000 sparse matrix of type '<class 'numpy.int64'>'
 	with 224633 stored elements in Compressed Sparse Row format>,
 CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.75, max_features=50000, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))