In [1]:
from __future__ import print_function
from time import time
import Utils as Utils
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from sklearn.model_selection import train_test_split


import pickle 
import Configuration

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

def print_top_words(model, feature_names, n_top_words):
    '''Print the highest-weighted words of each topic, then a blank line.

    Every row of ``model.components_`` is treated as one topic. For each
    topic a ``Topic #<index>:`` header is printed, followed by one line
    holding the selected words separated by single spaces, ordered from the
    largest weight to the smallest.

    :param model: Object exposing ``components_``, an iterable of per-topic
        weight vectors that support ``argsort()``
    :param feature_names: Indexable collection mapping a column index to
        its word
    :param n_top_words: How many words to show per topic
    '''
    for idx, weights in enumerate(model.components_):
        print("Topic #%d:" % idx)
        # argsort() is ascending: flip it, then keep the leading entries.
        best_columns = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[col] for col in best_columns]
        print(" ".join(words))
    print()


def run_LDA(corpus,num_topics=10,maximum_iterations=10, learn_offfset=10.0):
    '''Vectorise a corpus into term counts and fit an online LDA model on it.

    Progress and timings are printed along the way, and the top 10 words of
    every learned topic are printed once training has finished.

    :param corpus: Iterable of raw text documents, passed to
        ``CountVectorizer.fit_transform``
    :param num_topics: Number of topics the LDA model should learn
    :param maximum_iterations: Forwarded to the model as ``max_iter``
    :param learn_offfset: Forwarded to the model as ``learning_offset``
        (the misspelt name is kept so existing keyword callers keep working)
    :return: Tuple ``(lda, tf, tf_vectorizer)``: the fitted model, the
        document-term matrix it was trained on, and the fitted vectorizer
    '''
    t0 = time()

    # 1. Build the document-term count matrix
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english',
                                    analyzer='word')

    print("Fitting LDA models with tf features")
    tf = tf_vectorizer.fit_transform(corpus)
    print("     done in %0.3fs." % (time() - t0))

    # 2. Define the model
    lda_options = dict(learning_method='online',
                       max_iter=maximum_iterations,
                       learning_offset=learn_offfset,
                       random_state=0)
    try:
        # `n_topics` was renamed to `n_components` (deprecated in
        # scikit-learn 0.19, removed in 0.21).
        lda = LatentDirichletAllocation(n_components=num_topics, **lda_options)
    except TypeError:
        # Older scikit-learn releases only know the original keyword.
        lda = LatentDirichletAllocation(n_topics=num_topics, **lda_options)

    # 3. Train the model
    t0 = time()
    print("training the model")
    lda.fit(tf)
    print("   done in %0.3fs." % (time() - t0))

    # 4. Print LDA model
    print("\nTopics in LDA model:")
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        # get_feature_names() was removed in scikit-learn 1.2.
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    print_top_words(lda, tf_feature_names, n_top_words=10)

    return lda, tf,tf_vectorizer

if __name__ == "__main__":
    # Script entry point: load the dataset, then train (or reload) an LDA
    # model and hand it to pyLDAvis for visualisation.

    params = Configuration.Parameters()
    file_locations=params.file_locations
    param = params.param
  
    clean_string = True
    remove_stop_words=False
    retrain=True  # False reloads a previously pickled model instead of training

    dataset="Security_data"
    n_topics=10

    # Backslashes are escaped explicitly: "\c" and "\s" are not valid escape
    # sequences, so the unescaped form triggers an invalid-escape warning on
    # current Python versions. The resulting path string is unchanged.
    data_folder_security=["results\\char_trigram_binary\\security_data.txt"] 
    #[params.baseline_locations["tmp"][0]]# ["results\Security_data.txt"] #[params.baseline_locations["Nulled_Binary"][1]]
    docs, vocab,_ = Utils.load(data_folder_security,clean_string=clean_string,remove_stop_words=remove_stop_words,split_for_cv=True)
    max_l= Utils.get_dataset_details(docs,vocab)
    # Only the raw text of each loaded document is fed to the topic model.
    data = [ sent["text"] for sent in docs]
    
    if retrain:
        lda, tf,tf_vectorizer=run_LDA(data,num_topics=n_topics,maximum_iterations=10)
        #Utils.save_pickle([lda, tf,tf_vectorizer],dataset+str(n_topics)+"_topics.pkl")
    else:
        # NOTE(review): presumably this unpickles the file; only load model
        # files from a trusted source.
        x=Utils.load_pickle("lda_models\\"+dataset+str(n_topics)+"_topics.pkl")
        lda, tf,tf_vectorizer=x[0],x[1],x[2]
        
    pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
  




Number of empty sentences : 0
data loaded!
         number of documents: 85796
         vocab size: 422329
         max document length: 250
         avg document length: 71.0850972073
         min document length: 1
         max char length: 32761
         avg char length: 419.945486969
         min char length: 3
Extracting tf features for LDA...
Fitting LDA models with tf features
     done in 6.231s.
training the model
   done in 774.591s.

Topics in LDA model:
Topic #0:
gt lt legends bot user login download pass amp http
Topic #1:
use script banned just game did working rat work try
Topic #2:
file download com www http virustotal hide analysis amp https
Topic #3:
spoiler added new fixed bot version clashbot features download attack
Topic #4:
com gmail hotmail yahoo hide net aol uk live fr
Topic #5:
hide com https http www download bol nulled cracked file
Topic #6:
just like use need account thanks want password accounts make
Topic #7:
hide 123456a 123456789a abc123 qwerty123 1q2w3

  chunks = self.iterencode(o, _one_shot=True)
