# pyLDAvis

In [7]:
import os
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import matplotlib
import gensim
from gensim import corpora
import pyLDAvis.gensim
import sys
# Python 2 only: site.py deletes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back, then the interpreter-wide default codec
# used for implicit str <-> unicode conversion is switched to UTF-8.
# NOTE(review): `reload` is not a builtin on Python 3 and
# sys.setdefaultencoding does not exist there - this cell must be dropped
# when the notebook is ported.
reload(sys)
sys.setdefaultencoding('utf-8')




In [8]:
def read_files_to_dict(dir_path):
    """
    Read the first line of every ``.txt`` file found directly in a directory.

    Only the first line of each file is loaded (including its trailing
    newline, if present); an empty file yields an empty string instead of
    raising. Bytes that are not valid UTF-8 are replaced with U+FFFD.

    :param dir_path: Path of the directory to scan (sub-directories are not visited)
    :return: Dict mapping each file name, minus its ``.txt`` extension, to the
             decoded text of that file's first line
    """
    conversations = {}
    for filename in os.listdir(dir_path):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.abspath(os.path.join(dir_path, filename))
        # Binary read + explicit decode works identically on Python 2 and 3
        # and avoids the Python-2-only `unicode` builtin.
        with open(file_path, 'rb') as f:
            # readline() returns an empty byte string for an empty file,
            # where indexing readlines()[0] would raise IndexError.
            first_line = f.readline()

        # Strip only the final extension so that names containing extra dots
        # ("ep.1.txt", "ep.2.txt") keep distinct keys instead of colliding.
        key = os.path.splitext(filename)[0]
        conversations[key] = first_line.decode('utf-8', 'replace')
    return conversations


In [2]:
def preprocess(doc, stopwords, punctuation, lemma):
    """
    Normalize one document: lowercase it, drop stop words, delete
    punctuation characters and lemmatize the remaining tokens.

    :param doc: Text of the document
    :param stopwords: Collection of tokens to discard; tokens are compared
                      after lowercasing but before punctuation is removed
    :param punctuation: Collection of single characters to delete
    :param lemma: Object exposing a ``lemmatize(word)`` method
    :return: The cleaned tokens joined by single spaces
    """
    kept_tokens = []
    for token in doc.lower().split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    without_stopwords = " ".join(kept_tokens)

    # Punctuation is removed character by character, so a token made only
    # of punctuation disappears at the re-split below.
    kept_chars = [char for char in without_stopwords if char not in punctuation]
    without_punctuation = "".join(kept_chars)

    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

In [3]:
def get_doc_term_matrix(doc_clean):
    """
    Build a gensim dictionary and the matching bag-of-words corpus.

    :param doc_clean: Tokenized documents (one sequence of tokens per
                      document); it is iterated twice
    :return: Tuple ``(dictionary, doc_term_matrix)`` where ``doc_term_matrix``
             holds one ``doc2bow`` result per input document, in input order
    """
    # Vocabulary: every unique term of the corpus gets an integer id.
    vocabulary = corpora.Dictionary(doc_clean)

    # Encode each document against that vocabulary.
    bow_corpus = []
    for tokens in doc_clean:
        bow_corpus.append(vocabulary.doc2bow(tokens))

    return vocabulary, bow_corpus

In [4]:
def get_lda_topics(num_topics, passes, dictionary, doc_term_matrix, n_top_terms):
    """
    Train a gensim LDA model and list the top terms of every topic.

    :param num_topics: Number of topics to fit
    :param passes: Number of training passes, forwarded to ``LdaModel``
    :param dictionary: Id-to-word mapping, passed as ``id2word`` and used to
                       translate term ids back to words
    :param doc_term_matrix: Bag-of-words corpus to train on
    :param n_top_terms: How many terms to request per topic
    :return: Tuple ``(ldamodel, topic_words)``; ``topic_words[i]`` is the list
             of words returned by ``get_topic_terms`` for topic ``i``
    """
    # Fit the model on the document-term matrix.
    ldamodel = gensim.models.ldamodel.LdaModel(
        doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
    )

    def words_for(topic_id):
        # get_topic_terms yields (term id, weight) pairs; keep only the words.
        term_pairs = ldamodel.get_topic_terms(topic_id, n_top_terms)
        return [dictionary[term_id] for term_id, _weight in term_pairs]

    topic_words = [words_for(topic_id) for topic_id in range(num_topics)]

    return ldamodel, topic_words


In [8]:
# Cleaning resources: a WordNet lemmatizer, the ASCII punctuation
# characters and the English stop-word list (both as sets for fast lookup).
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))

# Load the conversations from the corpus directory.
dir_path = r"./Spongebob NLP"
conversations = read_files_to_dict(dir_path)

# Receives one token list per conversation in the next cell.
normalized_conversations = []

In [9]:
# Clean and tokenize every conversation. The list is rebuilt from scratch so
# that re-running this cell does not append duplicate documents.
normalized_conversations = [
    preprocess(doc=doc, stopwords=stop, punctuation=exclude, lemma=lemma).split()
    for doc in conversations.values()
]

# Build the dictionary and document-term matrix once, after all documents are
# cleaned. Doing this inside the loop rebuilt both on every iteration and left
# them undefined when there were no conversations at all.
dictionary, doc_term_matrix = get_doc_term_matrix(doc_clean=normalized_conversations)

In [11]:
# Fit a 3-topic LDA model with 50 passes and keep the 20 top terms per topic.
ldamodel, topics = get_lda_topics(
    num_topics=3,
    passes=50,
    dictionary=dictionary,
    doc_term_matrix=doc_term_matrix,
    n_top_terms=20,
)

In [12]:
# Summarize each of the 3 topics by its top three weighted words; the
# returned list of (topic id, formatted string) pairs is echoed as cell output.
ldamodel.print_topics(num_topics=3, num_words=3)

[(0, u'0.013*"im" + 0.010*"get" + 0.008*"spongebob"'),
 (1, u'0.002*"class" + 0.001*"school" + 0.001*"boating"'),
 (2, u'0.025*"spongebob" + 0.010*"patrick" + 0.009*"krabs"')]

In [13]:
# Turn on pyLDAvis' notebook integration so prepared visualizations can be
# rendered inline in the notebook.
pyLDAvis.enable_notebook()

In [14]:
# Convert the fitted model, its bag-of-words corpus and the dictionary into
# the data object that pyLDAvis renders.
vis_data = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

In [15]:
# Render the visualization inline in the notebook. pyLDAvis.show() instead
# starts a local web server and blocks the kernel until it is interrupted,
# which defeats the enable_notebook() call made earlier.
pyLDAvis.display(vis_data)