# topic model

In [1]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the list of documents
with open('newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Use CountVectorizor to find three letter tokens, remove stop_words, 
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english', 
                       token_pattern='(?u)\\b\\w\\w\\w+\\b')
# Fit and transform
X = vect.fit_transform(newsgroup_data)

# Convert sparse matrix to gensim corpus.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Mapping from word IDs to words (To be used in LdaModel's id2word parameter)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())

Using TensorFlow backend.


In [2]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus,num_topics=10,id2word=id_map,random_state=34,passes=25)

In [3]:
def lad_topics():
    return ldamodel.print_topics(num_topics=10,num_words=10)

In [4]:
lad_topics()

[(0,
  '0.056*"edu" + 0.043*"com" + 0.033*"thanks" + 0.022*"mail" + 0.021*"know" + 0.020*"does" + 0.014*"info" + 0.012*"monitor" + 0.010*"looking" + 0.010*"don"'),
 (1,
  '0.024*"ground" + 0.018*"current" + 0.018*"just" + 0.013*"want" + 0.013*"use" + 0.011*"using" + 0.011*"used" + 0.010*"power" + 0.010*"speed" + 0.010*"output"'),
 (2,
  '0.061*"drive" + 0.042*"disk" + 0.033*"scsi" + 0.030*"drives" + 0.028*"hard" + 0.028*"controller" + 0.027*"card" + 0.020*"rom" + 0.018*"floppy" + 0.017*"bus"'),
 (3,
  '0.023*"time" + 0.015*"atheism" + 0.014*"list" + 0.013*"left" + 0.012*"alt" + 0.012*"faq" + 0.012*"probably" + 0.011*"know" + 0.011*"send" + 0.010*"months"'),
 (4,
  '0.025*"car" + 0.016*"just" + 0.014*"don" + 0.014*"bike" + 0.012*"good" + 0.011*"new" + 0.011*"think" + 0.010*"year" + 0.010*"cars" + 0.010*"time"'),
 (5,
  '0.030*"game" + 0.027*"team" + 0.023*"year" + 0.017*"games" + 0.016*"play" + 0.012*"season" + 0.012*"players" + 0.012*"win" + 0.011*"hockey" + 0.011*"good"'),
 (6,
  '0.0

In [5]:
new_doc = ["\n\nIt's my understanding that the freezing will start to occur because \
of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
Krumins\n-- "]

In [6]:


def topic_distribution():
    
    # Your Code Here
    from gensim import corpora, models
    vect = CountVectorizer(stop_words='english')
    new_X = vect.fit_transform(new_doc)
    new_corpus = gensim.matutils.Sparse2Corpus(new_X, documents_columns=False)
    new_ldamodel = gensim.models.ldamodel.LdaModel(new_corpus,num_topics=10,id2word=id_map,random_state=34,passes=25)

    dictionary = corpora.Dictionary(vect.vocabulary_.items())
    bow = dictionary.doc2bow(new_doc[0].split())
    
    return new_ldamodel.get_document_topics(bow)



In [7]:
topic_distribution()

[(0, 0.010001302256864946),
 (1, 0.010001282492785707),
 (2, 0.010001284113143083),
 (3, 0.010001277543918679),
 (4, 0.010001346196707643),
 (5, 0.010001323314196269),
 (6, 0.90998804975905201),
 (7, 0.010001379816974685),
 (8, 0.010001428077691074),
 (9, 0.010001326428665924)]

In [8]:
def topic_names():
    
    # Your Code Here
    
    return ldamodel.print_topics()

In [None]:
topic_names()