In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pickle

In [25]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [29]:
n_samples = 2000
n_features = 1000
n_topics = 20
n_top_words = 20

In [30]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                            remove=('headers', 'footers', 'quotes'))

In [31]:
data_samples = dataset.data[:n_samples]

In [32]:
def unpickle_data(filename):
    file = open(filename, "rb")
    data = pickle.load(file)
    file.close()
    return data

def pickle_data(data, filename):
    file = open(filename, "wb")
    pickle.dump(data, file)
    file.close()

In [33]:
pickle_data(dataset, "20newsgroups.dat")

In [34]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 
                                max_features=n_features, 
                                stop_words='english')

tf = tf_vectorizer.fit_transform(data_samples)

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, 
                               learning_method='online', learning_offset=5,
                               random_state=0)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=5,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [35]:
dataset.data[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [36]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: edu com mail send list cs message uk ac sun navy mit org stuff soon news address article rules request
Topic #1: mr bios 000 armenian armenians home turkish local like bus water numbers rate hot azerbaijan control dot equipment don armenia
Topic #2: year years oil hit team driving night runs defense leafs run better late sold james air hope player start pretty
Topic #3: card 00 car new engine power drivers driver sale monitor apple computer performance price gas video offer software brake sell
Topic #4: health hiv aids disease medical care display information research national study drug 1993 april page new service cost public need
Topic #5: israel win jews men attacks israeli able text policy soldiers conference members allow women jewish blood different mike people small
Topic #6: game play team greek opinions players mark goal pittsburgh new nhl hockey eric win games player early got playing bob
Topic #7: 500 board bike car cars color insurance model 