In [9]:
import pandas as pd
import numpy as np

import gensim

from nltk.stem import WordNetLemmatizer # import nltk  --> nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.datasets import fetch_20newsgroups

# Load the train and test splits of the 20 Newsgroups corpus (shuffled).
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

In [10]:
# Pair each newsgroup name with its integer target id.
list(enumerate(newsgroups_train.target_names))

[(0, 'alt.atheism'),
 (1, 'comp.graphics'),
 (2, 'comp.os.ms-windows.misc'),
 (3, 'comp.sys.ibm.pc.hardware'),
 (4, 'comp.sys.mac.hardware'),
 (5, 'comp.windows.x'),
 (6, 'misc.forsale'),
 (7, 'rec.autos'),
 (8, 'rec.motorcycles'),
 (9, 'rec.sport.baseball'),
 (10, 'rec.sport.hockey'),
 (11, 'sci.crypt'),
 (12, 'sci.electronics'),
 (13, 'sci.med'),
 (14, 'sci.space'),
 (15, 'soc.religion.christian'),
 (16, 'talk.politics.guns'),
 (17, 'talk.politics.mideast'),
 (18, 'talk.politics.misc'),
 (19, 'talk.religion.misc')]

In [11]:
# Show the first training post as plain text.
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [12]:
def lemmatize(word):
    """Return the WordNet lemma of `word`, looked up as a verb (pos='v').

    Requires the NLTK 'wordnet' corpus (see the note next to the import).
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos='v')

# Tokenize and lemmatize
def preprocess(text):
    """Tokenize `text`, drop stopwords, and lemmatize what remains.

    Tokens come from gensim's `simple_preprocess` with `min_len = 3`; any token
    found in gensim's STOPWORDS set is discarded and every surviving token is
    passed through `lemmatize`.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text, min_len=3)
    return [lemmatize(token) for token in tokens if token not in stopwords]

In [13]:
# Sanity-check `preprocess` on one short sentence.
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
# Bare expression: it is displayed because ast_node_interactivity is set to 'all' above.
doc_sample.split(' ')

print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 


['This',
 'disk',
 'has',
 'failed',
 'many',
 'times.',
 'I',
 'would',
 'like',
 'to',
 'get',
 'it',
 'replaced.']



Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replace']


In [15]:
# Number of documents in the training split.
len(newsgroups_train.data)

11314

In [19]:
# Same post as printed earlier, shown as its repr so the newline escapes are visible.
newsgroups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [20]:
# Tokenize and lemmatize every training post: one list of tokens per document.
# A comprehension avoids leaving a stray loop variable in the kernel namespace.
processed_docs = [preprocess(doc) for doc in newsgroups_train.data]

# Peek at the tokens of the first post.
print(processed_docs[0])

['lerxst', 'wam', 'umd', 'edu', 'thing', 'subject', 'car', 'nntp', 'post', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'maryland', 'college', 'park', 'line', 'wonder', 'enlighten', 'car', 'saw', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'doors', 'small', 'addition', 'bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'engine', 'specs', 'years', 'production', 'car', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']


In [22]:
# Build the token <-> integer id mapping from the preprocessed training documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [23]:
# Preview the first 11 (id, token) pairs of the dictionary.
for count, (index, word) in enumerate(dictionary.iteritems(), start=1):
    print(index, word)
    if count > 10:
        break

0 addition
1 body
2 bricklin
3 bring
4 bumper
5 call
6 car
7 college
8 day
9 door
10 doors


In [9]:
# Prune the vocabulary in place (parameter meanings per the gensim documentation):
#   no_below = 15     -> drop tokens that occur in fewer than 15 documents
#   no_above = 0.1    -> drop tokens that occur in more than 10% of the documents
#   keep_n = 100000   -> after that, keep at most the 100000 most frequent tokens
# NOTE(review): this cell's execution count (In [9]) is lower than that of the cell that
# builds `dictionary` (In [22]), and the saved outputs further down still contain rare
# tokens such as "bricklin" and header words such as "edu"/"subject". It looks like the
# filter was not applied to the dictionary those results came from -- confirm with
# Restart & Run All.
dictionary.filter_extremes(no_below = 15, no_above = 0.1, keep_n = 100000)

In [24]:
# Convert every preprocessed document into a bag-of-words: a list of (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

# Bag-of-words of the first document, displayed as the raw list of pairs.
dictionary.doc2bow(processed_docs[0])

# The same document, printed one token per line with its text and count.
document_num = 0
bow_doc_x = bow_corpus[document_num]

for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary[word_id],
                                                     word_count))

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 5),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 2),
 (22, 1),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 2),
 (45, 1),
 (46, 2),
 (47, 1),
 (48, 1)]

Word 0 ("addition") appears 1 time.
Word 1 ("body") appears 1 time.
Word 2 ("bricklin") appears 1 time.
Word 3 ("bring") appears 1 time.
Word 4 ("bumper") appears 1 time.
Word 5 ("call") appears 1 time.
Word 6 ("car") appears 5 time.
Word 7 ("college") appears 1 time.
Word 8 ("day") appears 1 time.
Word 9 ("door") appears 1 time.
Word 10 ("doors") appears 1 time.
Word 11 ("early") appears 1 time.
Word 12 ("edu") appears 2 time.
Word 13 ("engine") appears 1 time.
Word 14 ("enlighten") appears 1 time.
Word 15 ("funky") appears 1 time.
Word 16 ("history") appears 1 time.
Word 17 ("host") appears 1 time.
Word 18 ("info") appears 1 time.
Word 19 ("know") appears 1 time.
Word 20 ("late") appears 1 time.
Word 21 ("lerxst") appears 2 time.
Word 22 ("line") appears 1 time.
Word 23 ("look") appears 2 time.
Word 24 ("mail") appears 1 time.
Word 25 ("maryland") appears 1 time.
Word 26 ("model") appears 1 time.
Word 27 ("neighborhood") appears 1 time.
Word 28 ("nntp") appears 1 time.
Word 29 ("orga

8 topics in the document corpus

num_topics is the number of latent topics to extract from the training corpus.
id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
workers is the number of extra worker processes to use for parallelization. By default, all available cores minus one are used.
passes is the number of passes through the corpus during training.
alpha and eta are hyperparameters that affect the sparsity of the document-topic (theta) and topic-word (lambda) distributions.

In [28]:
# Train an 8-topic LDA model on the bag-of-words corpus using 2 worker processes
# and 10 passes over the corpus.
# random_state seeds the model's random number generator; without it every run starts
# from a different random initialization and the topics below change between runs.
# NOTE(review): with several workers the result may still vary slightly between runs;
# use workers = 1 (or gensim.models.LdaModel) if exact repeatability is required.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 8, id2word = dictionary,
                                       passes = 10, workers = 2, random_state = 42)

In [32]:
# Print every topic id followed by its weighted top words, separated by blank lines.
topic_template = "Topic: {} \nWords: {}"
for idx, topic in lda_model.print_topics():
    print(topic_template.format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.064*"max" + 0.009*"gun" + 0.006*"bhj" + 0.006*"giz" + 0.004*"new" + 0.003*"qax" + 0.003*"period" + 0.003*"pts" + 0.003*"bxn" + 0.002*"gld"


Topic: 1 
Words: 0.009*"com" + 0.006*"say" + 0.004*"line" + 0.004*"people" + 0.004*"subject" + 0.004*"organization" + 0.004*"go" + 0.004*"know" + 0.003*"write" + 0.003*"edu"


Topic: 2 
Words: 0.009*"com" + 0.007*"edu" + 0.007*"write" + 0.006*"article" + 0.006*"line" + 0.006*"subject" + 0.005*"like" + 0.005*"organization" + 0.004*"car" + 0.004*"turkish"


Topic: 3 
Words: 0.023*"edu" + 0.012*"line" + 0.011*"subject" + 0.010*"organization" + 0.009*"write" + 0.008*"com" + 0.008*"post" + 0.008*"article" + 0.006*"university" + 0.005*"host"


Topic: 4 
Words: 0.009*"key" + 0.006*"drive" + 0.006*"line" + 0.006*"use" + 0.006*"com" + 0.005*"chip" + 0.005*"edu" + 0.005*"subject" + 0.004*"organization" + 0.004*"know"


Topic: 5 
Words: 0.016*"edu" + 0.012*"com" + 0.012*"line" + 0.010*"subject" + 0.009*"organization" + 0.007*"file" + 0.007

Testing the model on unseen data

In [33]:
# Take one post (index 100) from the test split to use as an unseen document.
unseen_document = newsgroups_test.data[100]
print(unseen_document)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [34]:
# Preprocess the unseen post the same way as the training data and convert it to bag-of-words.
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# (topic_id, score) pairs for the post, ordered from lowest to highest score.
topic_scores = sorted(lda_model[bow_vector], key = lambda pair: pair[1])

for index, score in topic_scores:
    # Show each score next to the 5 highest-weighted words of its topic.
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.1862390786409378	 Topic: 0.023*"edu" + 0.012*"line" + 0.011*"subject" + 0.010*"organization" + 0.009*"write"
Score: 0.7974031567573547	 Topic: 0.016*"edu" + 0.012*"com" + 0.012*"line" + 0.010*"subject" + 0.009*"organization"


In [35]:
# Label id of the same test post; presumably an index into target_names (listed in the next cell).
print(newsgroups_test.target[100])

2


In [16]:
# Pair each newsgroup name with its integer target id (to look up the label printed above).
list(enumerate(newsgroups_train.target_names))

[(0, 'alt.atheism'),
 (1, 'comp.graphics'),
 (2, 'comp.os.ms-windows.misc'),
 (3, 'comp.sys.ibm.pc.hardware'),
 (4, 'comp.sys.mac.hardware'),
 (5, 'comp.windows.x'),
 (6, 'misc.forsale'),
 (7, 'rec.autos'),
 (8, 'rec.motorcycles'),
 (9, 'rec.sport.baseball'),
 (10, 'rec.sport.hockey'),
 (11, 'sci.crypt'),
 (12, 'sci.electronics'),
 (13, 'sci.med'),
 (14, 'sci.space'),
 (15, 'soc.religion.christian'),
 (16, 'talk.politics.guns'),
 (17, 'talk.politics.mideast'),
 (18, 'talk.politics.misc'),
 (19, 'talk.religion.misc')]

## Coherence is used for evaluating the topics obtained

Coherence measures the relative distance between the words within a topic. 
It is rare to see a coherence of 1 or above 0.9 unless the words being measured are identical or form a bigram. For example, "United" and "States" would likely return a coherence score of about 0.94, and "hero" and "hero" would return a coherence of 1. The overall coherence score of a topic is the average of the distances between its words. 

- .3 is bad

- .4 is low

- .55 is okay

- .65 might be as good as it is going to get

- .7 is nice

- .8 is unlikely and

- .9 is probably wrong

In [19]:
from gensim.models.coherencemodel import CoherenceModel

# Score the trained topics against the preprocessed training texts.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
)
# Displayed as the cell result: the overall coherence score of the model.
coherence_model.get_coherence()

0.5612157832614577