In [6]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mageshdominator/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mageshdominator/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()

stopwords = set(stopwords.words('english'))

dataDict = {"content": newsgroups_train.data, "label": newsgroups_train.target}

data = pd.DataFrame(dataDict, columns=["content", "label"])

In [11]:
def tokenize_lemma_stopwords(text):
    tokens = nltk.tokenize.word_tokenize(text.lower()) # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()] # keep strings with only alphabets
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords

    return tokens

def dataCleaning(data):
    data["content"] = data["content"].apply(tokenize_lemma_stopwords)
    return data

In [12]:
# LDA requires (works better with) some basic level of pre-processing
# pre-process text: text data cleanining, Tokenize and remove stop words
cleanedData = dataCleaning(data)

In [15]:
X = cleanedData["content"]
y = data["label"]

# Create a dictionary for vocabulary words with it's index and count
dictionary = gensim.corpora.Dictionary(X)

In [16]:
# filter words that occurs in less than 5 documents and words that occurs in more than 50% of total documents
# keep top 100000 frequent words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [58]:
# crete bag-of-words ==> list(index, count) for words in doctionary
bow_corpus = [dictionary.doc2bow(doc) for doc in X]

In [78]:
# Create a lda model with tf-idf vectorized corpus and dictionary
# Manually pick number of topic and then based on perplexity scoring, tune the number of topics
lda_model = models.LdaModel(bow_corpus,
                                  id2word=dictionary,
                                  num_topics=8,
                                  offset=2,
                                  random_state=100,
                                  update_every=1,
                                  passes=2,
                                  alpha='auto',
                                  eta="auto",
                                  per_word_topics=True)

lda_model.save("20_news_group.model")

In [79]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.014*"key" + 0.007*"chip" + 0.006*"encryption" + 0.006*"system" + '
  '0.005*"clipper" + 0.005*"article" + 0.004*"university" + '
  '0.004*"information" + 0.004*"government" + 0.004*"time"'),
 (1,
  '0.008*"drive" + 0.007*"university" + 0.007*"window" + 0.007*"system" + '
  '0.006*"doe" + 0.005*"card" + 0.005*"thanks" + 0.005*"space" + '
  '0.004*"article" + 0.004*"computer"'),
 (2,
  '0.010*"people" + 0.006*"gun" + 0.006*"armenian" + 0.005*"time" + '
  '0.005*"article" + 0.005*"then" + 0.005*"israel" + 0.004*"war" + '
  '0.004*"government" + 0.004*"israeli"'),
 (3,
  '0.013*"game" + 0.011*"team" + 0.008*"article" + 0.007*"university" + '
  '0.006*"player" + 0.006*"time" + 0.005*"play" + 0.005*"season" + '
  '0.004*"hockey" + 0.004*"win"'),
 (4,
  '0.015*"file" + 0.010*"program" + 0.007*"entry" + 0.005*"university" + '
  '0.005*"information" + 0.005*"window" + 0.005*"system" + 0.004*"source" + '
  '0.004*"image" + 0.004*"section"'),
 (5,
  '0.011*"car" + 0.011*"article" + 0.00

In [80]:
# perplexity is a measure of uncertainity
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))

Perplexity:  -8.258929168362114


In [77]:
coherence_model_lda = models.CoherenceModel(model=lda_model, texts=X, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.42780768682023335


In [31]:
try:
    import pyLDAvis
    import pyLDAvis.gensim
except:
    !pip install pyLDAvis
    import pyLDAvis
    import pyLDAvis.genism
import matplotlib.pyplot as plt

In [82]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [84]:
vis

In [83]:
pyLDAvis.save_html(vis, "lda_vis.html")

In [56]:
# HyperPrameter tuning on LDA based on perplexity and coherence
# number of topics(K), dirichlet params alpha and beta

def compute_scores(corpus, dictionary, k, a, b):
    
    lda_model = models.LdaMulticore(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=10, 
                                   random_state=100,
                                   chunksize=10000,
                                   passes=2,
                                   alpha=a,
                                   eta=b,
                                   per_word_topics=True)
    
    perplexity = lda_model_tfidf.log_perplexity(corpus)
    coherence_model_lda = models.CoherenceModel(model=lda_model, texts=X, dictionary=dictionary, coherence='c_v')
    
    return perplexity, coherence_model_lda.get_coherence()

In [57]:
import tqdm

# Choose list of values for each hyper-params based on range and step-size
# Topics range
min_topics = 4
max_topics = 11
step_size = 2

topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.1, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.1, 1, 0.3))
beta.append('symmetric')


# Validation sets
num_of_docs = len(bow_corpus)

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 "perplexity": [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    # iterate through number of topics
    for k in tqdm.tqdm(topics_range):
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                pty, cv = compute_scores(corpus=corpus_tfidf, dictionary=dictionary, 
                                              k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results["perplexity"].append(pty)
                model_results['Coherence'].append(cv)
                print(k, a, b, pty, cv)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)












  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

4 0.1 0.1 -10.621791122495075 0.46877489825452906
4 0.1 0.5 -10.621802226856909 0.5116615654632813
4 0.1 symmetric -10.621855695955691 0.46877489825452906
4 0.5 0.1 -10.621919465778689 0.39552412317817953
4 0.5 0.5 -10.621756126118282 0.42272848201081664
4 0.5 symmetric -10.621805888653547 0.39552412317817953
4 symmetric 0.1 -10.621944517338164 0.46877489825452906
4 symmetric 0.5 -10.621816787376048 0.5116615654632813
4 symmetric symmetric -10.6218790330551 0.46877489825452906
4 asymmetric 0.1 -10.62187005553834 0.42891145463948915
4 asymmetric 0.5 -10.62178263584687 0.42992378030577727













 25%|██▌       | 1/4 [25:38<1:16:54, 1538.00s/it][A[A[A[A[A[A[A[A[A[A[A

4 asymmetric symmetric -10.62187398713996 0.42891145463948915
6 0.1 0.1 -10.621822498119576 0.46877489825452906
6 0.1 0.5 -10.621900303711367 0.5116615654632813
6 0.1 symmetric -10.621759794888924 0.46877489825452906
6 0.5 0.1 -10.62186473689666 0.39552412317817953
6 0.5 0.5 -10.621882313972272 0.42272848201081664
6 0.5 symmetric -10.621829639941199 0.39552412317817953
6 symmetric 0.1 -10.621916992793505 0.46877489825452906
6 symmetric 0.5 -10.621889197945386 0.5116615654632813
6 symmetric symmetric -10.621868205159839 0.46877489825452906
6 asymmetric 0.1 -10.62197839331956 0.42891145463948915
6 asymmetric 0.5 -10.621831957726249 0.42992378030577727













 50%|█████     | 2/4 [52:17<51:53, 1556.57s/it]  [A[A[A[A[A[A[A[A[A[A[A

6 asymmetric symmetric -10.62185720489664 0.42891145463948915
8 0.1 0.1 -10.62175969804023 0.46877489825452906
8 0.1 0.5 -10.621916688542782 0.5116615654632813
8 0.1 symmetric -10.621785717130072 0.46877489825452906
8 0.5 0.1 -10.621872486764794 0.39552412317817953
8 0.5 0.5 -10.621964813199849 0.42272848201081664
8 0.5 symmetric -10.62192316769978 0.39552412317817953
8 symmetric 0.1 -10.621897336875394 0.46877489825452906
8 symmetric 0.5 -10.621908429169757 0.5116615654632813
8 symmetric symmetric -10.62184367937248 0.46877489825452906
8 asymmetric 0.1 -10.621965477917815 0.42891145463948915
8 asymmetric 0.5 -10.621920238422057 0.42992378030577727













 75%|███████▌  | 3/4 [1:18:34<26:02, 1562.70s/it][A[A[A[A[A[A[A[A[A[A[A

8 asymmetric symmetric -10.621876542688724 0.42891145463948915
10 0.1 0.1 -10.621816120880249 0.46877489825452906
10 0.1 0.5 -10.62189918692533 0.5116615654632813
10 0.1 symmetric -10.621781784311578 0.46877489825452906
10 0.5 0.1 -10.62183157196931 0.39552412317817953
10 0.5 0.5 -10.621809598064454 0.42272848201081664
10 0.5 symmetric -10.621916312852072 0.39552412317817953
10 symmetric 0.1 -10.621824866108726 0.46877489825452906
10 symmetric 0.5 -10.62185858402522 0.5116615654632813
10 symmetric symmetric -10.621922933571568 0.46877489825452906
10 asymmetric 0.1 -10.621852440970713 0.42891145463948915
10 asymmetric 0.5 -10.62188702731714 0.42992378030577727













100%|██████████| 4/4 [1:44:48<00:00, 1572.19s/it][A[A[A[A[A[A[A[A[A[A[A

10 asymmetric symmetric -10.621869059727446 0.42891145463948915



