In [14]:
# We will be using the 20-Newsgroups dataset for this exercise.
# This version of the dataset contains about 11k newsgroup posts from 20 different topics.

In [15]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [16]:
# Import Dataset
# Reads newsgroups.json (path relative to the working directory) into a DataFrame.
df = pd.read_json('newsgroups.json')
# List the distinct newsgroup labels, then preview the first rows.
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [17]:
# Pull the raw post texts out of the frame as a plain Python list
data = df["content"].tolist()

In [18]:
# Remove e-mail addresses: any whitespace-free run containing "@",
# plus at most one trailing whitespace character.
# Raw string so that \S and \s reach the regex engine instead of being
# treated as (invalid) Python string escapes.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

In [19]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into one space.
# Raw string so that \s is a regex class, not an invalid Python string escape.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

In [20]:
# Drop single quotes / apostrophes (e.g. "where's" becomes "wheres")
data = [sent.replace("'", "") for sent in data]

In [21]:
# Inspect the first cleaned post
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [22]:
#After removing the emails and extra spaces, the text still looks messy. 
#It is not ready for the LDA to consume. You need to break down each sentence into a
#list of words through tokenization, while clearing up all the messy text in the process.

In [23]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [24]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of word tokens.

    Args:
        sentences: Iterable of documents; each item is coerced with ``str()``.

    Yields:
        The token list returned by ``gensim.utils.simple_preprocess`` for
        each document, in input order.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by the tokenizer itself.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

In [25]:
# Materialize the generator so the token lists can be reused by later cells
data_words = list(sent_to_words(data))

In [26]:
# Show the token list of the first document
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [27]:
#Bigrams are two words frequently occurring together in the document. Trigrams are three words frequently occurring together.
#
#Some examples in our corpus are: ‘front_bumper’, ‘oil_leak’, ‘maryland_college_park’ etc.

In [28]:
# Build the bigram and trigram models
# First pass: learn phrases from the raw token lists (a higher threshold gives fewer phrases).
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# Second pass: learn phrases over the bigram-transformed corpus.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

In [29]:
# Faster way to get a sentence clubbed as a trigram/bigram:
# wrap each trained Phrases model in a Phraser.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrases_model)
    for phrases_model in (bigram, trigram)
)

In [30]:
# See trigram example: apply the bigram model, then the trigram model, to the first document
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [31]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()``
            and ``simple_preprocess`` before filtering.

    Returns:
        A list with one token list per document, containing only the tokens
        that are not in the module-level ``stop_words`` collection.
    """
    # Build the lookup once per call: constant-time membership instead of
    # scanning the stop-word list for every token.
    blocked = frozenset(stop_words)
    # NOTE(review): when ``doc`` is already a token list, str(doc) produces its
    # repr and simple_preprocess re-tokenizes it. Kept for compatibility;
    # presumably tokens outside simple_preprocess's length limits are dropped.
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document through ``bigram_mod`` and return the results as a list."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``."""
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with spaces and passed to the
    module-level spaCy pipeline ``nlp``.
    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Values of ``token.pos_`` to keep.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per document.
    """
    # Snapshot the tags once: constant-time membership for every token, and
    # the shared default list is only ever read.
    keep = frozenset(allowed_postags)
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep])
    return texts_out

In [32]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/madsh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [33]:
# NLTK Stop words
from nltk.corpus import stopwords
# NLTK's English list plus a few extra words to filter out of this corpus
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [34]:
# Remove Stop Words
# (filters every tokenized document against stop_words)
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# (only the bigram model is applied here; make_trigrams is not used)
data_words_bigrams = make_bigrams(data_words_nostops)

In [35]:
# Run in terminal or command prompt
# python3 -m spacy download en_core_web_sm

In [36]:
import spacy

2022-05-06 10:47:28.010324: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-06 10:47:28.010350: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [37]:
# Initialize the spaCy English model. Lemmatization only reads POS tags and
# lemmas, so the dependency parser and NER components are disabled for efficiency.
# Install the model first with: python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [39]:
# Do lemmatization keeping only noun, adj, vb, adv
# (input is the stop-word-filtered, bigram-merged token lists)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [40]:
# Might take a little while to get here.. A couple of minutes perhaps...

In [41]:
# Show the lemmatized tokens of the first document
print(data_lemmatized[:1])

[['thing', 'car', 'nntp_poste', 'host', 'rac_wam', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [42]:
#The two main inputs to the LDA topic model are the dictionary(id2word) 
# and the corpus. Let’s create them.

In [43]:
# Create Dictionary: maps an integer id to each distinct token in data_lemmatized
id2word = corpora.Dictionary(data_lemmatized)

In [44]:
# Create Corpus: alias the lemmatized documents; the bag-of-words corpus is built from `texts`
texts = data_lemmatized

In [45]:
# Term Document Frequency: one bag-of-words vector per document
corpus = list(map(id2word.doc2bow, texts))

In [46]:
# View the bag-of-words vector of the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]]


In [47]:
#Gensim creates a unique id for each word in the document. 
#The produced corpus shown above is a mapping of (word_id, word_frequency).

#For example, (0, 1) above implies, word id 0 occurs once in the first document.
#Likewise, word id 4 occurs five times, word id 6 occurs twice, and so on.

In [48]:
# If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.
id2word[0]

'addition'

In [49]:
# Or, you can see a human-readable form of the corpus itself:
# each (word_id, count) pair is shown as (word, count).
[[(id2word[word_id], count) for word_id, count in doc_bow] for doc_bow in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front_bumper', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('rac_wam', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('wonder', 1),
  ('year', 1)]]

In [50]:
#We have everything required to train the LDA model. 
#In addition to the corpus and dictionary, you need to provide the number of topics as well.

#chunksize is the number of documents to be used in each training chunk.
#update_every determines how often the model parameters should be updated 
#and passes is the total number of training passes.

In [51]:
# Build LDA model
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    chunksize=100,      # documents per training chunk
    update_every=1,     # how often the model parameters are updated
    passes=10,          # total number of training passes
    alpha='auto',
    random_state=100,
    per_word_topics=True,
)

In [52]:
# Might take a little while to get here.. A couple of minutes perhaps...

In [53]:
# Print the top keywords per topic (the model was trained with num_topics=20)
pprint(lda_model.print_topics())
# Index the trained model with the whole corpus (presumably per-document topic output; not used further in this section)
doc_lda = lda_model[corpus]

[(0,
  '0.022*"get" + 0.022*"know" + 0.020*"go" + 0.019*"good" + 0.019*"think" + '
  '0.018*"make" + 0.017*"time" + 0.017*"say" + 0.016*"see" + 0.015*"well"'),
 (1,
  '0.059*"amount" + 0.041*"insurance" + 0.041*"pack" + 0.040*"warranty" + '
  '0.034*"dog" + 0.031*"water" + 0.030*"workstation" + 0.029*"damage" + '
  '0.025*"probe" + 0.021*"mph"'),
 (2,
  '0.163*"line" + 0.112*"write" + 0.101*"organization" + 0.084*"article" + '
  '0.064*"nntp_poste" + 0.058*"host" + 0.032*"reply" + 0.029*"thank" + '
  '0.020*"university" + 0.015*"post"'),
 (3,
  '0.022*"people" + 0.022*"believe" + 0.019*"evidence" + 0.019*"reason" + '
  '0.018*"say" + 0.013*"claim" + 0.013*"question" + 0.012*"mean" + '
  '0.011*"fact" + 0.011*"sense"'),
 (4,
  '0.114*"team" + 0.108*"game" + 0.075*"play" + 0.042*"year" + 0.037*"season" '
  '+ 0.029*"trade" + 0.028*"score" + 0.022*"division" + 0.019*"pen" + '
  '0.018*"baseball"'),
 (5,
  '0.049*"file" + 0.047*"program" + 0.045*"window" + 0.024*"software" + '
  '0.023*"im

In [54]:
# How to interpret this?
#
# Looking at one topic you have something like:
#  '0.131*"space" + 0.039*"earth" + 0.035*"launch" + 0.032*"nhl" + '
#  '0.031*"orbit" + 0.030*"mission" + 0.029*"moon" + 0.027*"mouse" + '
#  '0.025*"satellite" + 0.023*"period"'

#It means the top 10 keywords that contribute to this topic are: ‘space’, ‘earth’, ‘launch’.. and so on
#and the weight of ‘space’ on this topic is 0.131.

#The weights reflect how important a keyword is to that topic.

#Looking at these keywords, can you guess what this topic could be?
#You may summarise it as either ‘space’ or ‘missions to space’.