In [1]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 458 kB/s  eta 0:00:01   |█████▏                          | 2.0 MB 8.0 MB/s eta 0:00:02     |███████████████▌                | 6.2 MB 8.0 MB/s eta 0:00:01     |████████████████▋               | 6.6 MB 8.0 MB/s eta 0:00:01     |████████████████████▎           | 8.1 MB 12.3 MB/s eta 0:00:01     |███████████████████████████▊    | 11.1 MB 12.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'

In [5]:
# combining all the documents into a list:
corpus = [D1, D2, D3, D4, D5]

In [6]:
corpus

['I want to watch a movie this weekend.',
 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.',
 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.',
 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!',
 'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.']

### 2. Text Preprocessing

Steps to preprocess text data:

1. Convert the text into lowercase
2. Split text into words
3. Remove the stop  words
3. Remove the Punctuation, any symbols and special characters
4. Lemmatization

In [7]:
# Apply Preprocessing on the Corpus

# stop  words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

In [8]:
clean_corpus

[['want', 'watch', 'movie', 'weekend'],
 ['went',
  'shopping',
  'yesterday',
  'new',
  'zealand',
  'world',
  'test',
  'championship',
  'beating',
  'india',
  'eight',
  'wicket',
  'southampton'],
 ['don’t',
  'watch',
  'cricket',
  'netflix',
  'amazon',
  'prime',
  'good',
  'movie',
  'watch'],
 ['movie',
  'nice',
  'way',
  'chill',
  'however',
  'time',
  'would',
  'like',
  'paint',
  'read',
  'good',
  'book',
  'it’s',
  'long'],
 ['blueberry',
  'milkshake',
  'good',
  'try',
  'reading',
  'dr',
  'joe',
  'dispenza’s',
  'book',
  'work',
  'gamechanger',
  'book',
  'helped',
  'learn',
  'much',
  'thought',
  'impact',
  'biology',
  'rewire',
  'brain']]

### 3. Convert Text into Numerical Representation

Converting the clean preprocessed corpus to array

In [9]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

tf_idf_vectorizer

In [10]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

In [11]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf



['amazon',
 'beating',
 'biology',
 'blueberry',
 'book',
 'brain',
 'championship',
 'chill',
 'cricket',
 'dispenza’s',
 'don’t',
 'dr',
 'eight',
 'gamechanger',
 'good',
 'helped',
 'however',
 'impact',
 'india',
 'it’s',
 'joe',
 'learn',
 'like',
 'long',
 'milkshake',
 'movie',
 'much',
 'netflix',
 'new',
 'nice',
 'paint',
 'prime',
 'read',
 'reading',
 'rewire',
 'shopping',
 'southampton',
 'test',
 'thought',
 'time',
 'try',
 'want',
 'watch',
 'way',
 'weekend',
 'went',
 'wicket',
 'work',
 'world',
 'would',
 'yesterday',
 'zealand']

In [12]:
display(len(vocab_tf_idf))

52

### 4. Implementation of LDA

To implement LDA, pass the corpus: document-term matrix to the model. We had above obtained the unique words of vocabulary using both TF-IDF and Count Vectorizer. We can continue with either as have the same unique words in both the obtained vocabularies.

In [13]:
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 6, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

### 4a. Retrieve the Topics

In [14]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 5

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['movie' 'good' 'watch' 'book']
Topic 2 ['zealand' 'test' 'beating' 'world']
Topic 3 ['weekend' 'want' 'watch' 'movie']
Topic 4 ['watch' 'amazon' 'cricket' 'don’t']
Topic 5 ['movie' 'good' 'watch' 'book']
Topic 6 ['however' 'chill' 'would' 'it’s']


### 4b. Annotating the topics the documents

In [15]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 2
Document 2  -- Topic: 1
Document 3  -- Topic: 3
Document 4  -- Topic: 5
Document 5  -- Topic: 2
