# Mount the Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!python -m pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [3]:
!python -m pip install --user spacy==3.1.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

## 1. Import the required libraries

In [4]:
import re
import spacy

In [5]:
!pip install -U wn==0.0.22

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [6]:
!pip show nltk

Name: nltk
Version: 3.7
Summary: Natural Language Toolkit
Home-page: https://www.nltk.org/
Author: NLTK Team
Author-email: nltk.team@gmail.com
License: Apache License, Version 2.0
Location: /usr/local/lib/python3.7/dist-packages
Requires: click, joblib, regex, tqdm
Required-by: textblob


In [7]:
!python --version

Python 3.7.13


In [8]:
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [9]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pandas

from sklearn.decomposition import LatentDirichletAllocation

from warnings import filterwarnings
filterwarnings('ignore')

In [10]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
nlp=spacy.load('en_core_web_sm')

In [12]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'
print ('D1: ',D1,'\nD2: ',D2,'\nD3: ',D3,'\nD4: ',D4,'\nD5: ',D5, end = "\n",)

# Combine all the documents into a list:
corpus = [D1, D2, D3, D4, D5]
print( "Corpus: ", corpus)

D1:  I want to watch a movie this weekend. 
D2:  I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton. 
D3:  I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch. 
D4:  Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long! 
D5:  This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.
Corpus:  ['I want to watch a movie this weekend.', 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.', 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.', 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!', 'This blueberry milks

## 2. Text Preprocessing

In [13]:
stop_loss_words = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean_data(doc):
  # Convert text into lower case and split into words
  stop_free_word = " ".join([i for i in doc.lower().split() if i not in stop_loss_words])

  # Remove stop words if present
  remove_stop_words = ''.join(ch for ch in stop_free_word if ch not in exclude)  

  # Remove punctuations, symbols and special characters and normalize the text
  normalize_text = " ".join(lemma.lemmatize(word) for word in remove_stop_words.split())  
  return normalize_text

# Clean data is stored in a new list
clean_corpus = [clean_data(doc).split() for doc in corpus]
print("Clean corpus: ", clean_corpus)

Clean corpus:  [['want', 'watch', 'movie', 'weekend'], ['went', 'shopping', 'yesterday', 'new', 'zealand', 'world', 'test', 'championship', 'beating', 'india', 'eight', 'wicket', 'southampton'], ['don’t', 'watch', 'cricket', 'netflix', 'amazon', 'prime', 'good', 'movie', 'watch'], ['movie', 'nice', 'way', 'chill', 'however', 'time', 'would', 'like', 'paint', 'read', 'good', 'book', 'it’s', 'long'], ['blueberry', 'milkshake', 'good', 'try', 'reading', 'dr', 'joe', 'dispenza’s', 'book', 'work', 'gamechanger', 'book', 'helped', 'learn', 'much', 'thought', 'impact', 'biology', 'rewire', 'brain']]


## 3. Convert Text into Numerical Representation

In [14]:
# Converting text into numerical representation using tf-idf vectorizer
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False) 
print('TF-IDF Vectorizer: ',tf_idf_vectorizer)
# Converting text into numerical representation using count vectorizer
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
print('Count Vectorizer: ',cv_vectorizer)

TF-IDF Vectorizer:  TfidfVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f68e2033950>)
Count Vectorizer:  CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f68e68437a0>)


In [15]:
# Array from TF-IDF Vectorizer 
tf_idf_array = tf_idf_vectorizer.fit_transform(clean_corpus)
print(tf_idf_array)

  (0, 44)	0.5680140774328015
  (0, 25)	0.38040564760664297
  (0, 42)	0.45827018116532225
  (0, 41)	0.5680140774328015
  (1, 36)	0.2773500981126146
  (1, 46)	0.2773500981126146
  (1, 12)	0.2773500981126146
  (1, 18)	0.2773500981126146
  (1, 1)	0.2773500981126146
  (1, 6)	0.2773500981126146
  (1, 37)	0.2773500981126146
  (1, 48)	0.2773500981126146
  (1, 51)	0.2773500981126146
  (1, 28)	0.2773500981126146
  (1, 50)	0.2773500981126146
  (1, 35)	0.2773500981126146
  (1, 45)	0.2773500981126146
  (2, 14)	0.22969985625059522
  (2, 31)	0.34298321477483373
  (2, 0)	0.34298321477483373
  (2, 27)	0.34298321477483373
  (2, 8)	0.34298321477483373
  (2, 10)	0.34298321477483373
  (2, 25)	0.22969985625059522
  (2, 42)	0.5534333961648077
  :	:
  (3, 16)	0.2823018492676415
  (3, 7)	0.2823018492676415
  (3, 43)	0.2823018492676415
  (3, 29)	0.2823018492676415
  (3, 14)	0.18906083855626746
  (3, 25)	0.18906083855626746
  (4, 5)	0.22331568324278164
  (4, 34)	0.22331568324278164
  (4, 2)	0.22331568324278164
 

In [16]:
tf_idf_array

<5x52 sparse matrix of type '<class 'numpy.float64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [17]:
# Array from Count Vectorizer 
cv_array = cv_vectorizer.fit_transform(clean_corpus)
print(cv_array)

  (0, 41)	1
  (0, 42)	1
  (0, 25)	1
  (0, 44)	1
  (1, 45)	1
  (1, 35)	1
  (1, 50)	1
  (1, 28)	1
  (1, 51)	1
  (1, 48)	1
  (1, 37)	1
  (1, 6)	1
  (1, 1)	1
  (1, 18)	1
  (1, 12)	1
  (1, 46)	1
  (1, 36)	1
  (2, 42)	2
  (2, 25)	1
  (2, 10)	1
  (2, 8)	1
  (2, 27)	1
  (2, 0)	1
  (2, 31)	1
  (2, 14)	1
  :	:
  (3, 22)	1
  (3, 30)	1
  (3, 32)	1
  (3, 4)	1
  (3, 19)	1
  (3, 23)	1
  (4, 14)	1
  (4, 4)	2
  (4, 3)	1
  (4, 24)	1
  (4, 40)	1
  (4, 33)	1
  (4, 11)	1
  (4, 20)	1
  (4, 9)	1
  (4, 47)	1
  (4, 13)	1
  (4, 15)	1
  (4, 21)	1
  (4, 26)	1
  (4, 38)	1
  (4, 17)	1
  (4, 2)	1
  (4, 34)	1
  (4, 5)	1


In [18]:
cv_array

<5x52 sparse matrix of type '<class 'numpy.int64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [19]:
# Creating vocabulary array from tf-idf
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()
print(vocab_tf_idf)

['amazon', 'beating', 'biology', 'blueberry', 'book', 'brain', 'championship', 'chill', 'cricket', 'dispenza’s', 'don’t', 'dr', 'eight', 'gamechanger', 'good', 'helped', 'however', 'impact', 'india', 'it’s', 'joe', 'learn', 'like', 'long', 'milkshake', 'movie', 'much', 'netflix', 'new', 'nice', 'paint', 'prime', 'read', 'reading', 'rewire', 'shopping', 'southampton', 'test', 'thought', 'time', 'try', 'want', 'watch', 'way', 'weekend', 'went', 'wicket', 'work', 'world', 'would', 'yesterday', 'zealand']


In [20]:
vocab_tf_idf

['amazon',
 'beating',
 'biology',
 'blueberry',
 'book',
 'brain',
 'championship',
 'chill',
 'cricket',
 'dispenza’s',
 'don’t',
 'dr',
 'eight',
 'gamechanger',
 'good',
 'helped',
 'however',
 'impact',
 'india',
 'it’s',
 'joe',
 'learn',
 'like',
 'long',
 'milkshake',
 'movie',
 'much',
 'netflix',
 'new',
 'nice',
 'paint',
 'prime',
 'read',
 'reading',
 'rewire',
 'shopping',
 'southampton',
 'test',
 'thought',
 'time',
 'try',
 'want',
 'watch',
 'way',
 'weekend',
 'went',
 'wicket',
 'work',
 'world',
 'would',
 'yesterday',
 'zealand']

In [21]:
# Creating vocabulary array from cv
vocab_cv = cv_vectorizer.get_feature_names()
print(vocab_cv)

['amazon', 'beating', 'biology', 'blueberry', 'book', 'brain', 'championship', 'chill', 'cricket', 'dispenza’s', 'don’t', 'dr', 'eight', 'gamechanger', 'good', 'helped', 'however', 'impact', 'india', 'it’s', 'joe', 'learn', 'like', 'long', 'milkshake', 'movie', 'much', 'netflix', 'new', 'nice', 'paint', 'prime', 'read', 'reading', 'rewire', 'shopping', 'southampton', 'test', 'thought', 'time', 'try', 'want', 'watch', 'way', 'weekend', 'went', 'wicket', 'work', 'world', 'would', 'yesterday', 'zealand']


In [22]:
vocab_cv

['amazon',
 'beating',
 'biology',
 'blueberry',
 'book',
 'brain',
 'championship',
 'chill',
 'cricket',
 'dispenza’s',
 'don’t',
 'dr',
 'eight',
 'gamechanger',
 'good',
 'helped',
 'however',
 'impact',
 'india',
 'it’s',
 'joe',
 'learn',
 'like',
 'long',
 'milkshake',
 'movie',
 'much',
 'netflix',
 'new',
 'nice',
 'paint',
 'prime',
 'read',
 'reading',
 'rewire',
 'shopping',
 'southampton',
 'test',
 'thought',
 'time',
 'try',
 'want',
 'watch',
 'way',
 'weekend',
 'went',
 'wicket',
 'work',
 'world',
 'would',
 'yesterday',
 'zealand']

In [23]:
display("Length of vocabulary array using tf_idf: ", len(vocab_tf_idf))
display("Length of vocabulary array using cv: ",len(vocab_cv))

'Length of vocabulary array using tf_idf: '

52

'Length of vocabulary array using cv: '

52

## 4. LDA Algorithm

In [24]:
# Create object for the LDA class 
lda_algorithm = LatentDirichletAllocation(n_components = 6, max_iter = 20, random_state = 20)
print("LDA Algorithm : ",lda_algorithm)
# fit transform on model on our tf_idf_vectorizer
X_topics = lda_algorithm.fit_transform(tf_idf_array)
print("X Topics : ",X_topics)

# .components_ gives us our topic distribution 
topic_words = lda_algorithm.components_
print( 'Topic Words : ', topic_words)

LDA Algorithm :  LatentDirichletAllocation(max_iter=20, n_components=6, random_state=20)
X Topics :  [[0.05603663 0.05603398 0.71937475 0.056434   0.05603663 0.05608399]
 [0.03619889 0.81901639 0.03619418 0.03619616 0.03619889 0.03619549]
 [0.04471891 0.0447161  0.04487445 0.77619722 0.04471891 0.04477441]
 [0.03538801 0.03538454 0.03550291 0.03544894 0.03538801 0.82288759]
 [0.03142519 0.03142035 0.84281408 0.03145543 0.03142519 0.03145977]]
Topic Words :  [[0.16667075 0.1666704  0.16667106 0.16667106 0.16667403 0.16667106
  0.1666704  0.16667027 0.16667075 0.16667106 0.16667075 0.16667106
  0.1666704  0.16667106 0.16668256 0.16667106 0.16667027 0.16667106
  0.1666704  0.16667027 0.16667106 0.16667106 0.16667027 0.16667027
  0.16667106 0.16668412 0.16667106 0.16667075 0.1666704  0.16667027
  0.16667027 0.16667075 0.16667027 0.16667106 0.16667106 0.1666704
  0.1666704  0.1666704  0.16667106 0.16667027 0.16667106 0.16667255
  0.16667636 0.16667027 0.16667255 0.1666704  0.1666704  0.1666

## 4.1) Retrieve the Topics

In [25]:
# Initialize the number of words 
n_top_words = 5
for i, topic_list in enumerate (topic_words):

  # Sorting an array or a list or the matrix according to their values
  sorted_topic_list = np.argsort(topic_list)

  # View the actual words present in those indexes
  topic_words = np.array(vocab_tf_idf)[sorted_topic_list]

  # topic_words variable contains the Topics and respective words present in those Topics
  topic_words = topic_words[:-n_top_words:-1]

  print ("Topic", str(i+1), topic_words)

Topic 1 ['movie' 'good' 'watch' 'book']
Topic 2 ['zealand' 'test' 'beating' 'world']
Topic 3 ['weekend' 'want' 'watch' 'movie']
Topic 4 ['watch' 'amazon' 'cricket' 'don’t']
Topic 5 ['movie' 'good' 'watch' 'book']
Topic 6 ['however' 'chill' 'would' 'it’s']


### 4.2) Annotate the Topic documents

In [26]:
document_topic = lda_algorithm.transform(tf_idf_array)

for l in range(document_topic.shape[0]):
  topic_document = document_topic[l].argmax()

  print(" Document ", l+1, " --> Topic : ",topic_document )

 Document  1  --> Topic :  2
 Document  2  --> Topic :  1
 Document  3  --> Topic :  3
 Document  4  --> Topic :  5
 Document  5  --> Topic :  2
