# Mount the Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!python -m pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [3]:
!python -m pip install --user spacy==3.1.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

## 1. Import the required libraries

In [4]:
!python -m pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [5]:
import re
import spacy

import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [6]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

import numpy as np
import pandas as pd

import gensim
from gensim import corpora

from warnings import filterwarnings
filterwarnings('ignore')

In [7]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'
print ('D1: ',D1,'\nD2: ',D2,'\nD3: ',D3,'\nD4: ',D4,'\nD5: ',D5, end = "\n",)
# Combine all the documents into a list:
corpus = [D1, D2, D3, D4, D5]
print( "Corpus: ", corpus)

D1:  I want to watch a movie this weekend. 
D2:  I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton. 
D3:  I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch. 
D4:  Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long! 
D5:  This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.
Corpus:  ['I want to watch a movie this weekend.', 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.', 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.', 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!', 'This blueberry milks

## 2. Text Preprocessing

In [10]:
stop_loss_words = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean_data(doc):
  # Convert text into lower case and split into words
  stop_free_word = " ".join([i for i in doc.lower().split() if i not in stop_loss_words])

  # Remove stop words if present
  remove_stop_words = ''.join(ch for ch in stop_free_word if ch not in exclude)  

  # Remove punctuations, symbols and special characters and normalize the text
  normalize_text = " ".join(lemma.lemmatize(word) for word in remove_stop_words.split())  
  return normalize_text

# Clean data is stored in a new list
clean_corpus = [clean_data(doc).split() for doc in corpus]
print("Clean corpus: ", clean_corpus)

Clean corpus:  [['want', 'watch', 'movie', 'weekend'], ['went', 'shopping', 'yesterday', 'new', 'zealand', 'world', 'test', 'championship', 'beating', 'india', 'eight', 'wicket', 'southampton'], ['don’t', 'watch', 'cricket', 'netflix', 'amazon', 'prime', 'good', 'movie', 'watch'], ['movie', 'nice', 'way', 'chill', 'however', 'time', 'would', 'like', 'paint', 'read', 'good', 'book', 'it’s', 'long'], ['blueberry', 'milkshake', 'good', 'try', 'reading', 'dr', 'joe', 'dispenza’s', 'book', 'work', 'gamechanger', 'book', 'helped', 'learn', 'much', 'thought', 'impact', 'biology', 'rewire', 'brain']]


## 3. Creating Document Term Matrix

In [11]:
dict_ = corpora.Dictionary(clean_corpus)
print(dict_)

Dictionary(52 unique tokens: ['movie', 'want', 'watch', 'weekend', 'beating']...)


In [12]:
for a in dict_.values():
  print(a)

movie
want
watch
weekend
beating
championship
eight
india
new
shopping
southampton
test
went
wicket
world
yesterday
zealand
amazon
cricket
don’t
good
netflix
prime
book
chill
however
it’s
like
long
nice
paint
read
time
way
would
biology
blueberry
brain
dispenza’s
dr
gamechanger
helped
impact
joe
learn
milkshake
much
reading
rewire
thought
try
work


In [12]:
# Converting the corpus into Document Term Matrix using the dictionary 
document_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]
document_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(0, 1), (2, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)],
 [(0, 1),
  (20, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1)],
 [(20, 1),
  (23, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1)]]

## 4. LDA Algorithm

In [13]:
lda_algorithm = gensim.models.ldamodel.LdaModel

In [14]:
ldamodel = lda_algorithm(document_term_matrix, num_topics=6, 
                         id2word = dict_, passes=1, random_state=0, eval_every=None)


In [15]:
ldamodel.print_topics()

[(0,
  '0.136*"watch" + 0.084*"movie" + 0.060*"good" + 0.060*"amazon" + 0.060*"netflix" + 0.060*"prime" + 0.060*"cricket" + 0.060*"don’t" + 0.029*"want" + 0.027*"weekend"'),
 (1,
  '0.074*"weekend" + 0.070*"want" + 0.065*"movie" + 0.063*"watch" + 0.015*"don’t" + 0.015*"good" + 0.015*"book" + 0.015*"cricket" + 0.015*"prime" + 0.015*"new"'),
 (2,
  '0.052*"book" + 0.028*"good" + 0.028*"blueberry" + 0.028*"try" + 0.028*"helped" + 0.028*"reading" + 0.028*"joe" + 0.028*"work" + 0.028*"dispenza’s" + 0.028*"thought"'),
 (3,
  '0.020*"championship" + 0.020*"wicket" + 0.020*"southampton" + 0.020*"yesterday" + 0.020*"went" + 0.020*"india" + 0.020*"shopping" + 0.020*"new" + 0.020*"world" + 0.020*"zealand"'),
 (4,
  '0.051*"movie" + 0.051*"book" + 0.051*"paint" + 0.051*"long" + 0.051*"like" + 0.051*"read" + 0.051*"time" + 0.051*"chill" + 0.051*"would" + 0.051*"however"'),
 (5,
  '0.019*"weekend" + 0.019*"watch" + 0.019*"movie" + 0.019*"good" + 0.019*"want" + 0.019*"don’t" + 0.019*"book" + 0.019*"c

### 4.1) Extracting Topics from the Corpus

In [16]:
print(ldamodel.print_topics(num_topics=6, num_words=5))

[(0, '0.136*"watch" + 0.084*"movie" + 0.060*"good" + 0.060*"amazon" + 0.060*"netflix"'), (1, '0.074*"weekend" + 0.070*"want" + 0.065*"movie" + 0.063*"watch" + 0.015*"don’t"'), (2, '0.052*"book" + 0.028*"good" + 0.028*"blueberry" + 0.028*"try" + 0.028*"helped"'), (3, '0.020*"championship" + 0.020*"wicket" + 0.020*"southampton" + 0.020*"yesterday" + 0.020*"went"'), (4, '0.051*"movie" + 0.051*"book" + 0.051*"paint" + 0.051*"long" + 0.051*"like"'), (5, '0.019*"weekend" + 0.019*"watch" + 0.019*"movie" + 0.019*"good" + 0.019*"want"')]


### 4.2) Assigning the topics to the documents

In [18]:
counter = 0
for i in ldamodel[document_term_matrix]:
  print("Document : ", counter, i)
  counter += 1

Document :  0 [(0, 0.2822866), (1, 0.5842826), (2, 0.03333387), (3, 0.033336017), (4, 0.03342482), (5, 0.03333605)]
Document :  1 [(0, 0.011905481), (1, 0.011906071), (2, 0.94046956), (3, 0.011907104), (4, 0.011905371), (5, 0.011906449)]
Document :  2 [(0, 0.91660607), (1, 0.016687103), (2, 0.016676264), (3, 0.016667519), (4, 0.016695492), (5, 0.016667534)]
Document :  3 [(0, 0.011138659), (1, 0.011119502), (2, 0.011127129), (3, 0.011111923), (4, 0.9443909), (5, 0.011111934)]
Document :  4 [(2, 0.9602895)]
