# Topic Modelling using LSA and NMF

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
doc1 = "Budget 2023: Tax exemption removed in insurance policies with premium over Rs 5 lakh"
doc2 = "Only humans can do what other humans have never done before.Creativity & ChatGPT - The only engineering course"
doc3 = "ChatGPT maker OpenAI releases tool to identify AI-written text.It can mislabel both AI-generated and human-written text, and it can also be evaded with minor edits.OpenAI's AI Text Classifier can help to detect AI-generated content, but it is not 100% accurate and can make mistakes."
doc4 = "In Budget Health allocation puts focus on pharma research, collaborative R&D at ICMR labs. Microsoft Research Proposes BioGPT: A Domain-Specific Generative Transformer Language Model Pre-Trained on Large-Scale Biomedical Literature.With recent technological breakthroughs, researchers have started employing several machine learning techniques on the abundance of biomedical data that is available. Using techniques like text mining and knowledge extraction on biomedical literature has been demonstrated to be crucial in developing new medications, clinical therapy, pathology research, etc."
doc5 = "The Kannada Movie was really good. The reviews for the moview were quite good.The 2023 Budget was balanced and prudnet with no negatives and surprises."
doc6 = "This was one of the best books I had read. The book was the best seller.  Most of the movies released on Netflix and Amazon are good to watch."

In [None]:
corpus = [doc1, doc2, doc3, doc4, doc5, doc6]

In [None]:
cleaned_data = []

lemma =WordNetLemmatizer()
stopwords =stopwords.words("english")
for text in corpus:
  text = re.sub(r"https\S+", "", text) # removing links
  text = re.sub("[^a-zA-Z0-9]", " ", text) # including only alphabets and numericals
  text = nltk.word_tokenize(text.lower()) # tokenization
  text = [lemma.lemmatize(word) for word in text] # lemmatization
  text = [word for word in text if word not in stopwords] # stopwords
  text = " ".join(text)
  cleaned_data.append(text)

In [None]:
cleaned_data

['budget 2023 tax exemption removed insurance policy premium r 5 lakh',
 'human human never done creativity chatgpt engineering course',
 'chatgpt maker openai release tool identify ai written text mislabel ai generated human written text also evaded minor edits openai ai text classifier help detect ai generated content 100 accurate make mistake',
 'budget health allocation put focus pharma research collaborative r icmr lab microsoft research proposes biogpt domain specific generative transformer language model pre trained large scale biomedical literature recent technological breakthrough researcher started employing several machine learning technique abundance biomedical data available using technique like text mining knowledge extraction biomedical literature ha demonstrated crucial developing new medication clinical therapy pathology research etc',
 'kannada movie wa really good review moview quite good 2023 budget wa balanced prudnet negative surprise',
 'wa one best book read boo

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=50)
tfidf_vectors = tfidf_vectorizer.fit_transform(cleaned_data)
tfidf_vectors

<6x50 sparse matrix of type '<class 'numpy.float64'>'
	with 59 stored elements in Compressed Sparse Row format>

In [None]:
vocab_tfidf = tfidf_vectorizer.get_feature_names_out()
vocab_tfidf

array(['2023', 'ai', 'best', 'biomedical', 'book', 'budget', 'chatgpt',
       'generated', 'good', 'human', 'literature', 'minor', 'mislabel',
       'mistake', 'model', 'movie', 'moview', 'negative', 'netflix',
       'never', 'new', 'one', 'openai', 'pathology', 'pharma', 'policy',
       'pre', 'premium', 'proposes', 'prudnet', 'quite', 'release',
       'research', 'scale', 'seller', 'several', 'specific', 'started',
       'surprise', 'tax', 'technique', 'technological', 'text', 'therapy',
       'tool', 'trained', 'transformer', 'using', 'wa', 'written'],
      dtype=object)

In [None]:
vocab_tfidf.shape

(50,)

In [None]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=3,n_iter = 10)
lsa_model

TruncatedSVD(n_components=3, n_iter=10)

In [None]:
lsa_topics = lsa_model.fit_transform(tfidf_vectors)
lsa_topics

array([[ 0.35653114,  0.03564047,  0.64059766],
       [ 0.00478586,  0.722693  , -0.22474923],
       [ 0.01212778,  0.76003658, -0.04494174],
       [ 0.0806577 ,  0.23816777,  0.67071025],
       [ 0.8304777 , -0.01812772, -0.02987864],
       [ 0.74940539, -0.03941603, -0.3416799 ]])

In [None]:
print("Document 1")
for i ,topic in enumerate(lsa_topics[0]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 2")
for i ,topic in enumerate(lsa_topics[1]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 3")
for i ,topic in enumerate(lsa_topics[2]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 4")
for i ,topic in enumerate(lsa_topics[3]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 5")
for i ,topic in enumerate(lsa_topics[4]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 6")
for i ,topic in enumerate(lsa_topics[5]):
  print("Topic: ", i, "-", topic*100, "%")

Document 1
Topic:  0 - 35.65311405818915 %
Topic:  1 - 3.5640465490021653 %
Topic:  2 - 64.0597662372234 %

Document 2
Topic:  0 - 0.47858564929111647 %
Topic:  1 - 72.26930027876217 %
Topic:  2 - -22.47492261538427 %

Document 3
Topic:  0 - 1.2127776004955098 %
Topic:  1 - 76.0036579026904 %
Topic:  2 - -4.494174147136258 %

Document 4
Topic:  0 - 8.0657696932792 %
Topic:  1 - 23.81677690333808 %
Topic:  2 - 67.07102480865753 %

Document 5
Topic:  0 - 83.04776973399164 %
Topic:  1 - -1.812772231334458 %
Topic:  2 - -2.9878637094128564 %

Document 6
Topic:  0 - 74.9405392119326 %
Topic:  1 - -3.9416034575524486 %
Topic:  2 - -34.16798960636378 %


In [None]:
lsa_model.components_.shape

(3, 50)

In [None]:
lsa_model.components_

array([[ 0.24433545,  0.00551049,  0.27907645,  0.02690797,  0.27907645,
         0.21249435,  0.00248628,  0.00275524,  0.39591018,  0.00384288,
         0.01793864,  0.00137762,  0.00137762,  0.00137762,  0.00896932,
         0.25516676,  0.17163532,  0.17163532,  0.13953822,  0.00165437,
         0.00896932,  0.13953822,  0.00275524,  0.00896932,  0.00896932,
         0.12632954,  0.00896932,  0.12632954,  0.00896932,  0.17163532,
         0.17163532,  0.00137762,  0.02690797,  0.00896932,  0.13953822,
         0.00896932,  0.00896932,  0.00896932,  0.17163532,  0.12632954,
         0.01793864,  0.00896932,  0.01074398,  0.00896932,  0.00137762,
         0.00896932,  0.00896932,  0.00896932,  0.51033352,  0.00275524],
       [ 0.00869804,  0.4124129 , -0.01752945,  0.09488708, -0.01752945,
         0.02924065,  0.32919154,  0.20620645, -0.01452494,  0.57383695,
         0.06325805,  0.10310323,  0.10310323,  0.10310323,  0.03162903,
        -0.01085607, -0.00447415, -0.00447415, -0.

In [None]:
n_words_in_each_topic = 6
for i,  topic_dist in enumerate(lsa_model.components_):
  sorted_topic_dist = np.argsort(topic_dist)
  topic_words = np.array(vocab_tfidf)[sorted_topic_dist]
  #topic_words = topic_words[:-n_words_in_each_topic:-1]
  print(topic_words)

['minor' 'release' 'mistake' 'mislabel' 'tool' 'never' 'chatgpt' 'written'
 'generated' 'openai' 'human' 'ai' 'therapy' 'technological' 'pre'
 'trained' 'transformer' 'started' 'specific' 'several' 'scale' 'using'
 'proposes' 'pharma' 'new' 'model' 'pathology' 'text' 'technique'
 'literature' 'research' 'biomedical' 'policy' 'premium' 'tax' 'seller'
 'netflix' 'one' 'quite' 'surprise' 'prudnet' 'moview' 'negative' 'budget'
 '2023' 'movie' 'best' 'book' 'good' 'wa']
['wa' 'book' 'best' 'good' 'movie' 'one' 'seller' 'netflix' 'prudnet'
 'quite' 'surprise' 'negative' 'moview' '2023' 'policy' 'premium' 'tax'
 'budget' 'proposes' 'started' 'technological' 'scale' 'therapy' 'trained'
 'transformer' 'using' 'several' 'specific' 'pharma' 'pathology' 'pre'
 'model' 'new' 'technique' 'literature' 'research' 'biomedical' 'mistake'
 'mislabel' 'minor' 'release' 'tool' 'openai' 'generated' 'written' 'text'
 'never' 'chatgpt' 'ai' 'human']
['human' 'best' 'book' 'wa' 'never' 'chatgpt' 'seller' 'one'

In [None]:
doc_to_topics = lsa_model.transform(tfidf_vectors)
for n in range(doc_to_topics.shape[0]):
  topic_doc = doc_to_topics[n].argmax()
  print("Document:" , n+1, "--> Topic", topic_doc)

Document: 1 --> Topic 2
Document: 2 --> Topic 1
Document: 3 --> Topic 1
Document: 4 --> Topic 2
Document: 5 --> Topic 0
Document: 6 --> Topic 0


# Non-Negative Matrix Factorization

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf_model =NMF(n_components=3)
nmf_model

NMF(n_components=3)

In [None]:
nmf_topics = nmf_model.fit_transform(tfidf_vectors)
nmf_topics



array([[2.89146547e-002, 0.00000000e+000, 7.64162921e-001],
       [3.12344427e-103, 7.23390848e-001, 0.00000000e+000],
       [0.00000000e+000, 7.21872960e-001, 1.79077870e-002],
       [0.00000000e+000, 1.55321286e-002, 7.37613604e-001],
       [7.30416901e-001, 0.00000000e+000, 8.16698977e-002],
       [7.50637503e-001, 0.00000000e+000, 0.00000000e+000]])

In [None]:
print("Document 1")
for i ,topic in enumerate(nmf_topics[0]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 2")
for i ,topic in enumerate(nmf_topics[1]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 3")
for i ,topic in enumerate(nmf_topics[2]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 4")
for i ,topic in enumerate(nmf_topics[3]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 5")
for i ,topic in enumerate(nmf_topics[4]):
  print("Topic: ", i, "-", topic*100, "%")
print("\nDocument 6")
for i ,topic in enumerate(nmf_topics[5]):
  print("Topic: ", i, "-", topic*100, "%")

Document 1
Topic:  0 - 2.8914654691891775 %
Topic:  1 - 0.0 %
Topic:  2 - 76.41629206429705 %

Document 2
Topic:  0 - 3.123444268123779e-101 %
Topic:  1 - 72.33908475724385 %
Topic:  2 - 0.0 %

Document 3
Topic:  0 - 0.0 %
Topic:  1 - 72.18729595000072 %
Topic:  2 - 1.7907787012075504 %

Document 4
Topic:  0 - 0.0 %
Topic:  1 - 1.5532128569415156 %
Topic:  2 - 73.7613603856882 %

Document 5
Topic:  0 - 73.04169014688738 %
Topic:  1 - 0.0 %
Topic:  2 - 8.16698976564774 %

Document 6
Topic:  0 - 75.06375033269497 %
Topic:  1 - 0.0 %
Topic:  2 - 0.0 %


In [None]:
# n_words_in_each_topic = 6
for i,  topic_dist in enumerate(nmf_model.components_):
  sorted_topic_dist = np.argsort(topic_dist)
  topic_words = np.array(vocab_tfidf)[sorted_topic_dist]
  #topic_words = topic_words[:-n_words_in_each_topic:-1]
  print(topic_words)

['pharma' 'policy' 'pre' 'premium' 'proposes' 'release' 'research' 'scale'
 'several' 'specific' 'started' 'tax' 'technique' 'technological' 'text'
 'therapy' 'tool' 'trained' 'transformer' 'using' 'pathology' 'openai'
 'written' 'new' 'ai' 'biomedical' 'generated' 'literature' 'mislabel'
 'mistake' 'model' 'minor' 'chatgpt' 'never' 'human' 'budget' '2023'
 'seller' 'one' 'netflix' 'quite' 'surprise' 'negative' 'moview' 'prudnet'
 'movie' 'book' 'best' 'good' 'wa']
['2023' 'policy' 'pre' 'premium' 'proposes' 'prudnet' 'quite' 'research'
 'scale' 'seller' 'wa' 'several' 'started' 'surprise' 'tax' 'technique'
 'technological' 'therapy' 'trained' 'transformer' 'using' 'specific'
 'pathology' 'pharma' 'one' 'best' 'biomedical' 'book' 'budget' 'good'
 'model' 'literature' 'movie' 'new' 'moview' 'netflix' 'negative'
 'mistake' 'mislabel' 'minor' 'tool' 'release' 'written' 'generated'
 'openai' 'text' 'never' 'chatgpt' 'ai' 'human']
['seller' 'best' 'wa' 'book' 'chatgpt' 'one' 'never' 'human'

In [None]:
doc_to_topics = nmf_model.transform(tfidf_vectors)
for n in range(doc_to_topics.shape[0]):
  topic_doc = doc_to_topics[n].argmax()
  print("Document:" , n+1, "--> Topic", topic_doc)

Document: 1 --> Topic 2
Document: 2 --> Topic 1
Document: 3 --> Topic 1
Document: 4 --> Topic 2
Document: 5 --> Topic 0
Document: 6 --> Topic 0


### The topic which has the highest probability is the topic of that document.The Topics assigned to the documents are different from LDA and the LSA model.LSA is based on the principal componenets and the LDA is a probabilistic approach. The  results obtained from both LSA and NMF are the same.