
## Topic Modelling using LDA, LSA, and NMF
## Aim: To identify the topics that best describe a set of documents.
## Dataset: Newspaper Articles




In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Fetch the NLTK resources used by the preprocessing cell further down.
nltk.download("stopwords")  # English stopword list read via stopwords.words("english")
nltk.download("punkt")  # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')  # Open Multilingual Wordnet; presumably required by the WordNet loader -- TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Toy corpus: six short news-style snippets that serve as the input documents.
# The strings are raw data and are cleaned (lower-cased, tokenized, lemmatized)
# in the preprocessing cell, so they are intentionally left exactly as collected.
doc1 = "Budget 2023: Tax exemption removed in insurance policies with premium over Rs 5 lakh"
doc2 = "Only humans can do what other humans have never done before.Creativity & ChatGPT - The only engineering course"
doc3 = "ChatGPT maker OpenAI releases tool to identify AI-written text.It can mislabel both AI-generated and human-written text, and it can also be evaded with minor edits.OpenAI's AI Text Classifier can help to detect AI-generated content, but it is not 100% accurate and can make mistakes."
doc4 = "In Budget Health allocation puts focus on pharma research, collaborative R&D at ICMR labs. Microsoft Research Proposes BioGPT: A Domain-Specific Generative Transformer Language Model Pre-Trained on Large-Scale Biomedical Literature.With recent technological breakthroughs, researchers have started employing several machine learning techniques on the abundance of biomedical data that is available. Using techniques like text mining and knowledge extraction on biomedical literature has been demonstrated to be crucial in developing new medications, clinical therapy, pathology research, etc."
doc5 = "The Kannada Movie was really good. The reviews for the moview were quite good.The 2023 Budget was balanced and prudnet with no negatives and surprises."
doc6 = "This was one of the best books I had read. The book was the best seller.  Most of the movies released on Netflix and Amazon are good to watch."

Budget
movie
technology

In [None]:
# Gather the six documents into one list; the preprocessing loop iterates over it.
corpus = [doc1, doc2, doc3, doc4, doc5, doc6]

In [None]:
# Text preprocessing: turn every raw document in `corpus` into one
# space-separated string of lower-cased, stopword-free, lemmatized tokens.
cleaned_data = []

lemma = WordNetLemmatizer()

# Keep the stopword collection under its own name. Rebinding `stopwords`
# would shadow the imported NLTK corpus reader, so re-running this cell would
# fail with "'list' object has no attribute 'words'". A set also makes the
# membership test below O(1) instead of a linear scan of the list.
english_stopwords = set(stopwords.words("english"))

for text in corpus:
    text = re.sub(r"https?://\S+", "", text)  # remove http and https links
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # keep only letters and digits
    tokens = nltk.word_tokenize(text.lower())  # tokenization
    # Drop stopwords BEFORE lemmatizing. Lemmatizing first turns "was" into
    # "wa" and "has" into "ha"; those forms no longer match the stopword list
    # and end up polluting the vocabulary and the topics.
    tokens = [word for word in tokens if word not in english_stopwords]
    tokens = [lemma.lemmatize(word) for word in tokens]  # lemmatization
    cleaned_data.append(" ".join(tokens))

In [None]:
# Display the preprocessed documents (one cleaned string per input document).
cleaned_data

['budget 2023 tax exemption removed insurance policy premium r 5 lakh',
 'human human never done creativity chatgpt engineering course',
 'chatgpt maker openai release tool identify ai written text mislabel ai generated human written text also evaded minor edits openai ai text classifier help detect ai generated content 100 accurate make mistake',
 'budget health allocation put focus pharma research collaborative r icmr lab microsoft research proposes biogpt domain specific generative transformer language model pre trained large scale biomedical literature recent technological breakthrough researcher started employing several machine learning technique abundance biomedical data available using technique like text mining knowledge extraction biomedical literature ha demonstrated crucial developing new medication clinical therapy pathology research etc',
 'kannada movie wa really good review moview quite good 2023 budget wa balanced prudnet negative surprise',
 'wa one best book read boo

In [None]:
# Flatten the cleaned documents into a single token list, then count how
# often each token occurs across the whole corpus.
all_tokens = [
    token
    for document in cleaned_data
    for token in document.lower().split()
]
frequent_words = nltk.FreqDist(all_tokens)
frequent_words

FreqDist({'ai': 4, 'text': 4, 'wa': 4, 'budget': 3, 'human': 3, 'research': 3, 'biomedical': 3, 'good': 3, '2023': 2, 'r': 2, ...})

Implementing LDA using Sklearn

In [None]:
# Build a TF-IDF representation of the cleaned corpus, capped at 50 features.
# Learning the vocabulary/IDF weights and encoding the documents are done as
# two explicit steps.
tfidf_vectorizer = TfidfVectorizer(max_features=50)
tfidf_vectorizer.fit(cleaned_data)
tfidf_vectors = tfidf_vectorizer.transform(cleaned_data)

In [None]:
# Feature names learned by the vectorizer; used later to map the columns of
# the topic-word matrix back to words.
vocab_tfidf = tfidf_vectorizer.get_feature_names_out()
vocab_tfidf

In [None]:
# Fit a 3-topic LDA model on the document-term matrix.
# random_state is pinned so that the topic numbering and the top words are
# the same on every run; without it each execution can produce a different
# topic order, which makes any written interpretation of "Topic 0/1/2" stale.
lda_model = LatentDirichletAllocation(n_components=3, max_iter=20, random_state=42)
topics1 = lda_model.fit_transform(tfidf_vectors)  # document-topic distribution, one row per document
topic1_words = lda_model.components_  # topic-word weights, one row per topic
topic1_words

array([[0.5649975 , 0.96050474, 0.33451166, 0.33406526, 0.33451166,
        0.52628774, 0.45731556, 0.64573686, 0.80773644, 0.44796189,
        0.33407779, 0.48853981, 0.48853981, 0.48853981, 0.33402993,
        0.56764122, 0.6173765 , 0.6173765 , 0.33453815, 0.33492572,
        0.33402993, 0.33453815, 0.64573686, 0.33402993, 0.33402993,
        0.33465832, 0.33402993, 0.33465832, 0.33402993, 0.6173765 ,
        0.6173765 , 0.48853981, 0.33406526, 0.33402993, 0.33453815,
        0.33402993, 0.33402993, 0.33402993, 0.6173765 , 0.33465832,
        0.33407779, 0.33402993, 0.72134068, 0.33402993, 0.48853981,
        0.33402993, 0.33402993, 0.33402993, 0.80332501, 0.64573686],
       [0.73635882, 0.33424455, 0.33426657, 0.79391318, 0.33426657,
        0.7826417 , 0.33566313, 0.33428495, 0.33527925, 0.33555508,
        0.63986215, 0.33422689, 0.33422689, 0.33422689, 0.48593461,
        0.3352933 , 0.33438336, 0.33438336, 0.3342875 , 0.33483383,
        0.48593461, 0.3342875 , 0.33428495, 0.4

In [None]:
# Show the topic mixture of the first document as percentages.
first_doc_distribution = topics1[0]
for topic_index, weight in enumerate(first_doc_distribution):
    print("Topic: ", topic_index, "-", weight * 100, "%")

Topic:  0 - 10.535887966614272 %
Topic:  1 - 78.73557778656493 %
Topic:  2 - 10.728534246820807 %


In [None]:
# Print the highest-weighted words of every topic.
n_words_in_each_topic = 6
for topic_dist in topic1_words:
    # argsort is ascending, so reverse it and keep the first N indices.
    # The previous slice `[:-N:-1]` stopped one element early and printed
    # only N-1 words per topic.
    top_indices = np.argsort(topic_dist)[::-1][:n_words_in_each_topic]
    topic_words = np.array(vocab_tfidf)[top_indices]
    print(topic_words)

['ai' 'good' 'wa' 'text' 'written']
['policy' 'tax' 'premium' 'biomedical' 'research']
['human' 'best' 'book' 'never' 'wa']


In [None]:
# Assign each document to its dominant topic: take the column with the
# largest weight in every row of the document-topic matrix.
doc_to_topics = lda_model.transform(tfidf_vectors)
dominant_topics = doc_to_topics.argmax(axis=1)
for n, topic_doc in enumerate(dominant_topics):
    print("Document:", n + 1, "--> Topic", topic_doc)

Document: 1 --> Topic 1
Document: 2 --> Topic 2
Document: 3 --> Topic 0
Document: 4 --> Topic 1
Document: 5 --> Topic 0
Document: 6 --> Topic 2


# Conclusion:
###  The given corpus consists of news articles belonging to Economy, Technology and Entertainment. Topic 0, as modelled by the algorithm, consists of technology-related words, Topic 1 relates to Economy, and Topic 2 relates to a mixture of both. Thus, each of the 6 documents has been assigned to one of the three topics. Document 1 is related to the economy, and the topic assigned to it is also Economy.
