# Topic modeling
LDA (Latent Dirichlet Allocation)

NMF (Non-negative Matrix Factorization)

# LDA

In [None]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used below: the English stop-word corpus and the
# 'punkt_tab' tokenizer data (used by word_tokenize).
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Toy corpus: five short sentences about machine learning and AI.
documnets = [
    # machine learning / deep learning
    "machine learning is amazing for predictive analysis.",
    "deep learning and neural network are a subset of machine learning.",
    # natural language processing and AI in general
    "natural language processing is a part of ai.",
    "ai includes vision,robotics,and natural understanding.",
    # generative AI
    "generative ai like chatgpt creates human like text.",
]

In [None]:
# Preprocessing: tokenize each document, keep only alphanumeric tokens,
# lowercase them, and drop English stop words.
stop_words = set(stopwords.words('english'))

processed_doc = []
for doc in documnets:
    kept_tokens = []
    for token in word_tokenize(doc):
        lowered = token.lower()
        # Punctuation tokens fail isalnum(); stop words are matched in lowercase.
        if token.isalnum() and lowered not in stop_words:
            kept_tokens.append(lowered)
    processed_doc.append(kept_tokens)

In [None]:
# Display the tokenized, stop-word-filtered documents (one token list per document).
processed_doc

[['machine', 'learning', 'amazing', 'predictive', 'analysis'],
 ['deep', 'learning', 'neural', 'network', 'subset', 'machine', 'learning'],
 ['natural', 'language', 'processing', 'part', 'ai'],
 ['ai', 'includes', 'vision', 'robotics', 'natural', 'understanding'],
 ['generative', 'ai', 'like', 'chatgpt', 'creates', 'human', 'like', 'text']]

In [None]:
# Build the vocabulary and the bag-of-words corpus.
# Dictionary assigns a unique integer ID to each distinct token.
dictionary = Dictionary(processed_doc)
# doc2bow turns each token list into a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, processed_doc))

In [None]:
# Display the Dictionary object (this shows its repr, not the token-to-id mapping).
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f1c9fbbacd0>

In [None]:
# Display the bag-of-words corpus: for each document, a list of (token_id, count) pairs.
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(2, 2), (3, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(9, 1), (11, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(9, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1)]]

In [None]:
# LDA model
# Fit a 3-topic LDA model on the bag-of-words corpus, making 10 passes over it.
# random_state is fixed so the learned topics are reproducible from run to run
# (the NMF model in this notebook is seeded with the same value).
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    passes=10,
    random_state=42,
)

In [None]:
# display topics
# print_topics returns (topic_id, "weight*word + ...") pairs. Unpack each pair
# so the id is printed once, instead of looking it up again with list.index()
# and printing the raw tuple.
topics = lda_model.print_topics(num_words=5)
for topic_id, topic_terms in topics:
    print(f'Topic {topic_id}: {topic_terms}')

Topic 0: (0, '0.123*"natural" + 0.123*"ai" + 0.070*"processing" + 0.070*"language" + 0.070*"includes"')
Topic 1: (1, '0.166*"learning" + 0.117*"machine" + 0.067*"amazing" + 0.067*"analysis" + 0.067*"predictive"')
Topic 2: (2, '0.145*"like" + 0.084*"ai" + 0.083*"generative" + 0.083*"chatgpt" + 0.083*"text"')


# NMF

NMF topic modeling with scikit-learn

NMF: Non-negative Matrix Factorization

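# Key features
Non-negativity constraint: every entry of the factor matrices W and H is greater than or equal to zero.

Factorization: the document-term matrix V is approximated by the product W × H.

Objective function: W and H are chosen to minimize the reconstruction error between V and W × H.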

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
# Toy corpus for the NMF section: the same five sentences used in the LDA section.
documnets = [
    # machine learning / deep learning
    "machine learning is amazing for predictive analysis.",
    "deep learning and neural network are a subset of machine learning.",
    # natural language processing and AI in general
    "natural language processing is a part of ai.",
    "ai includes vision,robotics,and natural understanding.",
    # generative AI
    "generative ai like chatgpt creates human like text.",
]

In [None]:
# TF-IDF vectorization
# Drop English stop words, terms that occur in more than 95% of the documents,
# and terms that occur in fewer than 2 documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tfidf_matrix = vectorizer.fit_transform(documnets)

In [None]:
# nmf model
# Factorize the TF-IDF matrix into 3 topics; the seed keeps the result reproducible.
nmf_model = NMF(n_components=3, random_state=42)
# fit() returns the fitted estimator itself, not a factor matrix. Use
# fit_transform() so that `w` actually holds the document-topic weights W.
w = nmf_model.fit_transform(tfidf_matrix)
# components_ is the topic-term matrix H: one row per topic, one column per term.
h = nmf_model.components_
h

array([[1.22800047, 0.        , 0.        , 2.6085111 ],
       [0.        , 1.4518495 , 1.04643391, 0.        ],
       [0.92880534, 0.        , 0.        , 0.        ]])

In [None]:
# display topics
feature_names = vectorizer.get_feature_names_out()
top_n = 5  # maximum number of terms to show per topic
for topic_idx, topic in enumerate(h):
    # argsort() is ascending, so reverse it to list the heaviest terms first.
    top_terms = [feature_names[i] for i in topic.argsort()[::-1][:top_n]]
    # Join with a visible separator; an empty separator glues the words together.
    print(f"topic {topic_idx}:{', '.join(top_terms)}")

topic 0:learningmachineainatural
topic 1:ainaturalmachinelearning
topic 2:learningmachinenaturalai


In [None]:
# Display the vocabulary terms returned by the TF-IDF vectorizer.
feature_names

array(['ai', 'learning', 'machine', 'natural'], dtype=object)