<a href="https://colab.research.google.com/github/NDsasuke/Classification-Regression-Clustering/blob/main/Clustering/Text_Clustering_for_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import necessary libraries

In [9]:
import gensim
from gensim import corpora
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

  and should_run_async(code)


Preprocessing

In [10]:
## Download necessary nltk data
nltk.download('stopwords')
nltk.download('wordnet')

# Load data
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Preprocess data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in documents]

# Prepare corpus
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model Building

In [11]:

# Train LDA model
lda = gensim.models.ldamodel.LdaModel

ldamodel = lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

# Print the topics
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)


  and should_run_async(code)


(0, '0.006*"window" + 0.006*"use" + 0.005*"file" + 0.005*"get" + 0.005*"one"')
(1, '0.025*"1" + 0.025*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.015*"0" + 0.015*"2" + 0.009*"3"')
(2, '0.008*"one" + 0.008*"would" + 0.006*"people" + 0.005*"think" + 0.005*"like"')
(3, '0.030*"x" + 0.005*"key" + 0.005*"file" + 0.004*"program" + 0.004*"space"')
(4, '0.020*"drive" + 0.009*"israel" + 0.007*"israeli" + 0.007*"scsi" + 0.006*"jew"')


In [12]:
#!pip install pyLDAvis


  and should_run_async(code)


In [13]:
# Import necessary libraries
from collections import Counter
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
vis_data = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.display(vis_data)

# Compute the topic distribution of the corpus
corpus_topics = [max(ldamodel[doc], key=lambda x: x[1])[0] for doc in doc_term_matrix]
corpus_topic_counts = Counter(corpus_topics)
for topic, count in corpus_topic_counts.items():
    print(f"Topic {topic} occurs in {count} documents")

# Find the most representative document for each topic
for i in range(ldamodel.num_topics):
    topic_props = [(doc_id, prop) for doc_id, topics in enumerate(ldamodel[doc_term_matrix]) for topic_id, prop in topics if topic_id == i]
    most_rep_doc_id, _ = max(topic_props, key=lambda x: x[1])
    print(f"Most representative document for topic {i}: {documents[most_rep_doc_id]}")



  and should_run_async(code)


Topic 2 occurs in 6242 documents
Topic 0 occurs in 3965 documents
Topic 3 occurs in 595 documents
Topic 1 occurs in 371 documents
Topic 4 occurs in 141 documents
Most representative document for topic 0: I have posted disp135.zip to alt.binaries.pictures.utilities


******   You may distribute this program freely for non-commercial use
         if no fee is gained.
******   There is no warranty. The author is not responsible for any
         damage caused by this program.


Important changes since version 1.30:
    Fix bugs in file management system (file displaying).
    Improve file management system (more user-friendly).
    Fix bug in XPM version 3 reading.
    Fix bugs in TARGA reading/writng.
    Fix bug in GEM/IMG reading.
    Add support for PCX and GEM/IMG writing.
    Auto-skip macbinary header.


(1) Introduction:
  This program can let you READ, WRITE and DISPLAY images with different
  formats. It also let you do some special effects(ROTATION, DITHERING ....)
  on image. I