# In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.summarization.keypoints import keywords
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# Load the dataset: fetch repository entry 113 and keep its features and
# targets as X and y.
# NOTE(review): `fetch_ucirepo` is not imported anywhere in the visible code --
# presumably it comes from the `ucimlrepo` package; confirm it is imported
# before this point.
twenty_newsgroups = fetch_ucirepo(id=113)
X = twenty_newsgroups.data.features
y = twenty_newsgroups.data.targets

# Shared text-preprocessing resources, built once and reused by
# preprocess_text for every document.
# stop_words is a set so the per-token membership test is cheap.
# NOTE(review): presumably the NLTK stopwords/wordnet data must already be
# downloaded for these calls to work -- confirm in the runtime environment.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: str) -> str:
    """Normalise one document into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are in ``stop_words`` or have two or fewer characters are dropped, and
    the remaining tokens are lemmatised with ``lemmatizer``.

    Args:
        text: The raw document text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]
    # The separator literal used to be unterminated (a syntax error); join on
    # a single space so the result is a whitespace-delimited string.
    return ' '.join(lemmas)

# Normalise every raw document with preprocess_text before feature extraction.
# NOTE(review): this iterates X directly; if X is a pandas DataFrame, iteration
# yields column labels rather than row text -- confirm X is a sequence of
# document strings.
X_preprocessed = list(map(preprocess_text, X))

# Turn the cleaned documents into a TF-IDF weighted document-term matrix.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X_preprocessed)

# Apply LDA for topic modeling
# gensim's Dictionary and doc2bow operate on lists of tokens, while
# X_preprocessed holds one string per document (the form handed to
# TfidfVectorizer). Passing a plain string to doc2bow raises TypeError, so
# split each document on whitespace into its tokens first.
tokenized_docs = [doc.split() for doc in X_preprocessed]
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=15)
# Indexing the model with the corpus yields the topic distribution of each
# document (a list of (topic_id, probability) pairs per document).
topics = lda_model[corpus]

# Group the TF-IDF document vectors into 50 clusters with K-means.
# fit() returns the fitted estimator, so construction and fitting are chained.
kmeans = KMeans(n_clusters=50).fit(X_tfidf)
labels = kmeans.labels_

# Show the cluster index assigned to each document.
print(labels)

# Print the top keywords for each topic
# `topics` holds the per-document topic distributions, not the topics
# themselves, and the `keywords` summarizer expects raw text rather than
# (topic_id, probability) pairs. Read each topic's highest-weighted terms
# directly from the trained model instead.
# NOTE(review): the file-level `gensim.summarization` import is no longer
# needed by this loop; that package was removed in gensim 4 -- confirm and
# drop the import.
for topic_id in range(lda_model.num_topics):
    # show_topic returns (term, weight) pairs ordered by descending weight.
    top_terms = [term for term, _weight in lda_model.show_topic(topic_id, topn=10)]
    print(topic_id)
    print(top_terms)