**Install packages**

In [None]:
#install nltk package
!pip install pandas nltk

**Load and Preprocess the Dataset**

In [None]:
# import all packages
import pandas as pd
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re
import nltk

# NLTK data needed by preprocess_text below.
nltk.download('punkt')      # Punkt tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # Punkt tables that word_tokenize loads on current NLTK releases
nltk.download('stopwords')  # stop-word lists (stopwords.words('english'))
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet; some NLTK releases look it up when WordNet is loaded

# Translation table that deletes ASCII punctuation; built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Filled on first use by _nlp_resources() so the NLTK corpora are only read once.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the (English stop-word set, WordNet lemmatizer) pair, creating it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, strip digit runs, strip ASCII punctuation,
    tokenize with NLTK, drop English stop words, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (missing values in a DataFrame column)
        are treated as empty text.

    Returns
    -------
    list of str
        The processed tokens; an empty list when nothing is left after cleaning.
    """
    # Missing values would otherwise crash on .lower(); treat them as "no text".
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = text.lower()
    text = re.sub(r'\d+', '', text)      # remove numbers
    text = text.translate(_PUNCT_TABLE)  # remove ASCII punctuation

    # Nothing but whitespace left: the tokenizer would return no tokens anyway.
    if not text.strip():
        return []

    stop_words, lemmatizer = _nlp_resources()
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# Load the dataset (JSON Lines: one record per line).
dataset = pd.read_json('/content/News_Category_Dataset_v3.json', lines=True)

# read_json already returns a DataFrame, so `df` is bound to that same object instead
# of re-wrapping it. On recent pandas versions pd.DataFrame(dataset) yields a separate
# frame, and the 'processed_text' column added below would then be missing from
# `dataset`, which the vectorization and clustering cells read.
df = dataset
print(df)
print("\n")

# Preprocess the news headline & short description as one text per article.
# fillna('') keeps a missing headline/description from turning the whole text into NaN.
combined_text = df['headline'].fillna('') + " " + df['short_description'].fillna('')
df['processed_text'] = combined_text.apply(preprocess_text)
print(df['processed_text'])

**Train Word2Vec Model**

In [None]:
from gensim.models import Word2Vec # imports the Word2Vec class from the gensim.models module
# Training corpus: one token list per article, taken from the preprocessing step.
sentences = df['processed_text'].tolist()

# Build the vocabulary and train the embeddings in a single constructor call.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every word, even those seen only once
    workers=4,        # number of training threads
)

# Persist the trained model so it can be reloaded without retraining.
w2v_model.save("news_category_w2v.model")

**Load Word2Vec Model and Vectorize New Texts**

In [None]:
from gensim.models import Word2Vec # imports the Word2Vec class from the gensim.models module
import numpy as np # imports the numpy library

w2v_model = Word2Vec.load("news_category_w2v.model")  # reload the model saved by the training cell


def vectorize_text(text, model):
    """Return the mean Word2Vec vector of a document.

    Parameters
    ----------
    text : str or iterable of str
        A raw string is first run through ``preprocess_text``. Any other value is
        treated as an already-preprocessed sequence of tokens and used as is.
    model : Word2Vec-like
        Object exposing ``wv`` (token -> vector lookup with ``key_to_index``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in the model vocabulary, or a
        zero vector of length ``model.vector_size`` when no token is known.
    """
    words = preprocess_text(text) if isinstance(text, str) else list(text)
    known_vectors = [model.wv[word] for word in words if word in model.wv.key_to_index]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


# 'processed_text' already holds token lists, so they are passed straight through.
# Joining them back into a string would send every article through preprocess_text a
# second time, which is slow and can alter tokens (a lemma produced by the first pass
# may be dropped as a stop word on the second).
dataset['vector'] = dataset['processed_text'].apply(lambda tokens: vectorize_text(tokens, w2v_model))

In [None]:
def document_vector(doc, w2v_model):
    """Average the Word2Vec vectors of the tokens in ``doc``.

    Tokens that are not in the model vocabulary are ignored. When no token of
    ``doc`` is in the vocabulary (including an empty ``doc``), a zero vector of
    length ``w2v_model.vector_size`` is returned instead.
    """
    vocabulary = w2v_model.wv.key_to_index
    known_vectors = [w2v_model.wv[token] for token in doc if token in vocabulary]
    if len(known_vectors) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known_vectors, axis=0)
# Feature matrix: one averaged word vector per article, in the row order of df.
X = np.array([
    document_vector(tokens, w2v_model)
    for tokens in df['processed_text']
])
print(X)  # show the resulting matrix

**Cluster the Articles**

In [None]:
from sklearn.cluster import KMeans # import KMeans
# 5 clusters, 10 random initialisations, fixed seed so the run is repeatable.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Stack the per-article vectors into one matrix, fit on it, and store each
# article's cluster label next to it.
dataset['cluster'] = kmeans.fit(np.vstack(dataset['vector'].values)).labels_
print(dataset['cluster'])  # cluster label per article

**Automated Evaluation of Clustering**

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
# Fit a 5-cluster KMeans on X and keep the label assigned to each row.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Silhouette Score: mean silhouette coefficient over all samples (higher is better).
# NOTE(review): this compares every pair of samples, so it can be slow on the full
# corpus; silhouette_score accepts sample_size/random_state for a cheaper estimate.
silhouette_avg = silhouette_score(X, labels)
print(f"Silhouette Score: {silhouette_avg}")

# Davies-Bouldin Index: lower values indicate better-separated clusters.
davies_bouldin = davies_bouldin_score(X, labels)
print(f"Davies-Bouldin Index: {davies_bouldin}")

# Calinski-Harabasz Index: higher values indicate better-defined clusters.
calinski_harabasz = calinski_harabasz_score(X, labels)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

**Dynamically Adjusting Hyperparameters**

In [None]:
# Sweep k over 2..9 and remember the k with the highest silhouette score.
best_k, best_score = 2, -1  # start below any score the sweep is expected to produce
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    if not score > best_score:
        continue  # not an improvement; ties keep the earlier (smaller) k
    best_score, best_k = score, k

print(f"Best K: {best_k} with Silhouette Score: {best_score}")