In [None]:

  '''LIBRARY IMPORTING'''

  ## DATA MANAGEMENT
import pandas as pd
import numpy as np
import collections
from collections import defaultdict

## DATA VISUALIZATION
import matplotlib.pyplot as plt

## EVALUATION
from time import time
from sklearn.metrics.pairwise import pairwise_distances_argmin

## NATURAL LANGUAGE PROCESSING
#import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import make_blobs
from sklearn import metrics

## Specific to K-Means:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

## Specific to Latent Semantic Analysis (LSA):
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

## Specific to Affinity Propagation:
from sklearn.cluster import AffinityPropagation

In [None]:
""" DATASET """

## Newsgroup categories to load. Feel free to swap in any of the other
## categories outlined in the dataset documentation!
categories = [
    'rec.autos',
    'sci.electronics',
    'talk.religion.misc',
    'talk.politics.misc',
]

## The documentation page for the dataset suggests removing headers, footers,
## and quotes to strip any metadata from the posts.
## `fetch_20newsgroups` lives in `sklearn.datasets` and is imported in the
## import cell at the top of the notebook.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=10,
)

## Ground-truth labels plus the number of distinct categories (true_k), which
## is passed to K-Means as the number of clusters.
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

print(f"{len(dataset.data)} documents - {true_k} categories")

In [None]:
""" INSTANTIATING A K-MEANS CLUSTER """
## First, build the TF-IDF vectorizer and turn the raw documents into the
## TF-IDF matrix that every later cell clusters.
## (Top-level statements must start at column 0; an indented statement here
## raises IndentationError.)
vectorizer = TfidfVectorizer(max_df=0.5, min_df=5, stop_words="english")

X_tfidf = vectorizer.fit_transform(dataset.data)

## Initializing storage containers:
##   d               - list-valued defaultdict for accumulating values by key
##   evaluations     - one dict of mean scores per evaluated model
##   evaluations_std - one dict of score standard deviations per evaluated model
d = collections.defaultdict(list)
evaluations = []
evaluations_std = []

In [None]:
""" CREATING A FUNCTION """

## Defining the function which will both fit and evaluate at the same time.
## The parameters for this function are a kmeans object, the feature matrix X,
## a name for the evaluation, and the number of training runs.
def fit_and_evaluate(km, X, name=None, n_runs=5):
  """Fit `km` on `X` once per seed and record clustering quality metrics.

  The estimator is re-fit `n_runs` times with `random_state` set to
  0 .. n_runs - 1. Every fit is timed and scored against the module-level
  ground-truth `labels`. The mean and standard deviation of the training
  time and of each metric are printed, then appended as dicts to the
  module-level lists `evaluations` and `evaluations_std`.

  Parameters
  ----------
  km : estimator
      Clustering estimator exposing `set_params(random_state=...)`,
      `fit(X)` and a `labels_` attribute once fitted (e.g. `KMeans`).
      It is mutated: after the call it holds the fit for the last seed.
  X : feature matrix
      Passed unchanged to `km.fit` and to the silhouette score.
  name : str, optional
      Value stored under the "estimator" key. Defaults to the class name
      of `km`.
  n_runs : int, default=5
      Number of seeds to fit and score.

  Returns
  -------
  None
      Results are reported by printing and through the module-level lists.
  """
  ## Storing the run name (falls back to the estimator's class name)
  name = km.__class__.__name__ if name is None else name

  ## Fresh containers on every call, so one estimator's scores never leak
  ## into the statistics reported for the next one.
  train_times = []
  scores = defaultdict(list)

  ## Iterating through each random state to test different runs of the model
  for seed in range(n_runs):
    km.set_params(random_state=seed)
    t0 = time()
    km.fit(X)
    train_times.append(time() - t0)

    ## Assessing the model for each run on the performance metrics.
    scores['Homogeneity'].append(metrics.homogeneity_score(labels, km.labels_))
    scores['Completeness'].append(metrics.completeness_score(labels, km.labels_))
    scores['V-measure'].append(metrics.v_measure_score(labels, km.labels_))
    scores["Adjusted Rand-Index"].append(metrics.adjusted_rand_score(labels, km.labels_))
    ## The silhouette is computed on a 2000-sample subset; seeding the
    ## subsampling keeps the reported value reproducible between runs.
    scores['Silhouette Coefficient'].append(
        metrics.silhouette_score(X, km.labels_, sample_size=2000, random_state=seed)
    )

  ## Converting the stored training times to an array for mean() / std().
  train_times = np.asarray(train_times)

  ## Printing the training time statistics over all runs
  print (f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s")

  ## One dict for the means and one for the standard deviations; both are
  ## keyed by "estimator", "train_time" and then each metric name.
  evaluation = {
      "estimator": name,
      "train_time": train_times.mean(),
  }
  evaluation_std = {
      "estimator": name,
      "train_time": train_times.std(),
  }

  ## Reducing each metric's per-run values to a mean and a standard deviation.
  for score_name, score_values in scores.items():
    mean_score, std_score = np.mean(score_values), np.std(score_values)
    print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
    evaluation[score_name] = mean_score
    evaluation_std[score_name] = std_score

  evaluations.append(evaluation)
  evaluations_std.append(evaluation_std)

In [None]:
""" CLUSTERING THE TEXT """

## Defining a for loop to iterate five times and fit a new model each time,
## so the cluster sizes obtained from different seeds can be compared.
for seed in range(5):

  ## Establishing a kmeans object with one cluster per unique label identified
  ## earlier and a max iteration of fifty; n_init=1 runs a single
  ## initialisation, with the random state set to the seed.
  kmeans_obj = KMeans(n_clusters=true_k, max_iter=50, n_init=1, random_state=seed)

  ## Fitting the KMeans object to the TF-IDF vectorized data
  kmeans = kmeans_obj.fit(X_tfidf)

  ## Assigning the unique cluster ids and how many documents fell into each.
  cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

  ## Printing out the number of documents assigned to each cluster for this seed.
  print(f"Number of elements assigned to each cluster: {cluster_sizes}")

## After the loop: print the true category sizes once for comparison.
print()
print(
    "The true number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

""" NORMALIZING USING LATENT SEMANTIC ANALYSIS """
## LSA = truncated SVD followed by a Normalizer step that rescales each
## document vector. The pipeline chains both steps so they are fit and applied
## with a single call. (TruncatedSVD, make_pipeline and Normalizer are imported
## in the import cell at the top of the notebook.)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))

## Fitting the LSA model
t0 = time()
X_lsa = lsa.fit_transform(X_tfidf)
## lsa[0] is the TruncatedSVD step of the pipeline.
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"LSA done in {time() - t0:.3f} s")
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

## Storing the cluster centers and the labels to use later for graphing.
## The centers have to live in the same 100-component space as X_lsa, so a
## K-Means model is fit on X_lsa here; the centers of the earlier `kmeans`
## were fit on X_tfidf and have a different number of columns.
## NOTE(review): max_iter / n_init mirror the LSA K-Means configured in the
## evaluation cell, and random_state=4 is the last seed fit_and_evaluate uses
## with its default n_runs=5, so the cluster ids here are meant to agree with
## the `kmeans` whose centroids are inspected later - confirm if either changes.
lsa_kmeans = KMeans(n_clusters=true_k, max_iter=100, n_init=1, random_state=4).fit(X_lsa)
lsa_cluster_centers = lsa_kmeans.cluster_centers_
lsa_kmeans_labels = pairwise_distances_argmin(X_lsa, lsa_cluster_centers)


In [None]:
""" EVALUATING THE CLUSTERS """
 ## Non Normalized
## `kmeans` is still the model left over from the clustering loop
## (max_iter=50, n_init=1); fit_and_evaluate re-fits it on the TF-IDF matrix.
## The "\n" in the name splits the label over two lines, matching the label
## used for the LSA run below.
fit_and_evaluate(kmeans, X_tfidf, name="KMeans\non tf-idf vectors")

## LSA Normalized: a new K-Means model evaluated on the LSA-reduced matrix.
## After this call `kmeans` holds the fit for the last seed on X_lsa.
kmeans = KMeans(
    n_clusters=true_k,
    max_iter=100,
    n_init=1,
)

fit_and_evaluate(kmeans, X_lsa, name='KMeans\nwith LSA on tf-idf vectors')



In [None]:
""" VIEWING CLUSTER CONTENT """
from tempfile import TemporaryDirectory

## Vocabulary terms, indexed by TF-IDF column.
terms = vectorizer.get_feature_names_out()

## Map the K-Means centroids back through the SVD step (lsa[0]) into term
## space, then sort each centroid's term indices from largest weight to smallest.
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]

## cluster_top5[i] holds the leading terms of cluster i. Despite the name,
## seven terms per cluster are kept (see the `:7` slice below).
cluster_top5 = []
for i in range(true_k):
  top_terms = [terms[ind] for ind in order_centroids[i, :7]]
  cluster_top5.append(top_terms)

  ## One output line per cluster: the header followed by each term and a space.
  print(f"Cluster {i}: " + "".join(f"{term} " for term in top_terms))

In [None]:
# @title
fig, ax = plt.subplots(figsize=(10, 10))
colors = ["#D52941", "#4D9DE0", "#FFB20F", "#464D77"]

## Take the centers from `kmeans`, the same model cluster_top5 was derived
## from, so that for every k the colour, the points, the center marker and the
## term label all describe the same cluster. Each document is assigned to its
## nearest center.
plot_centers = kmeans.cluster_centers_
plot_labels = pairwise_distances_argmin(X_lsa, plot_centers)
n_plot_clusters = plot_centers.shape[0]

## First pass: scatter the members of every cluster.
for k in range(n_plot_clusters):

  ## colours are reused cyclically if there are more clusters than colours
  col = colors[k % len(colors)]

  ## identify current cluster members: True where the label equals k
  curr_members = plot_labels == k

  ## columns 0 and 1 of X_lsa are used as the x and y values;
  ## markerfacecolor sets the cluster's color, the "o" marker and the white
  ## edge ('w') are stylistic.
  ax.plot(
      X_lsa[curr_members, 0], X_lsa[curr_members, 1], "o",
      markerfacecolor=col,
      markeredgecolor='w',
      markersize=8,
  )

## Second pass: plot the cluster centers and their text last, after all points.
for k in range(n_plot_clusters):

  col = colors[k % len(colors)]
  cluster_center = plot_centers[k]

  ## the cluster's top terms as one space-separated label
  word_str = " ".join(cluster_top5[k])

  ## label placed slightly to the right of the center marker
  ax.text(cluster_center[0] + .025, cluster_center[1], word_str, fontsize=12, color="k", weight='bold')

  ax.plot(
      cluster_center[0], cluster_center[1], "X",
      markerfacecolor=col,
      markeredgecolor="k",
      markersize=25,
  )

## setting plot metadata
ax.set_title("News Group Clustering")
ax.set_xticks(())
ax.set_yticks(());