In [None]:
from sklearn.metrics                 import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import HashingVectorizer , TfidfTransformer
from sklearn.pipeline                import make_pipeline
from sklearn.decomposition           import TruncatedSVD
from sklearn.preprocessing           import Normalizer
from sklearn.cluster                 import KMeans

# LSA text-embedding pipeline; the steps run in the order listed:
#   1. hash tokens (English stop words removed) into 1,000 feature columns,
#   2. re-weight the hashed counts with TF-IDF,
#   3. reduce to 50 components with a seeded truncated SVD,
#   4. normalise each resulting row in place (copy=False).
_lsa_steps = [
    HashingVectorizer(n_features=1_000, stop_words="english"),
    TfidfTransformer(),
    TruncatedSVD(random_state=0, n_components=50),
    Normalizer(copy=False),
]
lsa_vectorizer = make_pipeline(*_lsa_steps)

from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy  as np
import requests
import json
import gzip
import os

class NpEncoder(json.JSONEncoder):
    """JSON encoder that can serialise NumPy scalars and arrays.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``. NumPy
    values are converted to their built-in Python equivalents; anything else
    is delegated to ``json.JSONEncoder``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Conversions performed:
          * ``np.bool_``    -> ``bool``
          * ``np.integer``  -> ``int``
          * ``np.floating`` -> ``float``
          * ``np.ndarray``  -> (nested) ``list`` via ``tolist()``

        Raises:
            TypeError: raised by the base class for any other unsupported type.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it NumPy booleans would fall through to the TypeError below.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [None]:
def kmeans_tune(X_lsa, n_clusters_max=15, trials=5) -> list[float]:
    """Score candidate cluster counts by their mean silhouette score.

    For every ``k`` in ``range(2, n_clusters_max)`` (the upper bound is
    exclusive) K-means is fitted ``trials`` times, once per seed in
    ``range(trials)``, and the silhouette scores of those fits are averaged.

    Parameters
    ----------
    X_lsa :
        Feature matrix handed to ``KMeans.fit`` and ``silhouette_score``.
    n_clusters_max : int
        Exclusive upper bound on the number of clusters tried.
    trials : int
        Number of differently seeded fits averaged per cluster count.

    Returns
    -------
    list[float]
        One mean score per cluster count; element ``i`` belongs to
        ``k = i + 2``. Empty when ``n_clusters_max <= 2``.
    """
    mean_scores = []
    for k in range(2, n_clusters_max):
        # One silhouette score per seed for this cluster count.
        seed_scores = [
            silhouette_score(
                X_lsa,
                KMeans(
                    n_clusters=k,
                    n_init=10,
                    max_iter=100,
                    random_state=seed,
                ).fit(X_lsa).labels_,
            )
            for seed in range(trials)
        ]
        mean_scores.append(float(sum(seed_scores) / trials))
    return mean_scores

In [None]:
def query(url, data, mode):
    match mode:
        case 'POST':
            item = requests.post(url,json=data)
            item = json.loads(item.content.decode())
        case 'GET':
            item = requests.get(url)
            item = json.loads(item.content.decode())
    return item

In [None]:
url = "http://127.0.0.1:5000/get_descs"

# Fetch the records from the local service and split them column-wise.
# Presumably each record is an (id, description) pair -- verify against the
# /get_descs endpoint.
data = query(url, None, 'GET')
IDS, desc = zip(*data)
IDS = np.array(IDS)
documents = np.array(desc)

In [None]:
# Embed the documents with the LSA pipeline and score each candidate cluster count.
X_lsa        = lsa_vectorizer.fit_transform(documents)
results      = kmeans_tune(X_lsa, trials=5)
# Silhouette scores are "higher is better", so the best cluster count is the
# arg-MAX of the scores (argmin would pick the worst clustering).
# results[0] belongs to k = 2, hence the +2 offset.
n_clusters   = int(np.argmax(results)) + 2

kmeans = KMeans(
        max_iter     = 100,
        n_clusters   = n_clusters,
        n_init       = 5,
        random_state = 42,
    ).fit(X_lsa)

# Cluster label of every document as a plain Python int, in document order.
labels = [int(label) for label in kmeans.labels_]

# Group each document ID with its respective label: (label, id) pairs.
clusters  = list(zip(labels, IDS))
# Use search_id to find which label an id belongs to (int id -> int label).
search_id = {int(doc_id): label for label, doc_id in clusters}

# 'GROUP <label>' -> ids assigned to that cluster. Every label gets a key,
# even if no document ends up in it, and ids keep their document order.
groups = {f'GROUP {n}': [] for n in range(n_clusters)}
for label, doc_id in clusters:
    groups[f'GROUP {label}'].append(doc_id)

In [None]:
# Create the output directory if it is missing so this cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
os.makedirs('Data', exist_ok=True)

# Cluster name -> list of document ids.
with open(os.path.join('Data','Groupings.json'),mode = 'w') as f:
    json.dump(groups   ,f, cls = NpEncoder,indent=3)

# Document id -> cluster label.
with open(os.path.join('Data','Search_ID.json'),mode = 'w') as f:
    json.dump(search_id,f, cls = NpEncoder,indent=3)

In [None]:
# Read the saved groupings back from disk; this replaces the in-memory
# `groups` with the JSON-decoded version of the file.
with open(os.path.join('Data', 'Groupings.json'), mode='r') as f:
    groups = json.loads(f.read())

In [None]:
# results[i] is plotted at x = i + 2; build the x values once and reuse them
# for both the points and the tick marks.
k_values = [i + 2 for i in range(len(results))]

plt.scatter(x=k_values, y=results)
plt.xticks(k_values)
# Label the figure so it can be read on its own when skimming the notebook.
plt.xlabel("Number of clusters (k)")
plt.ylabel("Mean silhouette score")
plt.title("K-means silhouette score per cluster count")
plt.show()

In [None]:
from sklearn.manifold import TSNE

# Project the LSA vectors down to two dimensions with a seeded t-SNE and
# colour every point by its K-means cluster label.
enc = TSNE(n_components=2, random_state=42)
cord = enc.fit_transform(X_lsa)
tsne_x = cord[:, 0]
tsne_y = cord[:, 1]
plt.scatter(tsne_x, tsne_y, c=kmeans.labels_)
plt.show()

In [None]:
# def best_k(results: list, cutoff = 0.98):
#     """
#     Takes an array of results and finds the
#     index where the running summation crosses the threshold.
#     """
#     difference = []
#     for idx in range(1,len(results)):
#         diff = results[idx] - results[idx - 1]
#         difference.append(diff)

#     total = 0
#     diff_total = sum(diff)
#     values = [d/diff_total for d in diff]
#     for idx, val in enumerate(values):
#         total += val
#         if total >= cutoff:
#             return (idx + 2)