In [1]:
import os
import json
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

DATA_PATH = "../data/messages.csv"
VEC_PATH = "../artifacts/log_reg/vec.joblib"
OUT_DIR = "../artifacts/clustering"
os.makedirs(OUT_DIR, exist_ok = True)

In [2]:
def top_terms_per_cluster(kmeans, vec, top_n = 12):
    terms = vec.get_feature_names_out()
    order = kmeans.cluster_centers_.argsort(axis = 1)[:, ::-1]
    clusters = {}
    for i in range(kmeans.n_clusters):
        clusters[str(i)] = [terms[idx]  for idx in order[i, :top_n]]
    return clusters

In [3]:

def main():
    df = pd.read_csv(DATA_PATH)
    texts = df['text'].astype(str).tolist()

    # using the same vectorizer used in logistic regression
    cluster_vec = TfidfVectorizer(
        lowercase = True,
        stop_words = 'english',
        ngram_range = (1,2),
        min_df = 2,
        max_features = 20000
    )

    X = cluster_vec.fit_transform(texts)

    k = 8
    kmeans = KMeans(n_clusters = k, random_state = 42, n_init = "auto")
    labels = kmeans.fit_predict(X)

    df['cluster_tfidf'] = labels
    df.to_csv(os.path.join(OUT_DIR, 'messages_with_clusters.csv'), index = False)

    clusters = top_terms_per_cluster(kmeans, cluster_vec, top_n = 12)
    with open(os.path.join(OUT_DIR, 'cluster_terms.json'), 'w') as f:
        json.dump(clusters, f, indent = 2)

    dump(cluster_vec, os.path.join(OUT_DIR, 'cluster_vectorizer.joblib'))
    dump(kmeans, os.path.join(OUT_DIR, 'kmeans_tfidf.joblib'))

    print("Saved:")
    print(" -", os.path.join(OUT_DIR, "kmeans_tfidf.joblib"))
    print(" -", os.path.join(OUT_DIR, "cluster_terms.json"))
    print(" -", os.path.join(OUT_DIR, "messages_with_clusters.csv"))

In [4]:
if __name__ == "__main__":
    main()

Saved:
 - ../artifacts/clustering\kmeans_tfidf.joblib
 - ../artifacts/clustering\cluster_terms.json
 - ../artifacts/clustering\messages_with_clusters.csv
