In [1]:
from sklearn.datasets import fetch_20newsgroups
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_distances,euclidean_distances
import numpy as np
import pandas as pd



In [2]:

# Load a subset of the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', categories=None, remove=('headers', 'footers', 'quotes'))
print(f"Number of documents: {len(newsgroups.data)}")
print(f"Categories: {newsgroups.target_names}")

# Example text
print("\nSample Document:")
print(newsgroups.data[0])

Number of documents: 18846
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Sample Document:


I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the

In [3]:
nlp = spacy.load("en_core_web_lg")
stop_words = nlp.Defaults.stop_words


In [4]:
cleaned = ["".join([j for j in i if (j==' ')|(j.isalpha())|(j==".")]) for i in newsgroups['data']]

In [5]:

vectorizer = CountVectorizer(max_features=1000,stop_words=list(stop_words),ngram_range=(1,1))
tfidf_matrix = vectorizer.fit_transform(cleaned)





In [12]:
k=8
sampled_indices = np.random.choice(tfidf_matrix.shape[0],size=k,replace=False)

centroids = tfidf_matrix[sampled_indices]

In [13]:
for e in range(121):
    dists = cosine_distances(
        tfidf_matrix,
        centroids
    )
    argmins=dists.argmin(axis=1)

    new_centroids = []
    for c in range(k):
        row = tfidf_matrix[argmins==c].mean(axis=0)
        new_centroids.append(row)
    
    new_centroids = np.concat(new_centroids)

    print(e,(new_centroids-centroids).sum())

    if abs((new_centroids-centroids).sum())==0:
        centroids = np.asarray(new_centroids)
        break

    centroids = np.asarray(new_centroids)

dists = cosine_distances(tfidf_matrix,centroids)
argmins = dists.argmin(axis=1)
centroid_dist = dists[:,list(argmins)][:,0]



0 3.606957184009916
1 -6.442308616667087
2 6.444112103951863
3 5.802880009308074
4 4.990114371138757
5 2.780518333525627
6 1.3680870768918287
7 1.9299111886820177
8 1.3426659545259736
9 1.153210224306562
10 0.8452294074683814
11 0.4586010309914681
12 0.2500031050532057
13 -0.17634509016589217
14 0.6724334651127248
15 0.5422929300433812
16 -0.056519899591464584
17 0.021855075220905046
18 0.07221389966179258
19 0.07193794351379527
20 -0.10797819554876542
21 -0.10504814717929234
22 0.11568760479494622
23 0.4509725667361837
24 0.49473300145871585
25 0.38661993192583527
26 0.4340770103380521
27 0.08443093018058012
28 0.47040465147270366
29 0.8826457184801723
30 1.0992180184193252
31 1.4025506701121864
32 0.4314326197194681
33 0.5247212261832831
34 0.5365700422877863
35 -0.016547518484016324
36 0.39683741156135843
37 0.09720140621575918
38 0.15976828187441586
39 0.011615200911132863
40 0.18240734585901897
41 0.03581932560067693
42 -0.04707584977744986
43 0.010730311901287606
44 -0.0223925142

In [14]:
from sklearn.base import BaseEstimator

class KMeansExtended(BaseEstimator):

    def __init__(
        self,
        n_clusters=5,
        metric='cosine',
        epochs=100
    ):
        
        self.n_clusters=n_clusters
        self.epochs=epochs
        self.metric = metric

        if metric=='cosine':
            self.metric_fn = cosine_distances
        else:
            self.metric_fn = euclidean_distances


    def fit(self,X,y=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        sampled_indices = np.random.choice(X.shape[0],size=self.n_clusters,replace=False)

        self.cluster_centers_ = X[sampled_indices]

        for e in range(self.epochs):

            dists = self.metric_fn(
                X,
                self.cluster_centers_
            )
            argmins=dists.argmin(axis=1)

            new_centroids = []
            for c in range(k):
                row = tfidf_matrix[argmins==c].mean(axis=0)
                new_centroids.append(row)
            
            new_centroids = np.concat(new_centroids)

            print(e,(new_centroids-self.cluster_centers_).sum())

            if abs((new_centroids-self.cluster_centers_).sum())==0:
                self.cluster_centers_ = np.asarray(new_centroids)
                break

            self.cluster_centers_ = np.asarray(new_centroids)
  
        dists = self.metric_fn(
            X,
            self.cluster_centers_
        )
        self.labels_ = dists.argmin(axis=1)

        return self

    def predict(self,X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """

        dists = self.metric_fn(
            X,
            self.cluster_centers_
        )
        labels = dists.argmin(axis=1)

        return labels






In [15]:
kmeans = KMeansExtended(n_clusters=8,metric='cosine',epochs=100)


In [18]:
kmeans.fit(tfidf_matrix)

0 115.23725113560754
1 -6.2972880520556735
2 5.874060186768173
3 3.569672520685888
4 6.408376571615298
5 3.180317365859313
6 4.385665475904328
7 0.11067353179951245
8 0.6211953033640942
9 -0.24194115728031512
10 -0.39047208155088053
11 -0.3843210711856133
12 -0.20113130565395287
13 -0.05072604616951404
14 -0.29161084819883115
15 0.13157637764956975
16 0.2181273430587235
17 0.05081539544042091
18 0.2630487213432062
19 -0.5197934937477174
20 0.24823308626764187
21 -0.25982471684511327
22 -0.38196606125754917
23 -0.45480727479642186
24 0.04604848779459937
25 -0.036469572436946096
26 0.07132225274046305
27 0.1353880573225713
28 0.0010702587990525283
29 0.021662625558124232
30 -0.11379600075426985
31 -0.08828730732204088
32 -0.08501813115008339
33 0.003342635720884446
34 -0.20781074570519117
35 0.06292784346418638
36 0.03989605491397333
37 0.040746236175295325
38 -0.2731702397583625
39 -0.4726553992466029
40 0.022326435111690746
41 0.09827398255583489
42 0.35675391741484364
43 0.54802811362