In [5]:
!pip install -U sentence-transformers
!pip install prince

Collecting prince
  Downloading prince-0.13.0-py3-none-any.whl (415 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/415.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/415.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.6/415.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: prince
Successfully installed prince-0.13.0


In [12]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np
import prince
import pandas as pd
from sklearn.cluster import KMeans

In [26]:

def clust(mat, k, random_state=42, n_init=10):
    '''
    Perform clustering using k-means

    Input:
    -----
        mat : input list or array, one row per sample
        k : number of clusters
        random_state : seed passed to KMeans (default 42)
        n_init : number of k-means initialisations passed to KMeans (default 10)
    Output:
    ------
        pred : predicted cluster labels (the fitted estimator's `labels_`),
               one entry per row of mat
    '''

    # n_init is passed explicitly: scikit-learn changed its default from 10 to
    # 'auto', so relying on the default gives version-dependent clusterings
    # (and a FutureWarning on the releases in between).
    kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=n_init).fit(mat)

    # Get the predicted labels
    pred = kmeans.labels_

    return pred


In [23]:
def dim_red(method, mat, p):
    '''
    Perform dimensionality reduction

    Input:
    -----
        method : "ACP" (uses prince.PCA) or "AFC" (uses prince.CA)
        mat : NxM list or array
        p : number of dimensions to keep
    Output:
    ------
        red_mat : NxP reduced representation such that p<<m, as returned by
                  prince (PCA.transform for ACP, CA.row_coordinates for AFC)
    Raises:
    ------
        ValueError : if method is neither "ACP" nor "AFC"
    '''

    # Reject unknown methods before doing any work.
    if method not in ("ACP", "AFC"):
        raise ValueError("ERROR : This method of dimentionality reduction is not accepted: {}".format(method))

    # Build the frame first so plain Python lists are handled like arrays.
    df = pd.DataFrame(mat)

    if method == "ACP":
        pca = prince.PCA(n_components=p)
        pca = pca.fit(df)
        return pca.transform(df)

    # AFC: check if there exist negative values in the data
    values = df.to_numpy()
    has_negative_values = np.any(values < 0)

    if has_negative_values:
        print("The matrix contains negative values, applying shift")
        # Shift data by the global minimum to make it non-negative; the small
        # offset keeps the smallest entry strictly positive.
        min_value = values.min()
        df = df - min_value + 1e-10

    ca = prince.CA(n_components=p)
    ca = ca.fit(df)
    return ca.row_coordinates(df)



In [28]:
# Quick check of clust() with a fixed number of clusters (k=3).
# NOTE(review): `red_emb` is not assigned in this cell; it has to exist already
# in the kernel (e.g. from a dim_red(...) call such as the one below) - confirm
# the intended execution order.
# red_emb = dim_red("AFC", embeddings, 20)
print(red_emb.shape)
predictions = clust(red_emb, k=3)
print("Predicted labels:", predictions)


(2000, 20)




Predicted labels: [1 1 0 ... 1 1 1]


In [31]:
# Load the 20 Newsgroups test split and keep only the first N_DOCS documents
N_DOCS = 2000
ng20 = fetch_20newsgroups(subset='test')
corpus, labels = ng20.data[:N_DOCS], ng20.target[:N_DOCS]

# Number of clusters = number of distinct ground-truth labels in the sample
k = len(np.unique(labels))

# Encode every document with a sentence-transformer model
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(corpus)

# test for AFC

In [32]:
# perform dimensionality reduction with AFC (prince.CA), keeping 20 dimensions
red_emb = dim_red("AFC", embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results against the ground-truth labels; arguments are
# given in scikit-learn's documented (labels_true, labels_pred) order
nmi_score = normalized_mutual_info_score(labels, pred)
ari_score = adjusted_rand_score(labels, pred)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')

The matrix contains negative values, applying shift




NMI: 0.41 
ARI: 0.23


# test for ACP

In [33]:
# perform dimensionality reduction with ACP (prince.PCA), keeping 20 dimensions
red_emb = dim_red("ACP", embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results against the ground-truth labels; arguments are
# given in scikit-learn's documented (labels_true, labels_pred) order
nmi_score = normalized_mutual_info_score(labels, pred)
ari_score = adjusted_rand_score(labels, pred)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')



NMI: 0.41 
ARI: 0.24
