In [17]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np

from sklearn.decomposition import FactorAnalysis # library used for AFC

from sklearn.cluster import KMeans # library used for clustering

'''
Variables: 
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) 
red_emb : reduced embeddings matrix using dimensionality reduction
k : number of clusters
labels : documents labels
pred : list of predicted cluster labels, one per document

''';

In [18]:
def dim_red(mat, p):
    '''
    Reduce the number of columns of a matrix with scikit-learn's
    FactorAnalysis (the method this notebook refers to as "AFC").

    Input:
    -----
        mat : NxM list or array-like; anything that is not already a
              NumPy array is converted with np.array first
        p : number of dimensions (factors) to keep
    Output:
    ------
        red_mat : NxP array, the rows of mat expressed in the fitted
                  factor space
    '''
    # Leave ndarrays untouched; convert everything else (e.g. nested lists)
    data = mat
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    # Fit the factor model on the data and project it in a single step
    return FactorAnalysis(n_components=p).fit_transform(data)


In [19]:
def clust(mat, k, random_state=42, n_init=10):
    '''
    Perform clustering using KMeans

    Input:
    -----
        mat : NxM list or array-like (N samples, M features)
        k : number of clusters
        random_state : seed forwarded to KMeans so that repeated runs give
                       the same partition (default 42); pass None to let
                       every call use a fresh random initialisation
        n_init : number of KMeans restarts forwarded to KMeans (default 10).
                 It is set explicitly because the library default for this
                 argument is not the same across scikit-learn releases.
    Output:
    ------
        pred : list of N predicted cluster labels
    Raises:
    ------
        ValueError : if mat is empty or is not 2-dimensional
    '''
    # Convert the input list to a NumPy array if it's not already an array
    mat_np = mat if isinstance(mat, np.ndarray) else np.array(mat)

    # Fail early with a clear message instead of an error raised deep inside
    # scikit-learn's input validation
    if mat_np.ndim != 2 or mat_np.size == 0:
        raise ValueError(
            f'clust expects a non-empty 2D matrix (samples x features), '
            f'got an array of shape {mat_np.shape}'
        )

    # Seeded and with an explicit number of restarts, so the clustering
    # (and the scores computed from it) can be reproduced
    kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=random_state)

    # Fit KMeans to the data and predict cluster labels
    pred = kmeans.fit_predict(mat_np)

    return pred.tolist()


In [20]:
# Experiment settings, named so they can be tuned in one place
N_DOCS = 2000       # number of documents taken from the start of the test split
N_COMPONENTS = 20   # number of dimensions kept by dim_red

# import data
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:N_DOCS]
labels = ng20.target[:N_DOCS]
k = len(set(labels))  # one cluster per distinct label present in the sample

# embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# perform dimensionality reduction
red_emb = dim_red(embeddings, N_COMPONENTS)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results
# scikit-learn documents these metrics as (labels_true, labels_pred); both are
# symmetric, so passing the ground truth first only makes the call match the
# documented order and leaves the scores unchanged
nmi_score = normalized_mutual_info_score(labels, pred)
ari_score = adjusted_rand_score(labels, pred)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')


Downloading (…)".gitattributes";:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)ooling/config.json";:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)"README.md";:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)_transformers.json";:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)e_bert_config.json";:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)al_tokens_map.json";:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)"vocab.txt";:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)"modules.json";:   0%|          | 0.00/229 [00:00<?, ?B/s]



NMI: 0.40 
ARI: 0.23
