In [1]:
! pip install prince
! pip install -U sentence-transformers

Collecting prince
  Downloading prince-0.13.0-py3-none-any.whl (415 kB)
     ---------------------------------------- 0.0/415.6 kB ? eta -:--:--
     ----------- -------------------------- 122.9/415.6 kB 7.5 MB/s eta 0:00:01
     -------------------------------------  409.6/415.6 kB 5.1 MB/s eta 0:00:01
     -------------------------------------- 415.6/415.6 kB 4.4 MB/s eta 0:00:00
Collecting altair<6.0.0,>=4.2.2
  Downloading altair-5.2.0-py3-none-any.whl (996 kB)
     ---------------------------------------- 0.0/996.9 kB ? eta -:--:--
     ------------- -------------------------- 337.9/996.9 kB ? eta -:--:--
     --------------------- ---------------- 553.0/996.9 kB 5.8 MB/s eta 0:00:01
     -------------------------------- ----- 849.9/996.9 kB 6.0 MB/s eta 0:00:01
     -------------------------------------- 996.9/996.9 kB 4.5 MB/s eta 0:00:00
Collecting toolz
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
     ---------------------------------------- 0.0/55.8 kB ? eta -:--:--
 


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 0.0/86.0 kB ? eta -:--:--
     --------------------------------- ------ 71.7/86.0 kB 3.8 MB/s eta 0:00:01
     -------------------------------------- 86.0/86.0 kB 805.5 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=2ec4e7163b6e2d3fc9111d1a9b51a36116e87aa785b18a754735429b64c39acd
  Stored in directory: c:\users\akli\appdata\local\pip\cache\wheels\62\f2\10\1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collect


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from prince import PCA as PrincePCA
from sklearn.cluster import KMeans


'''
Variables: 
---------

corpus : list of documents
embeddings : document embeddings of size NxM (N : number of documents, M : embedding dimension)
red_emb : reduced embeddings matrix obtained by dimensionality reduction
k : number of clusters
labels : document labels
pred : list of predicted cluster assignments

''';

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def dim_red(mat, p, random_state=42):
    '''
    Perform dimensionality reduction with PCA (prince implementation)

    Input:
    -----
        mat : NxM array-like (list of rows or 2-D numpy array)
        p : number of dimensions to keep
        random_state : seed forwarded to the PCA estimator so that repeated
                       runs give the same projection (default 42)
    Output:
    ------
        red_mat : NxP numpy array such that p<<m

    Raises:
    ------
        ValueError : if mat is not a non-empty 2-D matrix
    '''
    mat = np.asarray(mat)
    if mat.ndim != 2 or mat.size == 0:
        raise ValueError(
            f"mat must be a non-empty 2-D matrix, got shape {mat.shape}"
        )

    # Wrap the matrix in a DataFrame with one named column per embedding
    # dimension before handing it to the PCA estimator.
    columns = [f"feature_{i}" for i in range(mat.shape[1])]
    df_embeddings = pd.DataFrame(mat, columns=columns)

    pca = PrincePCA(n_components=p, random_state=random_state)
    pca = pca.fit(df_embeddings)
    red_mat = pca.transform(df_embeddings).to_numpy()
    return red_mat

In [5]:
def clust(mat, k):
    '''
    Perform clustering with k-means

    Input:
    -----
        mat : input matrix (one row per sample)
        k : number of cluster
    Output:
    ------
        pred : numpy array holding the predicted cluster label of each row
    '''
    # n_init is pinned explicitly because its default value differs between
    # scikit-learn releases; together with the fixed seed this keeps the
    # clustering repeatable across environments.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    pred = kmeans.fit_predict(mat)
    return pred

In [6]:
# Load the test split of 20 Newsgroups and keep the first 2000 documents
# together with their labels.
ng20 = fetch_20newsgroups(subset='test')
corpus, labels = ng20.data[:2000], ng20.target[:2000]

# Number of clusters = number of distinct labels present in the sample.
k = len(set(labels))

# Encode every document with a pre-trained sentence-transformer model.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

Downloading (…)".gitattributes";: 100%|██████████| 690/690 [00:00<00:00, 677kB/s]
Downloading (…)ooling/config.json";: 100%|██████████| 190/190 [00:00<00:00, 38.0kB/s]
Downloading (…)"README.md";: 100%|██████████| 3.69k/3.69k [00:00<00:00, 925kB/s]
Downloading (…)"config.json";: 100%|██████████| 629/629 [00:00<?, ?B/s] 
Downloading (…)_transformers.json";: 100%|██████████| 122/122 [00:00<00:00, 9.07kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 90.9M/90.9M [00:08<00:00, 11.1MB/s]
Downloading (…)e_bert_config.json";: 100%|██████████| 53.0/53.0 [00:00<00:00, 4.77kB/s]
Downloading (…)al_tokens_map.json";: 100%|██████████| 112/112 [00:00<00:00, 7.12kB/s]
Downloading (…)"tokenizer.json";: 100%|██████████| 466k/466k [00:00<00:00, 7.50MB/s]
Downloading (…)enizer_config.json";: 100%|██████████| 314/314 [00:00<00:00, 32.3kB/s]
Downloading (…)"vocab.txt";: 100%|██████████| 232k/232k [00:00<00:00, 6.04MB/s]
Downloading (…)"modules.json";: 100%|██████████| 229/229 [00:00<?, ?B/s] 


In [7]:
# perform dimensionality reduction (keep 20 components)
red_emb = dim_red(embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results against the reference labels;
# the scikit-learn metrics take (labels_true, labels_pred) in that order
nmi_score = normalized_mutual_info_score(labels, pred)
ari_score = adjusted_rand_score(labels, pred)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')



NMI: 0.41 
ARI: 0.23
