In [21]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np

'''
Variables: 
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) 
red_emd : reduced embeddings matrix using dimentionality reduction
k : number of clusters
labels : documents labels
pred : list of clustering predicted clusters 

''';

In [22]:
# Charger les données NG20
ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
corpus = ng20.data[:2000]  # Utiliser seulement 2000 documents pour des raisons de démonstration
labels = ng20.target[:2000]

In [23]:
print(np.unique(labels))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [24]:
print(np.unique(corpus))

[''
 "\t daved@world.std.com (Dave T Dorfman) writes...\n]I was enjoying lunch this saturday at foodies in Milford NH with an assortment\n]of other nedod folks when Dean Cookson ( yes he has not left the \n]country, yet) mentioned that the wiring diagram of the VFR750 \n]shows that  the light switch is a three position switch. \n\n]high beam\n]low beam\n]Both beams\n\n]Well the actual ergonomics of the switch make it appear to be a\n]2 position switch, but sure enough as Deam expected , when\n]you balance the toggle switch in the center position both the high\n]and low beams go on.\n\n]This provides a very nice light coverage of the road.\n\n]This is true for the St11 and the VFR750 and I would expect for any \n]other late model Honda with the standard two position light switch.\n\n]Thanks to Dean for reading the schematics, try it you'll like it.\n\n\tBe a bit careful doing this; I used to balance the switch on my GS550B\navec Cibie' H4 insert so that both beams were on.  I eventually

In [25]:
def dim_red(mat, p):
    '''
    Perform dimensionality reduction

    Input:
    -----
        mat : NxM list 
        p : number of dimensions to keep 
    Output:
    ------
        red_mat : NxP list such that p<<m
    '''
    
    
    pca = PCA(n_components=p)
    
    # Appliquer l'ACP et réduire la dimensionnalité
    red_mat = pca.fit_transform(mat)
    
    
    
   # red_mat = mat[:,:p]
    
    return red_mat

In [26]:
def clust(mat, k):
    '''
    Perform clustering

    Input:
    -----
        mat : input list 
        k : number of cluster
    Output:
    ------
        pred : list of predicted labels
    '''
    
    pred = np.random.randint(k, size=len(corpus))
    
    return pred

In [27]:
# import data
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]
k = len(set(labels))

# embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# perform dimentionality reduction
red_emb = dim_red(embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results
nmi_score = normalized_mutual_info_score(pred,labels)
ari_score = adjusted_rand_score(pred,labels)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')


.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

NameError: name 'PCA' is not defined