In [13]:
import pandas as pd
import numpy as np

In [14]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from matplotlib import pyplot as plt

In [15]:
def neural_embeddings(corpus):
    """Encode sentences with a pre-trained model and L2-normalise each vector.

    Args:
        corpus: A single string or an iterable of strings (list, pandas
            Series, ...). It is materialised into a plain list before
            encoding.

    Returns:
        numpy.ndarray: One row per sentence, each scaled to unit L2 norm.
        A row whose norm is exactly zero is left as zeros instead of
        becoming NaN.
    """
    # A bare string would be encoded as one sentence and come back 1-D,
    # which breaks the row-wise normalisation below; wrap it in a list.
    if isinstance(corpus, str):
        sentences = [corpus]
    else:
        # A plain list keeps positional indexing inside encode() independent
        # of any pandas index the caller's Series may carry.
        sentences = list(corpus)

    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # pre-trained model
    embeddings = model_embedder.encode(sentences,
                                       convert_to_tensor=False,
                                       show_progress_bar=True)  # generate the embeddings

    # Normalise every row to unit length; avoid dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    embeddings = embeddings / norms

    return embeddings

In [16]:
# Finds the best k for agglomerative clustering using the silhouette method.
def silhoutte(corpus, attempts):
    """Pick the number of clusters with the highest average silhouette score.

    The corpus is embedded once with ``neural_embeddings``. Then, for every
    ``k`` from 2 up to ``attempts`` (inclusive), complete-linkage
    agglomerative clustering with cosine distance is fitted on the
    embeddings and scored with ``silhouette_score``.

    Args:
        corpus: Sentences to embed and cluster; forwarded unchanged to
            ``neural_embeddings``.
        attempts: Largest number of clusters to try. Values above
            ``n_samples - 1`` are capped, because the silhouette score is
            not defined beyond that.

    Returns:
        tuple: ``(n_clusters, embeddings)`` where ``n_clusters`` is the
        best-scoring ``k`` (the smallest one on ties) and ``embeddings`` is
        the matrix returned by ``neural_embeddings``.

    Raises:
        ValueError: If there is no ``k >= 2`` that can be evaluated.
    """
    import inspect  # local import: only needed for the compatibility shim below

    embeddings = neural_embeddings(corpus)

    # silhouette_score needs 2 <= n_labels <= n_samples - 1.
    max_k = min(attempts, len(embeddings) - 1)
    if max_k < 2:
        raise ValueError(
            "silhoutte needs attempts >= 2 and at least 3 sentences; "
            f"got attempts={attempts!r} and {len(embeddings)} sentence(s)"
        )

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old
    # name; use whichever keyword the installed version accepts.
    init_params = inspect.signature(AgglomerativeClustering).parameters
    distance_kwarg = "metric" if "metric" in init_params else "affinity"

    # NOTE(review): silhouette_score is called with its default metric while
    # the clustering uses cosine distance; kept exactly as before — confirm
    # whether metric="cosine" is wanted here.
    scores_by_k = {}  # k -> average silhouette score
    for k in range(2, max_k + 1):
        clustering = AgglomerativeClustering(n_clusters=k,
                                             linkage="complete",
                                             **{distance_kwarg: "cosine"}).fit(embeddings)
        scores_by_k[k] = silhouette_score(embeddings, clustering.labels_)

    # max() returns the first maximal key, i.e. the smallest k on ties.
    n_clusters = max(scores_by_k, key=scores_by_k.get)

    return n_clusters, embeddings

In [17]:
def topics_segmentation(corpus, attempts):
    """Cluster the sentences of ``corpus`` and return one label per sentence.

    The number of clusters is chosen by ``silhoutte``; the embeddings it
    returns are then clustered once more with that ``k`` using
    complete-linkage agglomerative clustering and cosine distance.

    Args:
        corpus: Sentences to cluster; forwarded to ``silhoutte``.
        attempts: Largest number of clusters ``silhoutte`` should try.

    Returns:
        The ``labels_`` array of the fitted clustering, one cluster id per
        sentence.
    """
    import inspect  # local import: only needed for the compatibility shim below

    n_clusters, embeddings = silhoutte(corpus, attempts)  # best k and the embeddings

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old
    # name; use whichever keyword the installed version accepts.
    init_params = inspect.signature(AgglomerativeClustering).parameters
    distance_kwarg = "metric" if "metric" in init_params else "affinity"

    clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                         linkage="complete",
                                         **{distance_kwarg: "cosine"}).fit(embeddings)

    cluster_labels = clustering.labels_  # cluster id for each sentence

    return cluster_labels

In [18]:
# Input CSV and the largest number of clusters to try, named so they are
# easy to find and change.
# NOTE(review): the path is absolute and machine-specific — consider a path
# relative to the notebook or a configurable data directory.
DATASET_PATH = 'C:/Users/Usuario/Desktop/Topic Modelling/Dataset/dataset_classroom.csv'
MAX_CLUSTERS = 40

dataset = pd.read_csv(DATASET_PATH)
labels = topics_segmentation(dataset['sentences'], MAX_CLUSTERS)
dataset['cluster'] = labels  # attach each sentence's cluster id

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [19]:
# Show the DataFrame, which now includes the `cluster` column.
dataset

Unnamed: 0,sentences,cluster
0,L app,11
1,my grandma had a seizure and died because of y...,18
2,The pdf scanner keeps croping pages so I need ...,39
3,L app,11
4,Faking auafel,16
...,...,...
19995,They make me do unwanted class work like the o...,13
19996,Yuh,27
19997,Cg,27
19998,Lame!!,27


In [23]:
# Order the rows by cluster id so sentences in the same cluster sit together.
dataset2 = dataset.sort_values("cluster")
dataset2

Unnamed: 0,sentences,cluster
10185,"saya benci apk ini asli, pls balik kuliah offl...",0
4632,di nauubusan ng gagawin baka naman pwede niyon...,0
4630,Paki limit naman po kung ilang activity lang a...,0
4629,awat namn po sa activities huhu,0
4628,"NAKAKASAMA NG LOOB, DAPAT IBAN NIYO YUNG TEACH...",0
...,...,...
6620,It's a bug which irritates me file attached po...,39
5687,"Poor service, specially in scanner, it's of no...",39
5289,Worst scanner ever. Poor user interface. I wou...,39
3647,The Popup you get when work is uploaded covers...,39
