In [531]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from bertopic import BERTopic
from datetime import datetime
from bs4 import BeautifulSoup
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [532]:
import numpy as np
import pandas as pd

import re, string, unicodedata
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import contractions
import inflect

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lectura del dataset "app_reviews"

In [533]:
# Load the review sentences from CSV into the `review` DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where this exact folder exists — consider a configurable DATA_DIR.
review = pd.read_csv('C:/Users/Usuario/Desktop/Text-segmentation-using-Agglomerative-Clustering/extract_dataset/dataset_classroom.csv')
#review = pd.read_csv('C:/Users/Usuario/Desktop/Text-segmentation-using-Agglomerative-Clustering/extract_dataset/dataset_classroom2.csv')
review

Unnamed: 0,sentences
0,why I can't log in my account!?
1,Can't turn in my activities. Please fix this i...
2,Never shows class work and I have to use the w...
3,"After the recent update, every time I logged i..."
4,Files do not attached
...,...
9995,I dont like this app. Too much activities
9996,Ga bisa buka video dari guru gajelas nih app
9997,Graveyard of students.... RIP students.
9998,worst app i have ever seen


### Hacemos un preprocesamiento de las oraciones

In [534]:
def strip_html(text):
    """Return only the textual content of ``text``, with HTML markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` backend.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    Each match starts at a ``[`` and runs up to the first following ``]``.
    The text around a removed span is left untouched, so surrounding
    whitespace is preserved as-is.
    """
    # Raw string: '\[' and '\]' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on modern interpreters.
    return re.sub(r'\[[^]]*\]', '', text)

def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

def denoise_text(text):
    """Clean a raw review string before tokenisation.

    Applies, in this order: HTML stripping, removal of ``[...]`` spans, and
    contraction replacement. Returns the cleaned string.
    """
    cleanup_steps = (
        strip_html,
        remove_between_square_brackets,
        replace_contractions,
    )
    for cleanup in cleanup_steps:
        text = cleanup(text)
    return text

In [535]:
def remove_non_ascii(words):
    """Return a new list with non-ASCII characters dropped from every token.

    Each token is NFKD-normalised first, so accented letters decompose into a
    base letter plus combining marks; encoding to ASCII with ``ignore`` then
    discards whatever is not ASCII. Tokens are never removed from the list,
    even when they end up empty.
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list containing the lower-cased form of every token."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens that become empty.

    A character is removed when it is neither a word character (``\\w``) nor
    whitespace (``\\s``).
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Replace all-digit tokens with their spelled-out form.

    Tokens for which ``str.isdigit()`` is true are converted with
    ``inflect``'s ``number_to_words``; every other token is kept unchanged.
    Returns a new list of the same length.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not English stop words.

    The comparison is an exact string match against NLTK's English stop-word
    list, so tokens should already be lower-cased. Order is preserved.
    """
    # Build the stop-word collection once per call: the original asked NLTK
    # for the whole list again for every single token and then scanned it
    # linearly. A set gives constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token, in the original order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatise every token with WordNet, treating each one as a verb (``pos='v'``)."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token-level normalisation pipeline and return the new token list.

    Steps, in order: drop non-ASCII characters, lower-case, strip punctuation,
    spell out digit-only tokens, remove English stop words.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words


In [536]:
print(datetime.today(), "Preprocesando los datos...")


def _preprocess_sentence(sentence):
    """Denoise, tokenise and normalise one review; return ``str`` of the token list."""
    tokens = normalize(nltk.word_tokenize(denoise_text(sentence)))
    # The column stores the textual form of the token list (e.g. "['log', 'account']"),
    # which is what the original loop produced via its final ``astype(str)``.
    return str(tokens)


# Transform the whole column in one pass. The original wrote through
# ``review['sentences'][i] = ...`` (chained indexing), which raises
# SettingWithCopyWarning, has no effect under pandas copy-on-write, and treats
# integer labels as positions, so it breaks on a non-default index.
# NOTE: like the original, this cell overwrites ``review['sentences']`` in place,
# so re-running it preprocesses the already-preprocessed text again.
review['sentences'] = review['sentences'].astype(str).map(_preprocess_sentence)

2022-12-27 03:06:24.040764 Preprocesando los datos...


In [537]:
# Display the DataFrame after preprocessing: `sentences` now holds the string
# form of each normalised token list.
review

Unnamed: 0,sentences
0,"['log', 'account']"
1,"['turn', 'activities', 'please', 'fix', 'immed..."
2,"['never', 'shows', 'class', 'work', 'use', 'we..."
3,"['recent', 'update', 'every', 'time', 'logged'..."
4,"['files', 'attached']"
...,...
9995,"['like', 'app', 'much', 'activities']"
9996,"['ga', 'bisa', 'buka', 'video', 'dari', 'guru'..."
9997,"['graveyard', 'students', 'rip', 'students']"
9998,"['worst', 'app', 'ever', 'seen']"


### Se extraen en un corpus todas las reseñas o críticas de usuario

In [538]:
def extract_corpus(dataset):
    """Return the ``'sentences'`` entry of ``dataset`` unchanged (no copy is made)."""
    return dataset['sentences']

### Convertir los datos a un DataFrame para un manejo más ágil

In [539]:
def convert_corpus_to_dataFrame(corpus):
    """Wrap ``corpus`` in a new DataFrame with a single ``'Sentences'`` column.

    A timestamped progress message is printed before the frame is built.
    """
    print(datetime.today(), "Convirtiendo las oraciones extraidas a un dataframe...")
    return pd.DataFrame({'Sentences': corpus})

In [540]:
def convert_embbedings_to_dataFrame(embeddings):
    """Build a one-column DataFrame whose cells each hold one embedding.

    Every item yielded by iterating ``embeddings`` is wrapped in its own
    one-element row, so the whole vector lands in a single ``'Embeddings'``
    cell instead of being spread over many columns. A timestamped progress
    message is printed first.
    """
    print(datetime.today(), "Convirtiendo las incrustaciones a un dataframe...")
    rows = [[vector] for vector in embeddings]
    return pd.DataFrame(rows, columns=['Embeddings'])

### Se crea una función que nos permita incrustar las oraciones, para esto usamos un modelo pre-entrenado de SBERT

In [541]:
def neural_embeddings(dataset):
    """Embed every sentence of ``dataset`` with a pre-trained SBERT model.

    Returns a ``(embeddings, corpus)`` pair: ``corpus`` is what
    ``extract_corpus(dataset)`` yields, and ``embeddings`` is the encoder
    output with each row divided by its own L2 norm.
    """
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # pre-trained model
    corpus = extract_corpus(dataset)  # sentences to embed
    print(datetime.today(), "Incrustando las oraciones...")

    raw_embeddings = model_embedder.encode(
        corpus,
        show_progress_bar=True,
        convert_to_tensor=False,
    )

    # Row-wise L2 normalisation (axis=1: one norm per sentence).
    row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
    return raw_embeddings / row_norms, corpus

In [542]:
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# (one per topic in the semantic search) do not reload the model each time.
_QUERY_EMBEDDER_CACHE = {}


def neural_embeddings_queries(queries):
    """Embed one query string or a list of query strings and L2-normalise them.

    Parameters
    ----------
    queries : str or list of str
        Text(s) to embed with the pre-trained 'all-MiniLM-L6-v2' model.

    Returns
    -------
    numpy.ndarray
        The encoder output with every embedding vector scaled to unit L2
        norm. The array has whatever shape ``encode`` returns for the input.
    """
    model_name = 'all-MiniLM-L6-v2'  # pre-trained model
    model_embedder = _QUERY_EMBEDDER_CACHE.get(model_name)
    if model_embedder is None:
        model_embedder = SentenceTransformer(model_name)
        _QUERY_EMBEDDER_CACHE[model_name] = model_embedder

    embeddings_queries = model_embedder.encode(queries,
                                               convert_to_tensor=False)

    # Normalise along the LAST axis (the embedding dimension). The original
    # used axis=0, which is only right for a single 1-D embedding: for a batch
    # it would normalise each dimension across queries, not each query vector,
    # contradicting the per-sentence normalisation in ``neural_embeddings``.
    norms = np.linalg.norm(embeddings_queries, axis=-1, keepdims=True)
    return embeddings_queries / norms

### Para obtener el "mejor" cluster aplicamos el método de la silueta

In [543]:
def _fit_cosine_complete_linkage(embeddings, n_clusters):
    """Fit complete-linkage agglomerative clustering with cosine distance."""
    try:
        # Newer scikit-learn names the distance parameter ``metric``.
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        metric="cosine",
                                        linkage="complete")
    except TypeError:
        # Older scikit-learn only knows the legacy ``affinity`` keyword.
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity="cosine",
                                        linkage="complete")
    return model.fit(embeddings)


# Picks the best k for agglomerative clustering using the silhouette method.
def silhoutte(dataset, attempts):
    """Choose the number of clusters with the highest mean silhouette score.

    Parameters
    ----------
    dataset : mapping / DataFrame with a ``'sentences'`` entry
        Passed through to ``neural_embeddings``.
    attempts : int
        Largest k to try; every k in ``2..attempts`` is evaluated.

    Returns
    -------
    tuple
        ``(n_clusters, embeddings, corpus)`` where ``n_clusters`` is the best
        k (the smallest one on ties) and the other two come from
        ``neural_embeddings``.

    Raises
    ------
    ValueError
        If no k can be evaluated (``attempts < 2`` or fewer than 3 sentences).
    """
    if attempts < 2:
        # Fail before the expensive embedding step; the original only failed
        # afterwards with "max() arg is an empty sequence".
        raise ValueError("attempts must be at least 2 to evaluate any clustering")

    embeddings, corpus = neural_embeddings(dataset)
    print(datetime.today(), "Calculando el mejor k...")

    # silhouette_score needs 2 <= number of labels <= n_samples - 1.
    largest_k = min(attempts, len(embeddings) - 1)
    candidate_ks = list(range(2, largest_k + 1))
    if not candidate_ks:
        raise ValueError("at least 3 sentences are required to compute silhouette scores")

    # NOTE(review): the clustering uses cosine distance, but silhouette_score
    # is called with its default metric; kept as in the original — confirm
    # whether the score should use the cosine metric as well.
    scores_silhouette = []
    for k in candidate_ks:
        cluster_labels = _fit_cosine_complete_linkage(embeddings, k).labels_
        scores_silhouette.append(silhouette_score(embeddings, cluster_labels))

    # np.argmax returns the first maximum, matching list.index(max(...)).
    best_position = int(np.argmax(scores_silhouette))
    n_clusters = candidate_ks[best_position]

    return n_clusters, embeddings, corpus

### Al tener el "mejor" número de clusters, se procede a segmentar las oraciones

In [544]:
def topics_segmentation(dataset_review, attempts):
    """Cluster the sentences with the best k and extract topic labels.

    Parameters
    ----------
    dataset_review : mapping / DataFrame with a ``'sentences'`` entry
    attempts : int
        Largest k evaluated by ``silhoutte``.

    Returns
    -------
    tuple
        ``(cluster_labels, label_topics, embeddings, corpus)``:
        the agglomerative cluster label of every sentence, the BERTopic topic
        labels (five words each, without the outlier topic), and the
        embeddings and corpus returned by ``silhoutte``.
    """
    n_clusters, embeddings, corpus = silhoutte(dataset_review, attempts)  # best k

    try:
        # Newer scikit-learn names the distance parameter ``metric``.
        agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters,
                                                             metric="cosine",
                                                             linkage="complete")
    except TypeError:
        # Older scikit-learn only knows the legacy ``affinity`` keyword.
        agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters,
                                                             affinity="cosine",
                                                             linkage="complete")

    # One label per sentence, in corpus order.
    cluster_labels = agglomerative_clusterering.fit(embeddings).labels_

    model_topics = BERTopic(nr_topics=n_clusters, language='english')  # train to get k topics
    topics, _ = model_topics.fit_transform(corpus)

    label_topics = model_topics.generate_topic_labels(nr_words=5, topic_prefix=False)
    # Drop the outlier topic's label, which comes first when topic -1 exists.
    # The original popped unconditionally, discarding a real topic whenever
    # BERTopic assigned no document to the outlier topic.
    if -1 in set(topics):
        label_topics.pop(0)

    return cluster_labels, label_topics, embeddings, corpus

### A cada oración le asignamos el cluster al que pertenece

In [545]:
def clustering(dataset_review, attempts):
    """Attach to every sentence the cluster it was assigned to.

    Returns ``(embeddings, label_topics, corpus_dataframe)`` where
    ``corpus_dataframe`` has the sentences in ``'Sentences'`` and their
    cluster label in ``'cluster'``.
    """
    segmentation = topics_segmentation(dataset_review, attempts)
    cluster_labels, label_topics, embeddings, corpus = segmentation

    print(datetime.today(), "Asignando un cluster a cada oración...")
    # Turn the sentences into a DataFrame and add one cluster label per row.
    corpus_dataframe = convert_corpus_to_dataFrame(corpus).assign(cluster=cluster_labels)

    return embeddings, label_topics, corpus_dataframe

### Búsqueda semántica para encontrar el tema de cada cluster

In [546]:
def semantic_search(dataset_review, attemps):
    """Pair each topic label with the cluster whose representative it matches best.

    For every cluster, the first sentence (in corpus order) is taken as its
    representative. Each topic label is embedded and compared by cosine
    similarity against all representatives; the topic is paired with the
    best-matching cluster unless that cluster (or that topic) is already
    paired. Pairing is greedy, in the order of the topic labels.

    Returns
    -------
    tuple
        ``(embeddings, tupla, corpus_dataframe)`` where ``tupla`` is a list of
        ``{'Topics': <label>, 'Cluster': <cluster id>}`` dicts.
    """
    # Reuse the embeddings computed upstream instead of encoding twice.
    embeddings, label_topics, corpus_dataframe = clustering(dataset_review, attemps)
    dataframe_embeddings = convert_embbedings_to_dataFrame(embeddings)
    # Assign positionally: a Series assignment aligns on index and would
    # produce NaN clusters if the corpus index is not the default 0..n-1.
    dataframe_embeddings['cluster'] = corpus_dataframe['cluster'].to_numpy()

    # One representative per cluster: its first sentence in corpus order.
    # (The original relied on the row order after a non-stable sort, so the
    # "first" row of a cluster was arbitrary.)
    representatives = (
        dataframe_embeddings
        .drop_duplicates(subset='cluster', keep='first')
        .sort_values(by='cluster')
    )
    cluster_ids = representatives['cluster'].tolist()
    first_sentences = np.asarray(representatives['Embeddings'].tolist())

    topics = []
    in_clusters = []  # same positions as ``topics``: in_clusters[i] pairs with topics[i]
    print(datetime.today(), "Incrustando los temas...")
    for topic in label_topics:  # the topic labels act as the queries
        embeddings_queries = neural_embeddings_queries(topic)
        # Similarity of this topic to every cluster representative.
        cos_scores = util.cos_sim(embeddings_queries, first_sentences)[0]

        best_position = int(np.argmax(cos_scores.numpy()))
        # Translate the position back to the real cluster label rather than
        # assuming labels are exactly 0..k-1.
        cluster = cluster_ids[best_position]

        # Keep the pairing one-to-one: skip repeated topics and taken clusters.
        if topic not in topics and cluster not in in_clusters:
            topics.append(topic)
            in_clusters.append(cluster)

    tupla = [
        {'Topics': topic, 'Cluster': cluster}
        for topic, cluster in zip(topics, in_clusters)
    ]

    return embeddings, tupla, corpus_dataframe
        

### Mostraremos un gráfico de la segmentación de oraciones y un DataFrame de las oraciones con su respectivo cluster 

In [547]:
def show_themes(dataset_review, attemps):
    """Label every sentence with the topic paired to its cluster.

    Returns ``(corpus_dataframe, reporte_tuplas)``: the sentence frame with an
    added ``'Topics'`` column (``"-1"`` when the sentence's cluster has no
    paired topic) and the topic/cluster pairing table sorted by cluster.
    """
    embeddings, reporte_tuplas, corpus_dataframe = semantic_search(dataset_review, attemps)

    print(datetime.today(), "obteniendo temas...")

    reporte_tuplas = (
        pd.DataFrame(reporte_tuplas, columns=['Topics', 'Cluster'])
        .sort_values(by=['Cluster'])
        .reset_index(drop=True)
    )

    # cluster id -> topic label; setdefault keeps the first entry per cluster.
    topic_by_cluster = {}
    for cluster_id, topic in zip(list(reporte_tuplas['Cluster']), list(reporte_tuplas['Topics'])):
        topic_by_cluster.setdefault(cluster_id, topic)

    corpus_dataframe['Topics'] = [
        topic_by_cluster[cluster_id] if cluster_id in topic_by_cluster else "-1"
        for cluster_id in list(corpus_dataframe['cluster'])
    ]

    return corpus_dataframe, reporte_tuplas

In [548]:
# Run the whole pipeline on the preprocessed reviews, evaluating k = 2..100.
# The logged timestamps below show the best-k search taking roughly 50 minutes
# on this dataset, so expect this cell to be slow.
dataframe, reporte_tuplas = show_themes(review, 100)

2022-12-27 03:06:42.116074 Incrustando las oraciones...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2022-12-27 03:07:56.136132 Calculando el mejor k...
2022-12-27 03:59:03.054457 Asignando un cluster a cada oración...
2022-12-27 03:59:03.055353 Convirtiendo las oraciones extraidas a un dataframe...
2022-12-27 03:59:03.057106 Convirtiendo las incrustaciones a un dataframe...
2022-12-27 03:59:03.193682 Incrustando los temas...
2022-12-27 03:59:52.009865 obteniendo temas...


In [549]:
# Display each sentence with its cluster and assigned topic ("-1" = no topic paired).
dataframe

Unnamed: 0,Sentences,cluster,Topics
0,"['log', 'account']",37,-1
1,"['turn', 'activities', 'please', 'fix', 'immed...",90,super_complicated_difficult_hard_use
2,"['never', 'shows', 'class', 'work', 'use', 'we...",88,-1
3,"['recent', 'update', 'every', 'time', 'logged'...",3,account_switch_login_booo_log
4,"['files', 'attached']",18,-1
...,...,...,...
9995,"['like', 'app', 'much', 'activities']",65,bad_app_veryyyyyyyyyyyy_comment_lol
9996,"['ga', 'bisa', 'buka', 'video', 'dari', 'guru'...",59,hate_app_window_suck_school
9997,"['graveyard', 'students', 'rip', 'students']",16,hate_school_hates_love_despise
9998,"['worst', 'app', 'ever', 'seen']",59,hate_app_window_suck_school


In [550]:
# Display the topic/cluster pairing table, sorted by cluster id.
reporte_tuplas

Unnamed: 0,Topics,Cluster
0,google_classroom_lecture_access_went,2
1,account_switch_login_booo_log,3
2,gg_baf_bleh_sucs_glichy,4
3,balls_dog_dogshit_water_hamburger,5
4,school_related_work_schoolwork_made,6
5,ayoko_aral_mag_na_ko,7
6,classroom_google_problem_assignments_app,8
7,ew_eww_wew_screwww_luwhhhh,9
8,dark_mode_racist_theme_darkmode,10
9,submit_know_assignments_assignment_work,13
