In [15]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from matplotlib import pyplot as plt
from bertopic import BERTopic
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
import umap
import seaborn as sns
import numpy as np
import pandas as pd

## Lectura del dataset de reseñas ("dataset_classroom.csv")

In [63]:
# Load the review dataset into a DataFrame; the last expression displays it.
# NOTE(review): the path is an absolute, machine-specific location, so this cell
# only runs on the original author's computer — consider a configurable data dir.
review = pd.read_csv('C:/Users/marit/Documents/UNSA/Text-segmentation-using-Agglomerative-Clustering/extract_dataset/dataset_classroom.csv')
#review = pd.read_csv('C:/Users/USUARIO/Documents/Universidad/4A. Inteligencia Artificial/Dataset/app_reviews.csv')
review

Unnamed: 0,sentences
0,why I can't log in my account!?
1,Can't turn in my activities. Please fix this i...
2,Never shows class work and I have to use the w...
3,"After the recent update, every time I logged i..."
4,Files do not attached
...,...
9995,I dont like this app. Too much activities
9996,Ga bisa buka video dari guru gajelas nih app
9997,Graveyard of students.... RIP students.
9998,worst app i have ever seen


### Hacemos un preprocesamiento de las oraciones

In [56]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [61]:

def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup using the "html.parser" backend and
    the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    A span starts at ``[`` and ends at the next ``]``; an opening bracket with
    no closing bracket is left untouched.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence, which
    # recent Python versions warn about. The compiled pattern is unchanged.
    return re.sub(r'\[[^]]*\]', '', text)
def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    In the recorded notebook output this turns e.g. "can't" into "cannot".
    """
    expanded = contractions.fix(text)
    return expanded
def denoise_text(text):
    """Clean one sentence by applying the three text filters in order.

    The steps are: strip HTML markup, drop ``[...]`` spans, then expand
    contractions. Each step receives the output of the previous one.
    """
    for step in (strip_html, remove_between_square_brackets, replace_contractions):
        text = step(text)
    return text



In [59]:
# Force every entry of the 'sentences' column to str so the text helpers can process it.
# NOTE(review): astype(str) also turns missing values into text — confirm the CSV has none.
review['sentences']=review['sentences'].astype(str)

In [71]:
# Clean every review in place and echo the cleaned sentence.
# Iterating over review.index (instead of a hard-coded 10000) keeps the loop
# inside the rows that actually exist, and .loc writes straight into the
# DataFrame rather than through chained indexing, which pandas may apply to a
# temporary copy.
for i in review.index:
    cleaned = denoise_text(review.loc[i, 'sentences'])
    review.loc[i, 'sentences'] = cleaned
    print(cleaned)

why I cannot log in my account!?
Cannot turn in my activities. Please fix this immediately.
Never shows class work and I have to use the website
After the recent update, every time I logged in to my account it keeps telling me that I do not have access to classroom. This has not happened to me before. Also, my account is not the problem here since I can log in to the site just fine.
Files do not attached
I hate online class No chat box in this app
It killed everyone I love and caused the greatest massacre in the quandale dingle universe and threatned me to cut off my arm with the dulingo bird. This all happened in Ohio.
I cannot log in my personal account to Google Class room.. error
School
I cannot stand school it sucks
I want to install
Two things, school and I am salty about myself because I have a LOT of missing work
it does not pay my psychiatrist appointments👎 #unate and #unleftnocrumbs #loonaistwelve #burnbbc
🤬🤬
I cannot switch account
i haet it soo bad
Get to 1 *
Hello Google, 

In [72]:
# Display the DataFrame after cleaning to check the result.
review

Unnamed: 0,sentences
0,why I cannot log in my account!?
1,Cannot turn in my activities. Please fix this ...
2,Never shows class work and I have to use the w...
3,"After the recent update, every time I logged i..."
4,Files do not attached
...,...
9995,I do not like this app. Too much activities
9996,Ga bisa buka video dari guru gajelas nih app
9997,Graveyard of students.... RIP students.
9998,worst app i have ever seen


### Se extraen a un corpus todas las reseñas o críticas de los usuarios

In [73]:
def extract_corpus(dataset):
    """Return the 'sentences' entry of ``dataset`` unchanged."""
    return dataset['sentences']

### Convertir los datos a un DataFrame para un manejo más ágil

In [74]:
def convert_corpus_to_dataFrame(corpus):
    """Wrap ``corpus`` in a DataFrame with a single 'Sentences' column.

    A timestamped progress message is printed before the frame is built.
    """
    print(datetime.today(), "Convirtiendo las oraciones extraidas a un dataframe...")
    return pd.DataFrame({'Sentences': corpus})

In [76]:
def convert_embbedings_to_dataFrame(embeddings):
    """Build a DataFrame with one row per item of ``embeddings``.

    Each item is stored whole in a single cell of the 'Embeddings' column.
    A timestamped progress message is printed before the frame is built.
    """
    print(datetime.today(), "Convirtiendo las incrustaciones a un dataframe...")
    # Wrapping each item in its own one-element row keeps it in a single cell.
    rows = [[vector] for vector in embeddings]
    return pd.DataFrame(rows, columns=['Embeddings'])

### Se crea una función que nos permita incrustar las oraciones, para esto usamos un modelo pre-entrenado de SBERT

In [75]:
def neural_embeddings(dataset):
    """Embed the sentences of ``dataset`` with the 'all-MiniLM-L6-v2' SBERT model.

    Returns:
        A tuple ``(embeddings, corpus)``: the encoded vectors, each row divided
        by its own L2 norm, and the corpus returned by ``extract_corpus``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')  # pre-trained model
    corpus = extract_corpus(dataset)
    print(datetime.today(), "Incrustando las oraciones...")

    raw_vectors = encoder.encode(corpus,
                                 convert_to_tensor=False,
                                 show_progress_bar=True)

    # Normalise row-wise so every sentence vector has unit length.
    row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
    unit_vectors = raw_vectors / row_norms

    return unit_vectors, corpus

In [68]:
def neural_embeddings_queries(queries):
    """Embed one query string or a list of queries and L2-normalise the result.

    Args:
        queries: input passed straight to ``SentenceTransformer.encode``; the
            notebook calls this with a single topic-label string.

    Returns:
        The encoded vector(s), each divided by its own L2 norm.
    """
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # pre-trained model

    embeddings_queries = model_embedder.encode(queries,
                                               convert_to_tensor=False)  # generate the embeddings

    # Normalise along the last axis. For a single 1-D vector this matches the
    # previous axis=0 behaviour; for a 2-D batch it normalises each query
    # vector, whereas axis=0 would have rescaled each dimension across queries.
    embeddings_queries = embeddings_queries / np.linalg.norm(embeddings_queries, axis=-1, keepdims=True)

    return embeddings_queries

### Para obtener el "mejor" cluster aplicamos el método de la silueta

In [77]:
# Picks the best number of clusters k for agglomerative clustering.
def silhoutte(dataset, attempts):
    """Choose the number of clusters with the highest average silhouette score.

    The sentences of ``dataset`` are embedded once, then one agglomerative
    clustering (cosine affinity, complete linkage) is fitted for every k from
    2 to ``attempts`` inclusive and scored with ``silhouette_score``.

    Returns:
        A tuple ``(n_clusters, embeddings, corpus)`` where ``n_clusters`` is the
        first k that reaches the maximum score.
    """
    embeddings, corpus = neural_embeddings(dataset)
    print(datetime.today(), "Calculando el mejor k...")

    def _score_for(k):
        # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
        # releases — confirm against the installed version.
        fitted = AgglomerativeClustering(n_clusters=k,
                                         affinity="cosine",
                                         linkage="complete").fit(embeddings)
        # NOTE(review): silhouette_score runs with its default metric while the
        # clustering uses cosine — confirm this is intended.
        return silhouette_score(embeddings, fitted.labels_)

    candidate_ks = list(range(2, attempts + 1))
    scores = [_score_for(k) for k in candidate_ks]

    # The first occurrence of the maximum wins.
    best_score = max(scores)
    n_clusters = candidate_ks[scores.index(best_score)]

    return n_clusters, embeddings, corpus

### Al tener el "mejor" número de clusters, se procede a segmentar las oraciones

In [24]:
def topics_segmentation(dataset_review, attempts):
    """Cluster the review sentences and extract topic labels for them.

    The number of clusters comes from ``silhoutte``. The embeddings are
    clustered with agglomerative clustering (cosine affinity, complete
    linkage), and a BERTopic model asked for the same number of topics is
    fitted on the corpus to produce 5-word topic labels.

    Args:
        dataset_review: dataset forwarded to ``silhoutte``.
        attempts: largest number of clusters tried when selecting k.

    Returns:
        A tuple ``(cluster_labels, label_topics, embeddings, corpus)``.
    """
    n_clusters, embeddings, corpus = silhoutte(dataset_review, attempts)  # best k

    # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
    # releases — confirm against the installed version.
    agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters,
                                                        affinity="cosine",
                                                        linkage="complete").fit(embeddings)

    cluster_labels = agglomerative_clusterering.labels_  # one cluster label per sentence

    model_topics = BERTopic(nr_topics=n_clusters, language='english')  # ask for k topics
    topics, _ = model_topics.fit_transform(corpus)

    label_topics = model_topics.generate_topic_labels(nr_words=5, topic_prefix=False)

    # The first label belongs to BERTopic's outlier topic (-1) only when some
    # document was assigned to it. Dropping it unconditionally would discard a
    # real topic whenever there are no outliers.
    if -1 in topics:
        label_topics.pop(0)

    return cluster_labels, label_topics, embeddings, corpus

### A cada oración le asignamos el cluster al que pertenece

In [25]:
def clustering(dataset_review, attempts):
    """Attach to every sentence the cluster it was assigned to.

    Returns:
        A tuple ``(embeddings, label_topics, corpus_dataframe)``, where
        ``corpus_dataframe`` has the sentences plus a 'cluster' column.
    """
    labels, topic_labels, vectors, sentences = topics_segmentation(dataset_review, attempts)
    print(datetime.today(), "Asignando un cluster a cada oración...")

    # The sentences become a DataFrame so the labels can sit beside them.
    sentences_df = convert_corpus_to_dataFrame(sentences)
    sentences_df['cluster'] = labels

    return vectors, topic_labels, sentences_df

### Búsqueda semántica para encontrar el tema de cada cluster

In [26]:
def semantic_search(dataset_review, attemps):
    """Pair topic labels with clusters by cosine similarity.

    One representative embedding is taken per cluster: the first row of that
    cluster after sorting by cluster. Each topic label is embedded and matched
    to the representative with the highest cosine similarity. A pair is kept
    only if neither its topic nor its cluster has already been used.

    Returns:
        A tuple ``(embeddings, tupla, corpus_dataframe)`` where ``tupla`` is a
        list of ``{'Topics': ..., 'Cluster': ...}`` dicts.
    """
    # The embeddings are reused so the sentences are not encoded twice.
    embeddings, label_topics, corpus_dataframe = clustering(dataset_review, attemps)

    embeddings_df = convert_embbedings_to_dataFrame(embeddings)
    embeddings_df['cluster'] = corpus_dataframe['cluster']
    by_cluster = embeddings_df.sort_values(by=['cluster']).reset_index(drop=True)

    # Keep the first embedding seen for each cluster, in sorted cluster order.
    seen_clusters = set()
    first_sentences = []
    for cluster_id, vector in zip(by_cluster['cluster'], by_cluster['Embeddings']):
        if cluster_id not in seen_clusters:
            seen_clusters.add(cluster_id)
            first_sentences.append(vector)

    topics = []
    in_clusters = []  # parallel to `topics`: same index means same pair
    print(datetime.today(), "Incrustando los temas...")
    for topic in label_topics:
        topic_vector = neural_embeddings_queries(topic)
        similarities = util.cos_sim(topic_vector, first_sentences)[0].numpy().tolist()

        # NOTE(review): this is a position in `first_sentences`; it equals the
        # cluster label only if labels are the contiguous integers 0..k-1 — confirm.
        cluster = similarities.index(max(similarities))

        # Skip the pair if either side has already been matched.
        if topic in topics or cluster in in_clusters:
            continue
        topics.append(topic)
        in_clusters.append(cluster)

    tupla = [{'Topics': matched_topic, 'Cluster': matched_cluster}
             for matched_topic, matched_cluster in zip(topics, in_clusters)]

    return embeddings, tupla, corpus_dataframe
        

### Mostraremos un gráfico de la segmentación de oraciones y un DataFrame de las oraciones con su respectivo cluster 

In [30]:
def show_themes(dataset_review, attemps):
    """Label every sentence with the topic matched to its cluster.

    Sentences whose cluster has no matched topic get the string "-1".

    Returns:
        A tuple ``(corpus_dataframe, reporte_tuplas)``: the sentences with
        'cluster' and 'Topics' columns, and the topic/cluster pairs sorted by
        cluster.
    """
    _embeddings, reporte_tuplas, corpus_dataframe = semantic_search(dataset_review, attemps)

    print(datetime.today(), "obteniendo temas...")

    reporte_tuplas = (
        pd.DataFrame(reporte_tuplas, columns=['Topics','Cluster'])
        .sort_values(by=['Cluster'])
        .reset_index(drop=True)
    )

    # Cluster -> topic lookup; setdefault keeps the first entry if a cluster repeats.
    topic_by_cluster = {}
    for cluster_id, topic in zip(reporte_tuplas['Cluster'], reporte_tuplas['Topics']):
        topic_by_cluster.setdefault(cluster_id, topic)

    corpus_dataframe['Topics'] = [
        topic_by_cluster.get(cluster_id, "-1")
        for cluster_id in corpus_dataframe['cluster']
    ]

    return corpus_dataframe, reporte_tuplas

In [78]:
# Run the whole pipeline on the cleaned reviews; 100 is the largest number of
# clusters tried when selecting k (candidates are 2..100).
# The recorded log below shows the k search alone took about 46 minutes.
dataframe, reporte_tuplas = show_themes(review, 100)

2022-12-26 14:18:52.199194 Incrustando las oraciones...


Batches: 100%|██████████| 313/313 [01:05<00:00,  4.79it/s]


2022-12-26 14:19:57.722071 Calculando el mejor k...
2022-12-26 15:05:58.829807 Asignando un cluster a cada oración...
2022-12-26 15:05:58.832806 Convirtiendo las oraciones extraidas a un dataframe...
2022-12-26 15:05:58.833806 Convirtiendo las incrustaciones a un dataframe...
2022-12-26 15:05:58.886970 Incrustando los temas...
2022-12-26 15:06:25.941105 obteniendo temas...


In [79]:
# Display each sentence with its cluster and matched topic ("-1" when no topic matched).
dataframe

Unnamed: 0,Sentences,cluster,Topics
0,why I cannot log in my account!?,20,account_add_switch_another_cannot
1,Cannot turn in my activities. Please fix this ...,93,-1
2,Never shows class work and I have to use the w...,52,classes_class_refresh_join_found
3,"After the recent update, every time I logged i...",20,account_add_switch_another_cannot
4,Files do not attached,51,pdf_file_preview_files_open
...,...,...,...
9995,I do not like this app. Too much activities,53,-1
9996,Ga bisa buka video dari guru gajelas nih app,34,na_pangit_ang_ng_apps
9997,Graveyard of students.... RIP students.,19,sucks_school_bro_grass_aweful
9998,worst app i have ever seen,53,-1


In [80]:
# Display the topic/cluster pairs, sorted by cluster.
reporte_tuplas

Unnamed: 0,Topics,Cluster
0,hate_school_sm_just_bro,1
1,poor_slow_very_performance_slower,2
2,crashes_crashing_pp_crash_keeps,3
3,open_files_opening_file_cannot,4
4,worst_world_thing_best_worsted,5
5,tugas_aplikasi_banyak_tolong_nya,6
6,useless_use_hard_app_very,7
7,sucks_sounds_gay_fixed_need,8
8,bugs_bug_many_buggy_issues,11
9,daming_pinapagawa_gawain_dami_pagawa,16
