In [198]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from matplotlib import pyplot as plt
from autocorrect import Speller #autocorrector
from bertopic import BERTopic

In [199]:
import umap
import seaborn as sns
import numpy as np
import pandas as pd
import re
import itertools
import tensorflow as tf

In [183]:
%pip install bertopic

Note: you may need to restart the kernel to use updated packages.


## Lectura del dataset "app_reviews"

In [184]:
#review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')
review = pd.read_csv('C:/Users/USUARIO/Documents/Universidad/4A. Inteligencia Artificial/Dataset/app_reviews.csv')
review

Unnamed: 0,package_name,review,date,star
0,com.mantz_it.rfanalyzer,Great app! The new version now works on my Bra...,October 12 2016,4
1,com.mantz_it.rfanalyzer,Great It's not fully optimised and has some is...,August 23 2016,4
2,com.mantz_it.rfanalyzer,Works on a Nexus 6p I'm still messing around w...,August 04 2016,5
3,com.mantz_it.rfanalyzer,The bandwidth seemed to be limited to maximum ...,July 25 2016,3
4,com.mantz_it.rfanalyzer,Works well with my Hackrf Hopefully new update...,July 22 2016,5
...,...,...,...,...
288060,com.termux.api,it doesn't do anything after installing this i...,June 24 2016,3
288061,com.termux.api,I like this app . Its is very helpful for use....,June 20 2016,5
288062,com.termux.api,Finally Brings back the Unix command line to A...,May 20 2016,5
288063,com.termux.api,The API feature is great just need loads more...,May 05 2016,5


## Limpieza del dataset 

In [185]:
def clean_corpus(corpus_review):
        
    for i in range(len(corpus_review)):        
        corpus_review[i] = re.sub(r'https?:\/\/.\S+', "", corpus_review[i]) 
        corpus_review[i] = re.sub(r'"', '', corpus_review[i]) 
        corpus_review[i] = re.sub(r'#', '', corpus_review[i]) 
        corpus_review[i] = re.sub(r'^RT[\s]+', '', corpus_review[i]) 
              
        Apos_dict={"'s":" is","n't":" not","'m":" am","'    ll":" will", 
               "'d":" would","'ve":" have","'re":" are"} #reemplazar apostrofes    
          
        for key,value in Apos_dict.items(): 
            if key in corpus_review[i]: 
                corpus_review[i]=corpus_review[i].replace(key,value) #reemplazar

        corpus_review[i] = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)",corpus_review[i]) if s])
        corpus_review[i]=corpus_review[i].lower() #minusculas

        file=open("slang.txt","r") #jergas del ingles
        slang=file.read() 
          
        slang=slang.split('\n') 
          
        tweet_tokens= corpus_review[i].split() 
        slang_word=[] 
        meaning=[] 
          
        for line in slang: 
            temp=line.split("=") 
            slang_word.append(temp[0]) 
            meaning.append(temp[-1]) 
          
        for i,word in enumerate(tweet_tokens): 
            if word in slang_word: 
                idx=slang_word.index(word) 
                tweet_tokens[i]=meaning[idx] 
                  
        corpus_review[i]=" ".join(tweet_tokens) 
        corpus_review[i] = ''.join(''.join(s)[:2] for _, s in itertools.groupby(corpus_review[i]))   
 
        spell = Speller(lang='en') 
        corpus_review[i]=spell(corpus_review[i]) 
    return corpus_review
        

### Se extrae en un corpus todos los reviews o criticas de usuario

In [186]:
def extract_corpus(dataset):
    lista = []  
    for i in range(len(dataset['package_name'].unique())):#iterar entre los package_name unicos
        dataset_temp = dataset.loc[dataset['package_name'] == dataset['package_name'].unique()[i]]
        lista.append({'package_name':dataset['package_name'].unique()[i], 'size': len(dataset_temp)})#otener un package_name y el número de oraciones

    lista = sorted(lista, key=lambda x: x['size'], reverse=True)#se ordena para saber que package_name tiene el mayor n° de oraciones
    dataframe = dataset[dataset['package_name'] == lista[8]['package_name']]#el mayor será el elemnto que ocupa la posicion 0
    corpus = list(dataframe['review'])#extraemos un corpus
    
    #corpus = clean_corpus(corpus)

    return corpus

### Convertir datos en un Dataframe a un manejo más ágil

In [187]:
def convert_corpus_to_dataFrame(corpus):
    corpus_ds = {
        'Sentences' : corpus
    }

    dataset_new = pd.DataFrame(corpus_ds)
    return dataset_new

In [188]:
def convert_embbedings_to_dataFrame(embeddings):
    array = []
    for i in embeddings:
        array.append([i])

    dataset_new = pd.DataFrame(array, columns=['Embeddings'])
    return dataset_new

### Se crea una función que nos permita incrustar las oraciones, para esto usamos un modelo pre-entrenado de SBERT

In [189]:
def neural_embeddings(dataset):
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')#modelo pre-entrenado
    corpus = extract_corpus(dataset)#extraemos un corpus del dataset 

    embeddings = model_embedder.encode(corpus, 
                                        convert_to_tensor=False, 
                                        show_progress_bar=True) #generamos las incrustaciones 

    embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True) #normalizamos

    return embeddings, corpus

In [190]:
def neural_embeddings_queries(queries):
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')#modelo pre-entrenado

    embeddings_queries = model_embedder.encode(queries, 
                                        convert_to_tensor=False, 
                                        show_progress_bar=True) #generamos las incrustaciones 

    embeddings_queries = embeddings_queries /  np.linalg.norm(embeddings_queries, axis=0, keepdims=True) #normalizamos

    return embeddings_queries

### Para obtener el "mejor" cluster aplicamos el método de la silueta

In [191]:
#en esta funcion hace la tarea de obtener el mejor k con agglomerative clustering
def silhoutte(dataset, attempts):

    embeddings, corpus = neural_embeddings(dataset)
    scores_silhouette = [] #guardaremos todos los resultados del método de la silueta para devolver el mayor

    for k in range(2,attempts+1):

        agglomerative_clusterering = AgglomerativeClustering(n_clusters=k, 
                                                            affinity="cosine" , 
                                                            linkage="complete").fit(embeddings)
                                                            
        cluster_labels = agglomerative_clusterering.labels_

        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        scores_silhouette.append(silhouette_avg)

    max_score = max(scores_silhouette)
    max_index = scores_silhouette.index(max_score)
    n_clusters = max_index + 2

    return n_clusters, embeddings, corpus

### Al tener el "mejor" número de clusters, se procede a segmentar las oraciones

In [192]:
def topics_segmentation(dataset_review, attempts):
    n_clusters, embeddings, corpus = silhoutte(dataset_review, attempts) # se le pasa el mejor K

    agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters, 
                                                        affinity="cosine", 
                                                        linkage="complete").fit(embeddings)
                                                        
    cluster_labels = agglomerative_clusterering.labels_ #obtengo las etiquetas respectivas a las oraciones

    model_topics = BERTopic(nr_topics = n_clusters).fit(corpus) # entreno para sacar K temas 
    label_topics = model_topics.generate_topic_labels() # temas
    label_topics.pop(0) #elimino el grupo de temas atípicos

    print(label_topics)

    return cluster_labels, label_topics, embeddings, corpus

### A cada oración le asignamos el cluster al que pertenece

In [193]:
def clustering(dataset_review, attempts):
    cluster_labels, label_topics, embeddings, corpus = topics_segmentation(dataset_review, attempts)

    corpus_dataframe = convert_corpus_to_dataFrame(corpus) #de set de oraciones se convierte en un DF para asignarle su número de cluster
    corpus_dataframe['cluster'] = cluster_labels #se le asigna a cada oración un cluster
    
    '''topics = [] # creo una lista para llenar con los temas segun el cluster respectivo
    for i in corpus_dataframe['cluster']: 
        topics.append(label_topics[i])

    corpus_dataframe['Topics'] = topics #creo una columna TOPICS que indica el tema de cada cluster'''

    return embeddings, label_topics, corpus_dataframe

### Búsqueda semántica para encontrar el tema de cada cluster

In [204]:
def semantic_search(dataset_review, attemps):
    embeddings, label_topics, corpus_dataframe = clustering(dataset_review, attemps)
    dataframe_embeddings = convert_embbedings_to_dataFrame(embeddings)

    dataframe_embeddings['cluster'] = corpus_dataframe['cluster']
    sort_embeddings =  dataframe_embeddings.sort_values(by=['cluster'])
    sort_embeddings = sort_embeddings.reset_index(drop=True)
    nr_clusters = sort_embeddings['cluster'].unique()
    
    print(len(nr_clusters))
    print(len(sort_embeddings))
    print(sort_embeddings)
    print(nr_clusters)

    first_sentences = []
    j = 0
    for i in range(len(sort_embeddings)):               
        if(j < len(nr_clusters) and sort_embeddings['cluster'][i] == nr_clusters[j]):
            print("j: ",j)
            first_sentences.append(sort_embeddings['Embeddings'][i])
            j+=1

    print(first_sentences)
    print(len(first_sentences))

    queries = label_topics
    topics = []
    for query in queries:
        embeddings_queries = neural_embeddings_queries(query)
        cos_scores = util.cos_sim(embeddings_queries, first_sentences)[0]

        cos_scores_numpy = cos_scores.numpy()
        cos_scores_list = cos_scores_numpy.tolist()
        max_coincidence = max(cos_scores_list)
        index = cos_scores_list.index(max_coincidence)
        topics.append({query , index})

    print(topics)
        

In [205]:
semantic_search(review, 40)

Batches:   0%|          | 0/66 [00:00<?, ?it/s]

['0_app_great_good', '1_weather_the_not']
2
2099
                                             Embeddings  cluster
0     [-0.11320456, 0.027211675, 0.023340745, -0.053...        0
1     [-0.123598136, 0.062280484, 0.05117411, -0.081...        0
2     [-0.041777987, 0.10871419, 0.022554228, -0.080...        0
3     [-0.05273707, 0.0024701075, -0.0006009123, 0.0...        0
4     [-0.020272695, -0.036391795, 0.03406607, -0.03...        0
...                                                 ...      ...
2094  [-0.040672235, -0.009908965, 0.025821224, -0.0...        1
2095  [0.005858691, 0.024118843, 0.008889248, 0.0503...        1
2096  [-0.044284828, -0.024861367, -0.01832826, 0.04...        1
2097  [-0.053155173, 0.063662276, 0.031657554, -0.03...        1
2098  [-0.11557477, 0.09539389, -0.01849673, -0.0799...        1

[2099 rows x 2 columns]
[0 1]
j:  0
j:  1
[array([-1.13204561e-01,  2.72116754e-02,  2.33407449e-02, -5.30632660e-02,
        5.66631323e-03,  2.30870359e-02,  3.20470445

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'0_app_great_good', 1}, {'1_weather_the_not', 1}]


In [203]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=False)

queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']

topics = []
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=False)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores_numpy = cos_scores.numpy()
    cos_scores_list = cos_scores_numpy.tolist()
    max_coincidence = max(cos_scores_list)
    index = cos_scores_list.index(max_coincidence)
    topics.append({query , index})

topics


[{0, 'A man is eating pasta.'},
 {7, 'Someone in a gorilla costume is playing a set of drums.'},
 {8, 'A cheetah chases prey on across a field.'}]

### Para dar visualización de estas oraciones en un plano euclidiano, aplicamos una técnica de reducción de dimensiones 

In [None]:
def dimension_reduction(embeddings):
    scaler = umap.UMAP(n_components=2).fit_transform(embeddings)
    dimension_2d = pd.DataFrame(scaler, columns=['x', 'y'])
    return dimension_2d

### Mostraremos un gráfico de la segmentación de oraciones y un DataFrame de las oraciones con su respectivo cluster 

In [None]:
def show_graphics_and_themes(dataset_review, attemps):
    embeddings, corpus_dataframe = clustering(dataset_review, attemps)
    review_2d = dimension_reduction(embeddings) #se reduce a dos dimensiones las incrustaciones para poder plotear

    review_2d['labels'] = corpus_dataframe['cluster']
    review_2d['Topics'] = corpus_dataframe['Topics'] 
    
    plt.figure(figsize=(20, 20))
    clustered = review_2d[review_2d.labels != -1]
    #print("clustered", clustered)
    plt.scatter(review_2d.x, 
                review_2d.y, 
                c=clustered.labels, 
                s=20,
                cmap='Spectral')
    #plt.legend(review_2d['Topics'])
    plt.colorbar()
    plt.show()
    
    return corpus_dataframe

In [None]:
#topics = show_graphics_and_themes(review, 40)
#topics

In [None]:
#topics['Topics'].unique()

NameError: name 'topics' is not defined

In [None]:
'''def show_dimentions(distribution):
    for col in 'xy':
        sns.kdeplot(distribution[col], shade=True)

    with sns.axes_style(style='ticks'):
       g = sns.factorplot(data=distribution, kind="box")'''

'def show_dimentions(distribution):\n    for col in \'xy\':\n        sns.kdeplot(distribution[col], shade=True)\n\n    with sns.axes_style(style=\'ticks\'):\n       g = sns.factorplot(data=distribution, kind="box")'