In [18]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from matplotlib import pyplot as plt
from bertopic import BERTopic
from datetime import datetime
from autocorrect import Speller #autocorrector

In [19]:
import umap
import seaborn as sns
import numpy as np
import pandas as pd
import re
import itertools

### Lectura del dataset "app_reviews"

In [20]:
review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')
review = review.drop(['date','star'],axis=1)
#review = pd.read_csv('C:/Users/USUARIO/Documents/Universidad/4A. Inteligencia Artificial/Dataset/app_reviews.csv')
review

Unnamed: 0,package_name,review
0,com.mantz_it.rfanalyzer,Great app! The new version now works on my Bra...
1,com.mantz_it.rfanalyzer,Great It's not fully optimised and has some is...
2,com.mantz_it.rfanalyzer,Works on a Nexus 6p I'm still messing around w...
3,com.mantz_it.rfanalyzer,The bandwidth seemed to be limited to maximum ...
4,com.mantz_it.rfanalyzer,Works well with my Hackrf Hopefully new update...
...,...,...
288060,com.termux.api,it doesn't do anything after installing this i...
288061,com.termux.api,I like this app . Its is very helpful for use....
288062,com.termux.api,Finally Brings back the Unix command line to A...
288063,com.termux.api,The API feature is great just need loads more...


### Limpieza

In [32]:
def clean_corpus(corpus_review):
        
    for i in range(len(corpus_review)):        
        corpus_review[i] = re.sub(r'https?:\/\/.\S+', "", corpus_review[i]) 
        corpus_review[i] = re.sub(r'"', '', corpus_review[i]) 
        corpus_review[i] = re.sub(r'#', '', corpus_review[i]) 
        corpus_review[i] = re.sub(r'^RT[\s]+', '', corpus_review[i])

        Apos_dict={"'s":" is","n't":" not","'m":" am","'    ll":" will", 
               "'d":" would","'ve":" have","'re":" are"} #reemplazar apostrofes    
          
        for key,value in Apos_dict.items(): 
            if key in corpus_review[i]: 
                corpus_review[i]=corpus_review[i].replace(key,value) #reemplazar

        corpus_review[i] = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)",corpus_review[i]) if s])
        corpus_review[i]=corpus_review[i].lower() #minuscula

        file=open("slang.txt","r") #jergas del ingles
        slang=file.read() 
          
        slang=slang.split('\n') 
          
        tweet_tokens= corpus_review[i].split() 
        slang_word=[] 
        meaning=[] 
          
        for line in slang: 
            temp=line.split("=") 
            slang_word.append(temp[0]) 
            meaning.append(temp[-1]) 
          
        for i,word in enumerate(tweet_tokens): 
            if word in slang_word: 
                idx=slang_word.index(word) 
                tweet_tokens[i]=meaning[idx]
        
        corpus_review[i]=" ".join(tweet_tokens) 
        corpus_review[i] = ''.join(''.join(s)[:2] for _, s in itertools.groupby(corpus_review[i]))   
 
        spell = Speller(lang='en') 
        corpus_review[i]=spell(corpus_review[i]) 
    return corpus_review

### Se extrae en un corpus todos los reviews o criticas de usuario

In [34]:
def extract_corpus(dataset):
    print(datetime.today(), "extrayendo oraciones...")
    lista = []  
    for i in range(len(dataset['package_name'].unique())):#iterar entre los package_name unicos
        dataset_temp = dataset.loc[dataset['package_name'] == dataset['package_name'].unique()[i]]
        lista.append({'package_name':dataset['package_name'].unique()[i], 'size': len(dataset_temp)})#otener un package_name y el número de oraciones

    lista = sorted(lista, key=lambda x: x['size'], reverse=True)#se ordena para saber que package_name tiene el mayor n° de oraciones
    dataframe = dataset[dataset['package_name'] == lista[8]['package_name']]#el mayor será el elemnto que ocupa la posicion 0
    corpus = list(dataframe['review'])#extraemos un corpus
    
    print(datetime.today(), "limpiando el corpus...")
    corpus = clean_corpus(corpus)
    return corpus

### Convertir datos en un Dataframe a un manejo más ágil

In [23]:
def convert_corpus_to_dataFrame(corpus):
    print(datetime.today(), "Convirtiendo las oraciones extraidas a un dataframe...")
    corpus_ds = {
        'Sentences' : corpus
    }

    dataset_new = pd.DataFrame(corpus_ds)
    return dataset_new

### Se crea una función que nos permita incrustar las oraciones, para esto usamos un modelo pre-entrenado de SBERT

In [24]:
def neural_embeddings(dataset):
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')#modelo pre-entrenado
    corpus = extract_corpus(dataset)#extraemos un corpus del dataset 
    print(datetime.today(), "Incrustando las oraciones...")
    embeddings = model_embedder.encode(corpus, 
                                        convert_to_tensor=False, 
                                        show_progress_bar=True) #generamos las incrustaciones 

    embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True) #normalizamos

    return embeddings, corpus

### Para obtener el "mejor" cluster aplicamos el método de la silueta

In [25]:
#en esta funcion hace la tarea de obtener el mejor k con agglomerative clustering
def silhoutte(dataset, attempts):
    embeddings, corpus = neural_embeddings(dataset)
    print(datetime.today(), "Calculando el mejor k...")
    scores_silhouette = [] #guardaremos todos los resultados del método de la silueta para devolver el mayor

    for k in range(2,attempts+1):
        agglomerative_clusterering = AgglomerativeClustering(n_clusters=k, 
                                                            affinity="cosine" , 
                                                            linkage="complete").fit(embeddings)
                                                            
        cluster_labels = agglomerative_clusterering.labels_

        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        scores_silhouette.append(silhouette_avg)

    max_score = max(scores_silhouette)
    max_index = scores_silhouette.index(max_score)
    n_clusters = max_index + 2

    return n_clusters, embeddings, corpus

### Al tener el "mejor" número de clusters, se procede a segmentar las oraciones

In [26]:
def topics_segmentation(dataset_review, attempts):
    n_clusters, embeddings, corpus = silhoutte(dataset_review, attempts) # se le pasa el mejor K

    agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters, 
                                                        affinity="cosine", 
                                                        linkage="complete").fit(embeddings)
                                                        
    cluster_labels = agglomerative_clusterering.labels_ #obtengo las etiquetas respectivas a las oraciones

    
    dataframe = convert_corpus_to_dataFrame(corpus)
    dataframe['cluster'] = cluster_labels

    return dataframe

In [35]:
data = topics_segmentation(review, 40)

2022-08-22 10:41:01.721964 extrayendo oraciones...
2022-08-22 10:41:38.156747 limpiando el corpus...
2022-08-22 10:55:48.991027 Incrustando las oraciones...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

2022-08-22 10:57:59.346241 Calculando el mejor k...
2022-08-22 11:00:49.710302 Convirtiendo las oraciones extraidas a un dataframe...


In [36]:
data

Unnamed: 0,Sentences,cluster
0,veryold,1
1,عالی good,1
2,the best ever,1
3,awesome. love it 😜😜😁😁😜😁😁,1
4,i am feeling very well,1
...,...,...
2971,i am feeling very well,1
2972,does not work ca not add any accounts.,1
2973,only one concern but not sure if it was the ap...,0
2974,verygood,1


In [37]:
data['cluster'].unique()

array([1, 0], dtype=int64)