In [1]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
#from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
#from sklearn.preprocessing import normalize, LabelEncoder, StandardScaler, MinMaxScaler
#from sklearn.manifold import TSNE, LocallyLinearEmbedding, SpectralEmbedding
#from sklearn.decomposition import KernelPCA, SparsePCA, TruncatedSVD, PCA
#from matplotlib import pyplot as plt
#import matplotlib.cm as cm
#import seaborn as sns
import numpy as np
import pandas as pd
#import umap

In [2]:
# Load the app-review dataset from the CSV hosted on GitHub. The functions
# below only read its 'package_name' and 'review' columns.
review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
review

Unnamed: 0,package_name,review,date,star
0,com.mantz_it.rfanalyzer,Great app! The new version now works on my Bra...,October 12 2016,4
1,com.mantz_it.rfanalyzer,Great It's not fully optimised and has some is...,August 23 2016,4
2,com.mantz_it.rfanalyzer,Works on a Nexus 6p I'm still messing around w...,August 04 2016,5
3,com.mantz_it.rfanalyzer,The bandwidth seemed to be limited to maximum ...,July 25 2016,3
4,com.mantz_it.rfanalyzer,Works well with my Hackrf Hopefully new update...,July 22 2016,5
...,...,...,...,...
288060,com.termux.api,it doesn't do anything after installing this i...,June 24 2016,3
288061,com.termux.api,I like this app . Its is very helpful for use....,June 20 2016,5
288062,com.termux.api,Finally Brings back the Unix command line to A...,May 20 2016,5
288063,com.termux.api,The API feature is great just need loads more...,May 05 2016,5


In [3]:
def extract_corpus(dataset, rank=8):
    """Return the review texts of one package, chosen by review-count rank.

    Packages are ordered from most to fewest reviews (ties keep the order in
    which the packages first appear in ``dataset``) and the package at
    position ``rank`` of that ordering is selected.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``'package_name'`` and ``'review'``.
    rank : int, default 8
        Zero-based position in the descending review-count ranking. The
        default of 8 (the ninth most-reviewed package) reproduces the index
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    list
        The ``'review'`` values of the selected package, in dataset order.

    Raises
    ------
    IndexError
        If ``dataset`` has no package at position ``rank``.
    """
    # One grouped pass instead of re-running unique() and a boolean filter for
    # every package. sort=False keeps first-appearance order, so the stable
    # sort below breaks ties exactly like the previous list-based version.
    # Rows whose package_name is missing are not counted as a package.
    sizes = dataset.groupby('package_name', sort=False).size()
    ranking = sorted(sizes.items(), key=lambda item: item[1], reverse=True)

    if not -len(ranking) <= rank < len(ranking):
        raise IndexError(
            "rank %d is out of range: dataset only has %d distinct package(s)"
            % (rank, len(ranking))
        )

    selected_package = ranking[rank][0]
    is_selected = dataset['package_name'] == selected_package
    return dataset.loc[is_selected, 'review'].tolist()

In [4]:
def convert_corpus_to_dataFrame(corpus):
    """Wrap a corpus in a single-column DataFrame.

    Parameters
    ----------
    corpus : sequence
        The sentences to store.

    Returns
    -------
    pandas.DataFrame
        One row per corpus entry, held in a column named ``'Sentences'``.
    """
    return pd.DataFrame({'Sentences': corpus})

In [5]:
def neural_embeddings(dataset):
    """Embed the selected package's reviews as unit-length sentence vectors.

    The corpus comes from ``extract_corpus(dataset)`` and is encoded with the
    ``all-MiniLM-L6-v2`` SentenceTransformer model. Every row is then divided
    by its L2 norm.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed unchanged to ``extract_corpus``.

    Returns
    -------
    numpy.ndarray
        2-D array with one L2-normalised embedding per corpus entry.
    """
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    corpus = extract_corpus(dataset)

    # Request a NumPy array rather than a torch tensor: the normalisation
    # below is a NumPy operation, and a tensor living on a CUDA device cannot
    # be handed to np.linalg.norm. scikit-learn also consumes ndarrays directly.
    corpus_embeddings = model_embedder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)

    # keepdims=True keeps the norms as a column so the division broadcasts row-wise.
    row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
    return corpus_embeddings / row_norms

In [6]:
def silhoutte(dataset, attempts):
    """Pick the cluster count with the highest average silhouette score.

    Embeddings are computed with ``neural_embeddings(dataset)``. For every
    candidate ``k`` a complete-linkage, cosine-distance agglomerative
    clustering is fitted and scored with ``silhouette_score``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed unchanged to ``neural_embeddings``.
    attempts : int
        Exclusive upper bound on the cluster counts tried; candidates are
        ``2 .. attempts - 1``, additionally capped at ``n_samples - 1``.

    Returns
    -------
    tuple
        ``(n_clusters, embeddings)``: the best-scoring cluster count (the
        smallest one on ties) and the embeddings that were clustered.

    Raises
    ------
    ValueError
        If there is no valid cluster count to try.
    """
    embeddings = neural_embeddings(dataset)

    # silhouette_score requires 2 <= n_clusters <= n_samples - 1, so never try
    # more clusters than that even if `attempts` is larger.
    candidate_ks = range(2, min(attempts, len(embeddings)))
    if len(candidate_ks) == 0:
        raise ValueError(
            "no cluster count to evaluate: need attempts > 2 and at least 3 "
            "samples (got attempts=%r, n_samples=%d)" % (attempts, len(embeddings))
        )

    best_k = None
    best_score = None
    for k in candidate_ks:
        # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the
        # old keyword; fall back to `affinity` only on releases that predate
        # the rename (they reject `metric` with a TypeError).
        try:
            clusterer = AgglomerativeClustering(n_clusters=k, metric="cosine", linkage="complete")
        except TypeError:
            clusterer = AgglomerativeClustering(n_clusters=k, affinity="cosine", linkage="complete")
        cluster_labels = clusterer.fit(embeddings).labels_

        # NOTE(review): the score uses silhouette_score's default metric while
        # the clustering uses cosine distance - confirm this is intended.
        silhouette_avg = silhouette_score(embeddings, cluster_labels)

        # Strict '>' keeps the first (smallest) k when several scores tie.
        if best_score is None or silhouette_avg > best_score:
            best_k, best_score = k, silhouette_avg

    return best_k, embeddings

In [7]:
def segmentation(dataset_review, attempts):
    """Cluster the review embeddings using the silhouette-selected cluster count.

    Parameters
    ----------
    dataset_review : pandas.DataFrame
        Passed unchanged to ``silhoutte``.
    attempts : int
        Passed unchanged to ``silhoutte``, which uses it to bound the cluster
        counts it evaluates.

    Returns
    -------
    tuple
        ``(n_clusters, cluster_labels)``: the chosen cluster count and the
        ``labels_`` array of the fitted clustering, one label per embedding.
    """
    n_clusters, embeddings = silhoutte(dataset_review, attempts)

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old
    # keyword; fall back to `affinity` only on releases that predate the
    # rename (they reject `metric` with a TypeError).
    try:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, metric="cosine", linkage="complete")
    except TypeError:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity="cosine", linkage="complete")

    cluster_labels = clusterer.fit(embeddings).labels_

    return n_clusters, cluster_labels

In [8]:

def clustering(dataset_review, attempts):
    """Cluster the selected package's reviews and print every cluster.

    Labels come from ``segmentation(dataset_review, attempts)`` and the
    sentences from ``extract_corpus(dataset_review)``; label ``i`` is paired
    with sentence ``i``. Each cluster is printed as a 1-based heading followed
    by the list of its sentences. Nothing is returned.
    """
    _, labels = segmentation(dataset_review, attempts)
    sentences = extract_corpus(dataset_review)

    # Bucket sentences by label; dict insertion order means clusters are
    # printed in the order their label first occurs.
    groups = {}
    for position, label in enumerate(labels):
        groups.setdefault(label, []).append(sentences[position])

    for label in groups:
        print("Cluster ", label+1)
        print(groups[label])
        print("     ")

In [9]:
# Run the whole pipeline on the loaded reviews. The second argument is the
# exclusive upper bound on the cluster counts evaluated, so k = 2..9 are tried.
clustering(review, 10)

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Cluster  1
['Authentication Morris', 'This app works fine Later found the secret key on a site  though it took a while.', "Doesn't work This app has different codes than the iphone version...", 'Does not work I wish I could give 0 stars   it just doesnt work. So frustrating.', 'Laziness causes security issues It shows all authentication codes simultaneously.', 'Fix this!!! Please fix the authenticator app problem I cant log in!', 'Keyword? When I add my account..it ask submit the provided key or scan the bar code...what is the Provided key & where can I get provided key...pls suggest.', "Barcode scanner not available right now when I updated I'm try a thousand time to reinstall but still can't use barcode scanner? Barcode scanner from zxing team already installed", "Needed 2FA  now user friendly Google authentication was always better for 2FA than receiving an SMS  but now it's actually easy to use with all IDs clearly marked  and legible.", "Help! When It say's Scan Barcode... There's