In [67]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
import pandas as pd
import umap
from bertopic import BERTopic

In [68]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [69]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]

In [70]:
review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')

In [71]:
def extract_corpus(dataset):
    lista = []  
    for i in range(len(dataset['package_name'].unique())):
        dataset_temp = dataset.loc[dataset['package_name'] == dataset['package_name'].unique()[i]]
        lista.append({'package_name':dataset['package_name'].unique()[i], 'size': len(dataset_temp)})

    lista = sorted(lista, key=lambda x: x['size'], reverse=True)
    dataframe = dataset[dataset['package_name'] == lista[8]['package_name']]
    corpus = list(dataframe['review'])
    return corpus

In [72]:
dataframe = extract_corpus(review)
dataframe

['Authentication Morris',
 "I can't access my account We couldn't verify your Two-Factor authentication code. Please make sure you typed in the right code correctly. See this FAQ answer for more help or request a Two-Factor authentication removal here",
 'This app works fine Later found the secret key on a site  though it took a while.',
 'very good love it',
 'Powerful app Awesome app to safe my transaction',
 'M ishaq Good',
 'Great app',
 "Poor UI  no backup/restore I have 20 accounts setup  but there's no way for me to sort  group  or search them  so I have to manually scan a long list every time. There's no backup/restore  so moving to a new phone is a long  manual process.",
 "Help! I had it on my old phone and now it won't let me switch my athenticator accounts to my new phone...so I have to use my old phone to get my codes",
 'Loving Loving',
 'It takes to long I really hate it it takes to long so bored.',
 "Stop working after the last update I had sent an email to Google  but 

In [73]:
corpus_embeddings = embedder.encode(dataframe)
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [74]:
model = BERTopic(verbose=True)
topics, probabilities = model.fit_transform(dataframe)

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

2022-08-14 16:39:59,847 - BERTopic - Transformed documents to Embeddings
2022-08-14 16:40:31,130 - BERTopic - Reduced dimensionality
2022-08-14 16:40:31,554 - BERTopic - Clustered reduced embeddings


In [75]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,913
1,0,126
2,1,102
3,2,93
4,3,84
...,...,...
64,63,12
65,64,12
66,65,11
67,66,11


In [76]:
model.get_topic(39)

[('awesome', 1.5281259553587474),
 ('wera', 0.41389548842319035),
 ('true', 0.20694774421159517),
 ('title', 0.16247697881720338),
 ('read', 0.12567787204709066),
 ('love', 0.06468222971682806),
 ('its', 0.04553610096996222),
 ('it', 0.016077444619156854),
 ('the', 0.013243950639549907),
 ('', 1e-05)]

In [77]:
model.visualize_topics()

In [78]:
model.visualize_barchart()

In [79]:
model.visualize_heatmap()

In [80]:
def silhoutte(attempts):
    scores_silhouette = []

    for k in range(2,attempts):

        agglomerative_clusterering = AgglomerativeClustering(n_clusters=k, affinity="cosine" , linkage="complete").fit(corpus_embeddings)
        cluster_labels = agglomerative_clusterering.labels_

        silhouette_avg = silhouette_score(corpus_embeddings, cluster_labels)
        scores_silhouette.append(silhouette_avg)

    max_score = max(scores_silhouette)
    max_index = scores_silhouette.index(max_score)
    n_clusters = max_index + 2

    return n_clusters

In [81]:
n_clusters = silhoutte(40)
n_clusters

39

In [82]:
# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine' , linkage='complete') #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [83]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

IndexError: list index out of range