In [145]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
import pandas as pd
import umap
from bertopic import BERTopic

In [146]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [147]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]

In [148]:
review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')

In [149]:
def extract_corpus(dataset):
    lista = []  
    for i in range(len(dataset['package_name'].unique())):
        dataset_temp = dataset.loc[dataset['package_name'] == dataset['package_name'].unique()[i]]
        lista.append({'package_name':dataset['package_name'].unique()[i], 'size': len(dataset_temp)})

    lista = sorted(lista, key=lambda x: x['size'], reverse=True)
    dataframe = dataset[dataset['package_name'] == lista[8]['package_name']]
    corpus = list(dataframe['review'])
    return corpus

In [150]:
dataframe = extract_corpus(review)
dataframe

['Authentication Morris',
 "I can't access my account We couldn't verify your Two-Factor authentication code. Please make sure you typed in the right code correctly. See this FAQ answer for more help or request a Two-Factor authentication removal here",
 'This app works fine Later found the secret key on a site  though it took a while.',
 'very good love it',
 'Powerful app Awesome app to safe my transaction',
 'M ishaq Good',
 'Great app',
 "Poor UI  no backup/restore I have 20 accounts setup  but there's no way for me to sort  group  or search them  so I have to manually scan a long list every time. There's no backup/restore  so moving to a new phone is a long  manual process.",
 "Help! I had it on my old phone and now it won't let me switch my athenticator accounts to my new phone...so I have to use my old phone to get my codes",
 'Loving Loving',
 'It takes to long I really hate it it takes to long so bored.',
 "Stop working after the last update I had sent an email to Google  but 

In [151]:
corpus_embeddings = embedder.encode(dataframe)
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [154]:

scores_silhouette = []
for k in range(2,40+1):
    agglomerative_clusterering = AgglomerativeClustering(n_clusters=k, affinity="cosine" , linkage="complete").fit(corpus_embeddings)
    cluster_labels = agglomerative_clusterering.labels_
    silhouette_avg = silhouette_score(corpus_embeddings, cluster_labels)
    scores_silhouette.append(silhouette_avg)

max_score = max(scores_silhouette)
max_index = scores_silhouette.index(max_score)

n_clusters = max_index + 2


In [155]:
clustering_model = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine' , linkage='complete').fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
cluster_assignment

array([10,  2, 13, ..., 22,  7,  2], dtype=int64)

In [156]:
model = BERTopic(nr_topics=n_clusters).fit(dataframe)
model.generate_topic_labels()

['-1_to_my_the',
 '0_authenticator_google_authentication',
 '1_security_app_secure',
 '2_awesome_great_cool',
 '3_security_secure_feel',
 '4_dark_theme_back',
 '5_good_its_yes',
 '6_code_work_codes',
 '7_boy_dil_soo',
 '8_nice_aureliojuarezcortez_app',
 '9_ok_like_gd',
 '10_good_very_thanks',
 '11_easy_use_simple',
 '12_to_phone_new',
 '13_ui_space_much',
 '14_backup_no_que',
 '15_security_your_account',
 '16_accounts_reorder_the',
 '17_works_perfectly_just',
 '18_useful_helpful_usefull',
 '19_love_it_like',
 '20_download_work_sucks',
 '21_barcode_scanner_scan',
 '22_facebook_cant_help',
 '23_excellent_g3_lg',
 '24_app_love_this',
 '25_qr_code_scan',
 '26_good_app_application',
 '27_key_barcode_where',
 '28_phone_your_my',
 '29_2fa_mfa_the',
 '30_camera_permission_why',
 '31_working_work_not',
 '32_best_aps_aap',
 '33_scanner_barcode_install',
 '34_widget_steam_would',
 '35_verification_step_code',
 '36_loved_it_liked',
 '37_google_thanku_played',
 '38_good_lovr_one',
 '39_material_des

In [157]:
columns = {
    'Sentences': dataframe
}

df = pd.DataFrame(dataframe, columns=columns)
df['Clustering'] = cluster_assignment

print(df)

label = model.generate_topic_labels()
label.pop(0)

print(label)

list_ = []
for i in df['Clustering']:
    list_.append(label[i])

df['Topics'] = list_
df

                                              Sentences  Clustering
0                                 Authentication Morris          10
1     I can't access my account We couldn't verify y...           2
2     This app works fine Later found the secret key...          13
3                                     very good love it          24
4       Powerful app Awesome app to safe my transaction          30
...                                                 ...         ...
2971                              I'm feeling very well          17
2972              Doesn't work  can't add any accounts.           2
2973  Only one concern but not sure if it was the ap...          22
2974                                           Verygood           7
2975  This is very bad and not change password and v...           2

[2976 rows x 2 columns]
['0_authenticator_google_authentication', '1_security_app_secure', '2_awesome_great_cool', '3_security_secure_feel', '4_dark_theme_back', '5_good_its_yes', '6_

Unnamed: 0,Sentences,Clustering,Topics
0,Authentication Morris,10,10_good_very_thanks
1,I can't access my account We couldn't verify y...,2,2_awesome_great_cool
2,This app works fine Later found the secret key...,13,13_ui_space_much
3,very good love it,24,24_app_love_this
4,Powerful app Awesome app to safe my transaction,30,30_camera_permission_why
...,...,...,...
2971,I'm feeling very well,17,17_works_perfectly_just
2972,Doesn't work can't add any accounts.,2,2_awesome_great_cool
2973,Only one concern but not sure if it was the ap...,22,22_facebook_cant_help
2974,Verygood,7,7_boy_dil_soo
