In [2]:
import pandas as pd
import numpy as np
import time

import os

# Do not truncate long string cells when displaying DataFrames (None = no width limit).
pd.set_option('display.max_colwidth', None)

from collections import Counter

from bertopic import BERTopic

## Import data

In [3]:
# Folder that holds the input CSV files, relative to the notebook's working directory.
path = "./data/"

In [4]:
# Load the tagged review sample into `df`.
# os.path.join is used instead of string concatenation: `path` already ends
# with "/", so `path + "/..."` produced a doubled separator ("./data//4...").
df = pd.read_csv(os.path.join(path, "4.tagged_reviews_sample_verified.csv"))

In [5]:
# List the distinct values of the "app" column (the saved output shows package names).
df["app"].unique()

array(['co.mona.android', 'com.bybit.app', 'com.binance.dev'],
      dtype=object)

In [6]:
# One boolean row mask per app, keyed on the package name in the "app" column.
# "co.mona.android" is the app reported as Crypto.com in the row-count printout.
binance_index = df["app"].eq("com.binance.dev")
crypto_index = df["app"].eq("co.mona.android")
bybit_index = df["app"].eq("com.bybit.app")

In [7]:
# Split the reviews into one independent frame per app. Each slice is copied so
# that later modifications do not act on a view of `df`.
df_binance, df_crypto, df_bybit = (
    df.loc[mask].copy()
    for mask in (binance_index, crypto_index, bybit_index)
)

In [8]:
# Report how many reviews each app contributes.
for caption, frame in (
    ("Binance # rows:", df_binance),
    ("Crypto.com # rows:", df_crypto),
    ("Bybit # rows:", df_bybit),
):
    print(caption, frame.shape[0])

Binance # rows: 88
Crypto.com # rows: 146
Bybit # rows: 65


## BERTopic

In [9]:
# Topic model with all-default settings.
# NOTE(review): no random seed is configured; BERTopic's default dimensionality-reduction
# step is stochastic, so the discovered topics may differ between runs — consider passing
# a seeded `umap_model` if reproducibility matters.
topic_model = BERTopic()

In [10]:
# Fit the topic model on the Crypto.com review texts and get one topic id
# (and probability) per review.
# The texts are passed as a plain list: BERTopic documents its input as
# List[str], and `df_crypto.content` is a pandas Series that still carries the
# filtered (non-contiguous) row index of the parent frame.
topics, probs = topic_model.fit_transform(df_crypto.content.tolist())

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


I ran an LLM against a dataset and manually checked the labels. The LLM achieved an accuracy of 89% (that is, I agreed with the LLM's topic assignment 89% of the time).

Below are the topic distribution and the topic titles that the LLM chose when prompted.

In [13]:
# Frequency table of the LLM-assigned topic labels for the Crypto.com reviews,
# with the label moved from the index into a regular column.
df_crypto["gemini_llm_topic"].value_counts().reset_index()

Unnamed: 0,gemini_llm_topic,count
0,Usability/UI/UX,31
1,Reliability/Stability,26
2,Transaction Fees/Speed,23
3,Customer Support,20
4,Account Management,19
5,Verification/KYC/AML Processes,11
6,Generic feedback,6
7,Features/Functionality,3
8,Educational Resources/Onboarding,3
9,Security,2


Below are the topics inferred by the BERTopic model.

In [11]:
# Summary of the fitted topic model: one row per topic with its size, generated name,
# top keywords and representative reviews. Topic -1 is BERTopic's outlier bucket.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,63,-1_the_to_it_and,"[the, to, it, and, my, you, app, for, in, of]","[Extremely frustrating! Won't even properly catch my photo.I.D the only time it will take the photo is at a ridiculous angle that does not capture the majority of the ID and then persist to mock you exclaiming that it cannot accept a cut off. I. D because of the location it demands you snap from. Please put in an actual usable button. So we can physically take the photo ourselves. The AI does not work! I attempted with multiple devices thinking that maybe it was because of my device but no., It has its pro's and con's. Used to be my favorite exchange, because if all the varieties of coins, the other top exchanges don't have. You also get a bank account and debit card with it so you can use your crypto for purchases and get your check deposited which I love. But the fees to sell or change coins are pretty steep, on coin base you can swap for free. Also I loved this app because you could make and sell nft's for free but the nft creator is still broken and it's been months., I am trying to open my crypto, BUT the app keeps locking me out... this app is useless. I guess I will open another app other than crypto. And there is no way to get help... I put in the code that they send me, and it won't take it.... AND now you tell me to live chat you???? HOW? I can't get on the app dummies]"
1,0,36,0_app_the_to_it,"[app, the, to, it, and, is, update, this, you, on]","[The app is still junk. The same problems for over two years. Trying to top up my debit card always requires me to shut down the app and restart. Don't ask me my phone make/model or tell me it's a problem with my phone ... It's been like that what different phones. The other nuisance is the forced updates... FFS give us a heads up beforehand. Don't just lock us out of the bloody app. It's so frustrating. And why does it need more than 2Gb of storage space to update with a 45Mb update? Grr, Well, the app is super slow, but at least it works again since the problematic update a while back. The constant ""must update to continue using the app"" is EXTREMELY annoying. I don't care if i miss 1 or 2 features, i just want to check on my crypto for once without being nagged to update. Now if an update made the app faster or better again, maybe I'd be interested. But nothing changes. This has been posted a long time, so from now on, every time I get nagged to update,this review loses a star., You know every time you do an update I keep thinking you're going to fix the app and you never do, if I leave it on in the background it errors, nothing responds, menus will not work at all, the UI doesn't function, it lags and freezes up constantly it's been going on for over a year. One thing you did right was the updates are updates now and no longer download the entire app instead of updating. Every time you update the app, the exact same thing, are you developers lazy, or just useless?]"
2,1,23,1_fees_and_the_to,"[fees, and, the, to, they, when, is, you, exchange, it]","[They literally scamming you. They show live prices of the tokens and when you sell they make it 10% less, but if you buy it they'll make 5% higher . It's a fraud , don't recommend anyone. And it's not coz if the market price change or any other factor, I had multiple such cases and every time on any exchange / market price was hig6and stable, but they make the swap/sell transaction with 10% lower rate., I'm a newcomer to crypto, and I'm extremely disappointed with the hidden fees. I made profits on all my crypto trades, but the fees were so astronomical I lost money. Not to mention high fees when I sent money to meta wallet for other purposes. I don't recommend this platform for traders., Horrible. When u buy the price is 20 cents higher, when u sell it's 15 cents lower. No fees? All hidden fees! They're scammers! They said the price is lower when u sell due to lack of liquidity! Always an issue, now I can't withdraw to my bank and they're telling me to update the app, sign out and sign in instead of actually doing their job and seeing what the issue it to fix it. The customer service guy I'm talking to for one hour about withdrawal issue is confused and lazy.]"
3,2,13,2_to_my_and_the,"[to, my, and, the, app, card, this, service, is, you]","[Was a good app until a little over a month ago when I tried to setup my fiat wallet. 30 plus days of an internal review and still no resolution. In case you're not familiar, you need fiat to withdraw your money. Asked to transfer my crypto to Coinbase or another platform but I was told I have to wait this long drawn out review of 2 documents with my SSN and License. Maybe I'll be able to with draw in the next 4 years. Just a reply to response to my review. That's who I've been in contact with., Horrible app. Slow and doesn't work properly. My credit card was declined and when I called my bank why my credit card wasn't working apparently crypto. Com has been trying to charge my credit card. Also, half of the time the app doesnt work at it tell me to contact customer service through the app. But I can't contact customer service since the app doesn't work. There are so many other crypto apps that works better and have better fees. Don't bother with this one. Scams and no help., Terrible customer service. Youre lucky to get a hold of support and then once you leave the convo, you can't go back. They denied service to my debit card out of nowhere. I've been using this app for over a year and have had a few problems along the way, and just recently they blocked my access to using their service via debit card, and that's the only way I have been able to use the app. So due to that I removed all of my holdings and replaced it with a place that actually accepts my money.]"
4,3,11,3_easy_very_use_to,"[easy, very, use, to, super, 2017, helpful, decent, amazing, fast]","[very easy to use, so simple to enter into the shares market, easy to use, Super easy to use, very helpful customer service.]"


In [10]:
import keras_hub

In [11]:
# NOTE(review): placeholder example inputs. Both `features` and `labels` are reassigned
# from crypto_model in a later cell before classifier.fit() uses them, so this cell
# looks like dead code and can probably be removed.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

In [13]:
# Build the custom topic classifier from the review texts and their LLM-assigned topics.
# NOTE(review): TopicNeuralNet is neither imported nor defined in the visible cells —
# confirm where it comes from; on a fresh kernel this line would raise NameError.
crypto_model = TopicNeuralNet(df_crypto.content, df_crypto.gemini_llm_topic)

In [15]:
# Train the custom model for 10 epochs with text preprocessing enabled and bootstrapping off.
# NOTE(review): the saved log prints "Bootstrapping by a factor of 1", so factor=100 appears
# to be ignored when bootstrapping=False — confirm against TopicNeuralNet.fit.
crypto_model.fit(bootstrapping=False, preprocess=True, epochs=10, factor=100)

Bootstrapping by a factor of 1
Normalizing text
Fitting text tokenizer
Fitting label tokenizer
Tokenizing text
Tokenizing labels
Passing text sequences
Setting model architecture
Compiled model
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 2.4821 - val_accuracy: 0.1600 - val_loss: 2.4716
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.2381 - loss: 2.4404 - val_accuracy: 0.1680 - val_loss: 2.4537
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.2857 - loss: 2.3856 - val_accuracy: 0.1680 - val_loss: 2.4331
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step - accuracy: 0.1905 - loss: 2.3215 - val_accuracy: 0.1680 - val_loss: 2.4236
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.2381 - loss: 2.2210 - val_accuracy: 0.1680 - val_loss: 2.4263
Epoch

In [16]:
# Reuse the data prepared by crypto_model as classifier input and target.
# presumably df.X is the (preprocessed) review text and label_sequences the
# integer-encoded topics — verify against TopicNeuralNet.
features, labels = crypto_model.df.X, crypto_model.label_sequences

In [18]:
# Number of distinct encoded labels; used to size the classifier's output layer.
num_classes = np.unique(crypto_model.label_sequences).size

In [19]:
# Pretrained classifier.
# Loads the "albert_base_en_uncased" preset and configures it to predict
# `num_classes` classes (the number of distinct encoded topic labels).
classifier = keras_hub.models.AlbertClassifier.from_preset(
    "albert_base_en_uncased",
    num_classes=num_classes,
)

In [21]:
# Fine-tune the pretrained classifier and report how long training took.
# time.perf_counter() is a monotonic clock meant for measuring durations, so the
# result cannot be skewed by system clock adjustments the way time.time() can.
# No `epochs` argument is passed; the saved output shows a single pass of 73 steps.
start = time.perf_counter()
classifier.fit(x=features, y=labels, batch_size=2)
end = time.perf_counter()
print("Trained on ", crypto_model.df.shape[0], "observations")
print("Elapsed time: ", np.round((end - start) / 60, 2), "minutes.")

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2s/step - loss: 2.0138 - sparse_categorical_accuracy: 0.2641
Trained on  146 observations
Elapsed time:  2.85 minutes.


In [31]:
# Score every Crypto.com review with the fine-tuned classifier.
# NOTE(review): the classifier was fit on crypto_model.df.X but is applied here to the raw
# df_crypto.content — confirm both carry the same text preprocessing.
# NOTE(review): these appear to be the same reviews used for training, so any accuracy
# computed from these predictions is in-sample.
y_hat_pretrained = classifier.predict(df_crypto.content)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 12s/step


In [34]:
# Pick the highest-scoring class id for each review in one vectorised call
# (replaces the manual append loop), then map the ids back to topic names with
# the label encoder held by crypto_model.
# assumes y_hat_pretrained is a 2-D (n_reviews, num_classes) array — TODO confirm
pred_list = np.argmax(y_hat_pretrained, axis=1).tolist()
y_hats_decoded = crypto_model.le.inverse_transform(pred_list)

In [40]:
# Distribution of the predicted topics. In the saved output 124 of the 146 predictions are
# 'Usability/UI/UX', i.e. the classifier mostly predicts the most frequent label.
Counter(y_hats_decoded)

Counter({'Usability/UI/UX': 124,
         'Transaction Fees/Speed': 15,
         'Reliability/Stability': 6,
         'Verification/KYC/AML Processes': 1})

In [42]:
# Reference distribution: how often each LLM-assigned topic occurs in the Crypto.com reviews.
df_crypto["gemini_llm_topic"].value_counts()

gemini_llm_topic
Usability/UI/UX                     31
Reliability/Stability               26
Transaction Fees/Speed              23
Customer Support                    20
Account Management                  19
Verification/KYC/AML Processes      11
Generic feedback                     6
Features/Functionality               3
Educational Resources/Onboarding     3
Security                             2
Wallet Security/Integration          1
Privacy                              1
Name: count, dtype: int64

In [45]:
# Share of reviews whose predicted topic equals the LLM-assigned topic (accuracy).
# NOTE(review): appears to be measured on the rows the classifier was trained on — confirm.
df_crypto["gemini_llm_topic"].eq(y_hats_decoded).mean()

0.3424657534246575