In [88]:
from utils.utils import get_google_play_reviews
import numpy as np
import pandas as pd


from dotenv import load_dotenv
import os

import google.generativeai as genai


import yaml
from utils import utils

import time

In [89]:
load_dotenv()

True

**IMPORTANT**: you need to create a Google Gemini API key and save it in ".env" file located in the root directory.

In [90]:
GEMINI_API_KEY=os.getenv("GEMINI_NEW")

## Ingest reviews from Google Playstore

In [91]:
apps = ['com.binance.dev', 'co.mona.android', 'com.bybit.app']
crypto_com = apps[1]
bybit = apps[2]

In [92]:
n_samples=60
res, token = get_google_play_reviews(crypto_com, count=n_samples)
re2, token = get_google_play_reviews(bybit, count=n_samples)

In [93]:
df = pd.DataFrame(res)

## Generate List of Topics

In [94]:
with open("prompts/prompts.yaml") as file:
    try:
        prompts = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)

In [95]:
prompt = prompts["prompt_v9a"].format(industry="Crypto")

In [96]:
print(prompt)

You are an expert Customer Success Manager working int the Crypto industry. Can you list the most common categories that could exist for a mobile app within this industry?Group similar categories into one, i.e: Customer Support and Customer Services should be grouped under Customer Support.Provide your answer as a list, eg: [category1, category2,..]


In [97]:
res = utils.gemini_query(prompt, gemini_key = GEMINI_API_KEY)

In [98]:
category_list = res.strip("[|]").split(",")

In [99]:
category_list

['Trading & Investing',
 ' Wallet & Custody',
 ' DeFi & Lending',
 ' NFTs & Collectibles',
 '  Analytics & Research',
 '  News & Information',
 '  Security & Compliance',
 '  Education & Onboarding',
 '  Community & Social',
 ' Customer Support',
 ' Regulatory Compliance Tools']

In [111]:
with open("data/1.crypto_category.txt", "r") as f:
    category_list = f.readlines()

In [118]:
category_list = [cat.strip() for cat in category_list[0].split(",")]

In [119]:
category_list

['Security',
 'Usability/UI/UX',
 'Transaction Fees/Speed',
 'Customer Support',
 'Features/Functionality',
 'Account Management',
 'Educational Resources/Onboarding',
 'Wallet Security/Integration',
 'Privacy',
 'Reliability/Stability',
 'Customer Service',
 'Verification/KYC/AML Processes']

## Tag reviews

**Note**: there is a **debug** variable that is currently set up to **True**. This will make the code tag a sample of 20 reviews. If you want to run this against the total reviews, change the vairable to **False**.

In [120]:
test_category = category_list
perc = df.shape[0]//10
topics_gemini = []

In [121]:
debug=True
start_time = time.time()
for i, review in enumerate(df.content):
    prompt2 = prompts["prompt_v9b"].format(industry="Crypto", 
                                                 categories=category_list,
                                                input_text = review )
    topic = utils.gemini_query(prompt2, gemini_key = GEMINI_API_KEY, debug=debug)
    topics_gemini.append(topic)
    if i%perc == 0:
        print(f"{i} out of {len(reviews)} done.")
    

end_time = time.time()
print(f"{i} reviews were processed in {(end_time-start_time)/60} minutes")

0 out of 30 done.
6 out of 30 done.
12 out of 30 done.
Gemini Failed to respond. Sleeping...
Entering recursive step. 1
18 out of 30 done.
24 out of 30 done.
30 out of 30 done.
Gemini Failed to respond. Sleeping...
Entering recursive step. 1
36 out of 30 done.
42 out of 30 done.
48 out of 30 done.
Gemini Failed to respond. Sleeping...
Entering recursive step. 1
Gemini Failed to respond. Sleeping...
Entering recursive step. 1
54 out of 30 done.
59 reviews were processed in 2.60248996814092 minutes


In [125]:
df["gemini_llm_topic"] = topics_gemini

In [126]:
df_o = df.tail(30)
df_n = df.head(30)

Lets look at a few reviews.

In [127]:
for row in range(df.head(10).shape[0]):
        print("App: ", crypto_com, " Tagged Topic: ", df.iloc[row]["gemini_llm_topic"])
        print("Review Text:")
        print()
        print(df_c.iloc[row]["content"])
        print("***************************")
        print()

App:  co.mona.android  Tagged Topic:  Generic feedback
Review Text:

the bugs seemed to have been fixed but pls make it possible to access customer service within the app
***************************

App:  co.mona.android  Tagged Topic:  Usability/UI/UX
Review Text:

can't unlock app
***************************

App:  co.mona.android  Tagged Topic:  Usability/UI/UX
Review Text:

selling people Info
***************************

App:  co.mona.android  Tagged Topic:  Generic feedback
Review Text:

Wild&crasy
***************************

App:  co.mona.android  Tagged Topic:  Features/Functionality
Review Text:

great app. easy to use for a beginner.
***************************

App:  co.mona.android  Tagged Topic:  Customer Support
Review Text:

I purchased a brand new crypto coin today, Walrus (WAL) but I'm not able to find a lot of crypto coins here for some reason unfortunately even though they are popular or brand new, newly released crypto coins.
***************************

App:  co.

## Train a neural net

In [128]:
from utils.utils import TopicNeuralNet

In [130]:
testmodel = TopicNeuralNet(df_o.content, df_o.gemini_llm_topic)

### With no preprocessing and no bootstraping

In [131]:
start = time.time()
testmodel.fit(bootstrapping=False, preprocess=False, epochs=20, factor=100)
end = time.time()
print("Elapsed time: ", np.round((end-start)/60,2),"minutes.")

Bootstrapping by a factor of 1
Normalizing text
Fitting text tokenizer
Fitting label tokenizer
Tokenizing text
Tokenizing labels
Passing text sequences
Setting model architecture
Compiled model
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 2.3024 - val_accuracy: 0.1538 - val_loss: 2.2996
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.7500 - loss: 2.2648 - val_accuracy: 0.1538 - val_loss: 2.2922
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.5000 - loss: 2.2516 - val_accuracy: 0.1154 - val_loss: 2.2853
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.5000 - loss: 2.1959 - val_accuracy: 0.1154 - val_loss: 2.2780
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.5000 - loss: 2.1275 - val_accuracy: 0.1154 - val_loss: 2.2668
Epoch 6/2

In [132]:
y_hat_old = testmodel.predict(df_o.content)
y_hat_new = testmodel.predict(df_o.content)

Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


### With preprocessing and no bootstraping

In [133]:
start = time.time()
testmodel.fit(bootstrapping=False, preprocess=True, epochs=20, factor=100)
end = time.time()
print("Elapsed time: ", np.round((end-start)/60,2),"minutes.")

Bootstrapping by a factor of 1
Normalizing text
Fitting text tokenizer
Fitting label tokenizer
Tokenizing text
Tokenizing labels
Passing text sequences
Setting model architecture
Compiled model
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 2.3044 - val_accuracy: 0.1923 - val_loss: 2.3020
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0000e+00 - loss: 2.3004 - val_accuracy: 0.0769 - val_loss: 2.3032
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2500 - loss: 2.2878 - val_accuracy: 0.1538 - val_loss: 2.3053
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.2500 - loss: 2.2823 - val_accuracy: 0.1538 - val_loss: 2.3082
Epoch 4: early stopping
Elapsed time:  0.06 minutes.


In [134]:
y_hat_old = testmodel.predict(df_o.content)
y_hat_new = testmodel.predict(df_o.content)

Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


### With preprocessing and bootstraping

In [135]:
start = time.time()
testmodel.fit(bootstrapping=True, preprocess=True, epochs=20, factor=100)
end = time.time()
print("Elapsed time: ", np.round((end-start)/60,2),"minutes.")

Bootstrapping by a factor of 100
Normalizing text
Fitting text tokenizer
Fitting label tokenizer
Tokenizing text
Tokenizing labels
Passing text sequences
Setting model architecture
Compiled model
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1000 - loss: 2.3024 - val_accuracy: 0.3090 - val_loss: 2.2785
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step - accuracy: 0.2667 - loss: 2.2778 - val_accuracy: 0.3090 - val_loss: 2.2392
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490ms/step - accuracy: 0.2867 - loss: 2.2414 - val_accuracy: 0.3055 - val_loss: 2.1961
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482ms/step - accuracy: 0.3000 - loss: 2.1989 - val_accuracy: 0.3055 - val_loss: 2.1548
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 530ms/step - accuracy: 0.3044 - loss: 2.1677 - val_accuracy: 0.3055 - val_loss: 2.1176
Epoch 6

In [136]:
y_hat_old = testmodel.predict(df_o.content)
y_hat_new = testmodel.predict(df_o.content)

Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


## Qualitative Eval

In [137]:
dummy_reviews = [
    "Great Customer support!","Bad customer service!", "Worst Customer support!",#customer support examples
    "Full of hidden fees","Loosing a lot of many with the fees", "Other networks charge less per trade",# fees
     "Too many updates!!", "Cant download app", #Reliability
    "App full of scammers","Danger, lot´s of scammers", "Scams!!", #security
     "Very Easy to use", "Friendly UI", "Fortunately I had no issues so far, the app is very intuitive", #usability
     "Nice features","Great so far", "Excellent", ":)", #generic feedback
     "Great app to loose money!"] #sarcasm
dummy_true_label = [
    "Customer Support", "Customer Support", "Customer Support", 
    "Transaction Fees/Speed","Transaction Fees/Speed","Transaction Fees/Speed",
    "Reliability/Stability", "Reliability/Stability",
    "Security","Security","Security",
    "Usability/UI/UX", "Usability/UI/UX", "Usability/UI/UX",
    "Generic feedback", "Generic feedback", "Generic feedback","Generic feedback",
    "Generic feedback"
]
    
dummy_predictions = testmodel.predict(dummy_reviews)
pd.DataFrame({"dummy_reviews":dummy_reviews, "dummy_true_labels":dummy_true_label,  "predictions":dummy_predictions})

Normalizing text
Tokenizing text
Passing text sequences
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step


Unnamed: 0,dummy_reviews,dummy_true_labels,predictions
0,Great Customer support!,Customer Support,Generic feedback
1,Bad customer service!,Customer Support,Generic feedback
2,Worst Customer support!,Customer Support,Generic feedback
3,Full of hidden fees,Transaction Fees/Speed,Generic feedback
4,Loosing a lot of many with the fees,Transaction Fees/Speed,Generic feedback
5,Other networks charge less per trade,Transaction Fees/Speed,Generic feedback
6,Too many updates!!,Reliability/Stability,Generic feedback
7,Cant download app,Reliability/Stability,Usability/UI/UX
8,App full of scammers,Security,Usability/UI/UX
9,"Danger, lot´s of scammers",Security,Generic feedback
