In [1]:
from tensorflow.keras.datasets import reuters
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import time

import os
from dotenv import load_dotenv
import yaml
from utils import utils
import time

## Import Sample data

In [2]:
df = pd.read_csv("./data/1.crypto_apps_reviews_raw.csv")

In [3]:
load_dotenv()
GEMINI_API_KEY=os.getenv("GEMINI_NEW")

## Import Prompts

In [4]:
with open("prompts/prompts.yaml") as file:
    try:
        prompts = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)

## Import Category List

In [5]:
with open("data/1.crypto_category.txt", "r") as f:
    category_list = f.readlines()

In [6]:
category_list = category_list[0].split(",")

In [7]:
category_list

['Security',
 '  Usability/UI/UX',
 '  Transaction Fees/Speed',
 '  Customer Support',
 '  Features/Functionality',
 '  Account Management',
 '  Educational Resources/Onboarding',
 '  Wallet Security/Integration',
 '  Privacy',
 '  Reliability/Stability',
 '  Customer Service',
 '  Verification/KYC/AML Processes']

In [8]:
print("Number of categories: ", len(category_list))

Number of categories:  12


## Tagging an entire dataset

To tag a dataset, we will leverage the "category_list" that was previously generated and feed it to our second prompt.

Below is what the second prompt looks like.
Not that it contains 3 variables that need inputed:
* {industry}: the industry of the Company that the reviews belong to.
* {input_text}: these are the reviews. Its named "input_text" because this potently could be any type of input, not just reviews.
* {categories}: the categories to which the "input_text" needs to be assigned to.

In [9]:
print(prompts["prompt_v9b"])

You are an expert Customer Success Manager working in the {industry} Industry.You are tasked with categorizing a list of user reviews for further analysis. Please assign this review: {input_text}
To one of the following categories: {categories}If the review is just an expression of sentiment (eg: Great!, Bad!, etc). Please use the 'Generic feedback' category.
Your answer should be a single category name.


**Note**: there is a **debug** variable that is currently set up to **True**. This will make the code tag a sample of 20 reviews. If you want to run this against the total reviews, change the vairable to **False**.

In [10]:
debug=True
test_category = category_list
perc = df.shape[0]//10
topics_gemini = []

In [11]:
start_time = time.time()
n_samples= 20

if debug: 
    df_c = df.sample(n_samples).copy()
    reviews = df_c.content
else:
    df_c = df.copy()
    reviews = df_c.content
    
for i, review in enumerate(reviews):
    if debug:
        print("Tagging Review #", i+1)
    prompt2 = prompts["prompt_v9b"].format(industry="Crypto", 
                                                 categories=category_list,
                                                input_text = review )
    topic = utils.gemini_query(prompt2, gemini_key = GEMINI_API_KEY, debug=debug)
    topics_gemini.append(topic)
    if i%perc == 0:
        print(f"{i} out of {len(reviews)} done.")
    

end_time = time.time()
print(f"{i} reviews were processed in {(end_time-start_time)/60} minutes")

Tagging Review # 1
0 out of 20 done.
Tagging Review # 2
Tagging Review # 3
Tagging Review # 4
Tagging Review # 5
Tagging Review # 6
Tagging Review # 7
Tagging Review # 8
Tagging Review # 9
Tagging Review # 10
Tagging Review # 11
Tagging Review # 12
Tagging Review # 13
Tagging Review # 14
Tagging Review # 15
Tagging Review # 16
Tagging Review # 17
Tagging Review # 18
Tagging Review # 19
Tagging Review # 20
19 reviews were processed in 0.35275622208913165 minutes


The code above shows the recursive feature of the *gemini_query* function, this is the function that makes the call to the Gemini API.

If the API call failes, the function is set to:
* Wait 10 seconds
* Call itself (this is the recursive feature)


By the default, the function will call itself up to 3 tries.

If a call fails more than 3 times in a row it will return a "gemini failed" string.

In [12]:
len(topics_gemini)

20

In [13]:
df_c.loc[:, "gemini_llm_topic"] = topics_gemini

The code below will only run when the **debug** is set to **True**.

In [15]:
if debug:
    for row in range(df_c.shape[0]):
        print("App: ", df_c.iloc[row]["app"], " Tagged Topic: ", df_c.iloc[row]["gemini_llm_topic"])
        print("Review Text:")
        print()
        print(df_c.iloc[row]["content"])
        print("***************************")
        print()

App:  com.bybit.app  Tagged Topic:  Generic feedback
Review Text:

Nothing much to say
***************************

App:  com.bybit.app  Tagged Topic:  Wallet Security/Integration
Review Text:

Scam can't withdraw my crypto on my web3 wallet, when you try and confirm a transaction it does nothing
***************************

App:  com.bybit.app  Tagged Topic:  Generic feedback
Review Text:

good
***************************

App:  com.binance.dev  Tagged Topic:  Generic feedback
Review Text:

good
***************************

App:  com.binance.dev  Tagged Topic:  Generic feedback
Review Text:

Excellent
***************************

App:  com.bybit.app  Tagged Topic:  Verification/KYC/AML Processes
Review Text:

Verification got me stressed out of mind get bank statements and still u guys not accept it is crazy
***************************

App:  com.binance.dev  Tagged Topic:  Customer Support
Review Text:

good service
***************************

App:  co.mona.android  Tagged Topic:  C

## Summary

The full data set was already tagged when building the protype.

The output of the fully tagged dataset resides in the **"/data/2.crypto_app_reviews_tagged.csv"** file.
