In [24]:
from tensorflow.keras.datasets import reuters
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import time

import os
from dotenv import load_dotenv
import yaml
from utils import utils

## Import Sample data

In [2]:
df = pd.read_csv("./data/1.crypto_apps_reviews_raw.csv")

In [3]:
load_dotenv()
GEMINI_API_KEY=os.getenv("GEMINI_NEW")

## Import Prompts

In [4]:
with open("prompts/prompts.yaml") as file:
    try:
        prompts = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)

## Import Category List

In [5]:
with open("data/1.crypto_category.txt", "r") as f:
    category_list = f.readlines()

In [6]:
category_list = category_list[0].split(",")

In [7]:
category_list

['Security',
 '  Usability/UI/UX',
 '  Transaction Fees/Speed',
 '  Customer Support',
 '  Features/Functionality',
 '  Account Management',
 '  Educational Resources/Onboarding',
 '  Wallet Security/Integration',
 '  Privacy',
 '  Reliability/Stability',
 '  Customer Service',
 '  Verification/KYC/AML Processes']

In [8]:
print("Number of categories: ", len(category_list))

Number of categories:  12


## Tag the entire dataset

In [26]:
import time

In [27]:
prompts["prompt_v9b"]

"You are an expert Customer Success Manager working in the {industry} Industry.You are tasked with categorizing a list of user reviews for further analysis. Please assign this review: {input_text}\nTo one of the following categories: {categories}If the review is just an expression of sentiment (eg: Great!, Bad!, etc). Please use the 'Generic feedback' category.\nYour answer should be a single category name."

In [28]:
test_category = category_list
perc = df.shape[0]//10
topics_gemini = []

In [38]:
start_time = time.time()
debug=True
n_samples= 20

if debug: 
    df_c = df.sample(n_samples).copy()
    reviews = df_c.content
else:
    df_c = df.copy()
    reviews = df_c.content
    
for i, review in enumerate(reviews):
    if debug:
        print("Tagging Review #", i+1)
    prompt2 = prompts["prompt_v9b"].format(industry="Crypto", 
                                                 categories=category_list,
                                                input_text = review )
    topic = utils.gemini_query(prompt2, gemini_key = GEMINI_API_KEY, debug=debug)
    topics_gemini.append(topic)
    if i%perc == 0:
        print(f"{i} out of {len(reviews)} done.")
    

end_time = time.time()
print(f"{i} reviews were processed in {(end_time-start_time)/60} minutes")

Tagging Review # 1
0 out of 20 done.
Tagging Review # 2
Tagging Review # 3
Tagging Review # 4
Tagging Review # 5
Tagging Review # 6
Tagging Review # 7
Tagging Review # 8
Tagging Review # 9
Tagging Review # 10
Tagging Review # 11
Tagging Review # 12
Tagging Review # 13
Tagging Review # 14
Tagging Review # 15
Tagging Review # 16
Tagging Review # 17
Tagging Review # 18
Tagging Review # 19
Tagging Review # 20
Gemini Failed to respond. Sleeping...
Entering recursive step. 1
19 reviews were processed in 0.9379808306694031 minutes


In [39]:
len(topics_gemini)

40

In [40]:
df_s = df_c.iloc[:n_samples,].copy()

In [41]:
df_s.loc[:, "gemini_llm_topic"] = topics_gemini

ValueError: Length of values (40) does not match length of index (20)

In [None]:
if debug:
    for row in range(df_s.shape[0]):
        print("App: ", df_s.iloc[row]["app"], " Tagged Topic: ", df_s.iloc[row]["gemini_llm_topic"])
        print("Review Text:")
        print()
        print(df_s.iloc[row]["content"])
        print("***************************")
        print()