In [144]:
import pandas as pd

data_set = pd.read_csv('../data/amazon_alexa.tsv', sep='\t')
data_set.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [145]:
data=data_set[['verified_reviews','feedback']]
data.columns = ['review','label']

data.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [146]:
# count occs of each label
label_counts = data.value_counts('label')


# get the nb of rows to drop from the majority class
rows_to_drop = label_counts.max()- label_counts.min()

#drop rows from the majority class
if rows_to_drop>0:
    data_majority = data[data['label']==1]
    data_balanced = data.drop(data_majority.sample(rows_to_drop).index)
else:
    data_balanced = data.copy()

#check the new class balance
print(data_balanced['label'].value_counts())


label
1    257
0    257
Name: count, dtype: int64


### data processing

In [147]:
import re

def clean_text(text):
    # Check if the input is a string
    if not isinstance(text, str):
        return ""  # or return text if you want to leave non-string values unchanged
    
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove single chars
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Lowercase text
    text = text.lower()

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Trim leading and trailing space
    text = text.strip()

    return text


In [148]:
data_balanced.head()


Unnamed: 0,review,label
35,I bought this to compare the speaker quality t...,1
45,Very satisfied with the Echo. Alexa now contro...,1
46,"It's like Siri, in fact, Siri answers more acc...",0
62,Did t really know what I could use this device...,1
65,How easy if was to set up.,1


In [149]:
# extract review column as a list
reviews = data_balanced['review'].tolist()

#clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# add cleaned reviews as a new column to the dataframe
data_balanced['clean_review'] = cleaned_reviews



### data split

In [150]:
total_rows = len(data_balanced)
test_size = int(total_rows*0.95)

# randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

#get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

### sentinent w/ LLM


In [151]:
import pathlib
import textwrap

import google.generativeai as genai 


In [152]:
from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))




In [153]:
from dotenv import load_dotenv
import os

load_dotenv()

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

if GOOGLE_API_KEY is None:
    print("GOOGLE_API_KEY environment variable not set.")
else:
    genai.configure(api_key=GOOGLE_API_KEY)


In [154]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/learnlm-1.5-pro-experimental
models/gemini-exp-1114
models/gemini-exp-1121
models/gemini-exp-1206


In [155]:
model = genai.GenerativeModel('gemini-pro')

In [156]:
response= model.generate_content('What is the meaning of life?')
to_markdown(response.text)


> The question of the meaning of life has been pondered by philosophers, theologians, and scientists for centuries, and there is no one definitive answer that is universally accepted. However, some common themes that have emerged include:
> 
> * **Purpose:** Many people believe that life has a purpose or goal, whether it is to find happiness, make a difference in the world, or fulfill a specific destiny.
> * **Meaning through relationships:** Relationships with other people, including family, friends, and romantic partners, can provide a sense of purpose and meaning to life.
> * **Contribution:** Feeling like you are making a contribution to society or to others can give life a sense of meaning. This could involve working in a fulfilling career, volunteering, or simply being there for those you care about.
> * **Growth and learning:** Some people find meaning in the pursuit of knowledge, personal growth, and self-improvement.
> * **Experiences:** Life can be full of rich and rewarding experiences, such as travel, exploration, and the arts. These experiences can provide a sense of fulfillment and meaning.
> * **Legacy:** Some people find meaning in leaving a legacy or making an impact that will outlast them. This could involve raising a family, creating a work of art, or contributing to a cause that you believe in.
> 
> Ultimately, the meaning of life is unique to each individual and can change over time. It is a personal journey of discovery that can be both challenging and rewarding.

### single api call

In [157]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label']=''

test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
2753,"Works beautifully, excellent sound.",1,works beautifully excellent sound,
434,&#34;NEVER BUY CERTIFIED AND REFURBISHED ECHO ...,0,34 never buy certified and refurbished echo do...,
141,Not much features.,0,not much features,
868,BEST father's day gift. Dad joked to my mom th...,1,best father day gift dad joked to my mom that ...,
1277,ALEXA IS NOT AS SMART AS SIRI! CANNOT CANCEL S...,0,alexa is not as smart as siri cannot cancel sn...,
2368,It worked as a charmed.,1,it worked as charmed,
3135,I loved it does exactly what it says,1,loved it does exactly what it says,
954,Easy to setup. Love all the things she does.,1,easy to setup love all the things she does,
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,
2851,I didn't like that almost everytime i asked Al...,0,didn like that almost everytime asked alexa qu...,


### convert df to json using to_json() method 

In [158]:
json_data = test_set_sample[['clean_review','pred_label']].to_json(orient='records')
print(json_data)

[{"clean_review":"works beautifully excellent sound","pred_label":""},{"clean_review":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 echo dot this unit is very poor in receiving command and sometimes it does not even acknowledge the command never buy refurbished echo dot the unit received was probably never was certified and refurbished as indicated also bought new echo dot it works perfectly very disappointed","pred_label":""},{"clean_review":"not much features","pred_label":""},{"clean_review":"best father day gift dad joked to my mom that alexa will be the one listening to all of his repeated stories going forward","pred_label":""},{"clean_review":"alexa is not as smart as siri cannot cancel snooze after alarm goes off have to cancel pre set alarm for weekdays","pred_label":""},{"clean_review":"it worked as charmed","pred_label":""},{"clean_review":"loved it does exactly what it says","pred_label":""},{"clean_review":"easy to setup love al

In [159]:
prompt = f"""
You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
{json_data}
""" 
print(prompt)


You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
[{"clean_review":"works beautifully excellent sound","pred_label":""},{"clean_review":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 echo dot this unit is very poor in receiving command and sometimes it does not even acknowledge the command never buy refurbished echo dot the unit received was probably never was certified and refurbished as indicated also bought new echo dot it works perfectly very disappointed","pred_label":""},{"clean_review":"not much features","p

In [160]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_review":"works beautifully excellent sound","pred_label":1},{"clean_review":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 echo dot this unit is very poor in receiving command and sometimes it does not even acknowledge the command never buy refurbished echo dot the unit received was probably never was certified and refurbished as indicated also bought new echo dot it works perfectly very disappointed","pred_label":0},{"clean_review":"not much features","pred_label":0},{"clean_review":"best father day gift dad joked to my mom that alexa will be the one listening to all of his repeated stories going forward","pred_label":1},{"clean_review":"alexa is not as smart as siri cannot cancel snooze after alarm goes off have to cancel pre set alarm for weekdays","pred_label":0},{"clean_review":"it worked as charmed","pred_label":1},{"clean_review":"loved it does exactly what it says","pred_label":1},{"clean_review":"easy to setup love all t

In [161]:
import json

#clean the data by stripping the backticks
json_data = response.text.strip("`")

try:
    data_ = json.loads(json_data)
    df_sample = pd.DataFrame(data_)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    df_sample = pd.DataFrame()

df_sample

Unnamed: 0,clean_review,pred_label
0,works beautifully excellent sound,1
1,34 never buy certified and refurbished echo do...,0
2,not much features,0
3,best father day gift dad joked to my mom that ...,1
4,alexa is not as smart as siri cannot cancel sn...,0
5,it worked as charmed,1
6,loved it does exactly what it says,1
7,easy to setup love all the things she does,1
8,for the price the product is nice quality and ...,0
9,didn like that almost everytime asked alexa qu...,0


In [162]:
test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
2753,"Works beautifully, excellent sound.",1,works beautifully excellent sound,1
434,&#34;NEVER BUY CERTIFIED AND REFURBISHED ECHO ...,0,34 never buy certified and refurbished echo do...,0
141,Not much features.,0,not much features,0
868,BEST father's day gift. Dad joked to my mom th...,1,best father day gift dad joked to my mom that ...,1
1277,ALEXA IS NOT AS SMART AS SIRI! CANNOT CANCEL S...,0,alexa is not as smart as siri cannot cancel sn...,0
2368,It worked as a charmed.,1,it worked as charmed,1
3135,I loved it does exactly what it says,1,loved it does exactly what it says,1
954,Easy to setup. Love all the things she does.,1,easy to setup love all the things she does,1
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,0
2851,I didn't like that almost everytime i asked Al...,0,didn like that almost everytime asked alexa qu...,0


In [163]:
#plotting confusion matrix for prediction
from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample['pred_label']

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))



Confusion Matrix:
[[ 9  0]
 [ 1 10]]


In [164]:
import google.generativeai as genai

def get_completion(prompt, model="gemini-pro"):    
    # Create the model
    model = genai.GenerativeModel(model)
    
    # Generate the response
    response = model.generate_content(prompt)
    
    return response.text

In [165]:
#test
prompt = "Why is the sky blue?"

response = get_completion(prompt)
print(response)

The sky appears blue due to a phenomenon called **Rayleigh scattering**.

* **Sunlight consists of all colors:** When sunlight enters our atmosphere, it is composed of all colors in the visible spectrum.
* **Scattering by air molecules:** Air molecules (primarily nitrogen and oxygen) are much smaller than the wavelength of visible light. When sunlight hits these molecules, it scatters in all directions.
* **Shorter wavelengths scatter more:** The amount of scattering depends on the wavelength of light. Shorter wavelengths (blue and violet) scatter more than longer wavelengths (red and orange).
* **Blue light dominates:** Since blue light scatters more, it is scattered in all directions. Therefore, when we look up at the sky, we see more blue light than any other color.
* **Sun's position:** The sky is bluest at the zenith (directly overhead) because the sunlight has to travel the least amount of atmosphere to reach our eyes. Towards the horizon, more scattering occurs, resulting in a w

### Batching API Calls ( Single Shot)

In [166]:
test_set.shape

(488, 3)

In [167]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_review,pred_label
544,The echo dot disconnects from the speaker abou...,0,the echo dot disconnects from the speaker abou...,
361,This thing barely works. You have to select 3r...,0,this thing barely works you have to select 3rd...,
2972,Love it,1,love it,
566,The second one which was a refurbished model d...,0,the second one which was refurbished model did...,
155,I like the fact that I can get any kind of mus...,1,like the fact that can get any kind of music f...,
...,...,...,...,...
1414,"Prime day pricing was good, but the show is ve...",1,prime day pricing was good but the show is ver...,
469,Works great just like the new one that i alrea...,1,works great just like the new one that already...,
1112,,0,,
92,Good sound works well,1,good sound works well,


In [168]:
batches = []
batch_size = 25
for i in range(0, len(test_set_total), batch_size):
    batches.append(test_set_total[i : i + batch_size]) # append batches instead of assigning

In [None]:
import time
import json

def gemini_completion_function(batch, current_batch, total_batch):
    """Function works in three steps:
    # Step-1: Convert the DataFrame to JSON using the to_json() method.
    # Step-2: Preparing the Gemini Prompt
    # Step-3: Calling Gemini API
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")
    
    # Convert DataFrame to JSON
    json_data = batch[['clean_review','pred_label']].to_json(orient='records')
    
    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
    Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
    Customer reviews are provided between three backticks below.
    In your output, only return the Json code back as output - which is provided between three backticks.
    Your task is to update predicted labels under 'pred_label' in the Json code.
    Don't make any changes to Json code format, please.
    Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
    
    ```
    {json_data}
    ```
    """
    
    try:
        # Generate content using Gemini
        response = model.generate_content(prompt)
        
        # Add delay to manage rate limits
        time.sleep(5)
        
        # Return the text response
        return response.text
    
    except Exception as e:
        print(f"Error in Gemini API call for batch {current_batch+1}: {e}")
        # Return original JSON data if API call fails
        return json_data

In [170]:
batch_count = len(batches)
responses = []

for i in range(0,batch_count):
  responses.append(gemini_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 4
Now processing batch#: 2 of 4
Now processing batch#: 3 of 4
Now processing batch#: 4 of 4


In [None]:

df_total = pd.DataFrame()  # empty df

for response in responses:
    if isinstance(response, str):
        json_data = response.strip("`")
    else:
        json_data = response.text.strip("`")
    data = json.loads(json_data)
    
    df_temp = pd.DataFrame(data)
    
    df_total = pd.concat([df_total, df_temp], ignore_index=True)

display(df_total)

Unnamed: 0,clean_review,pred_label
0,the echo dot disconnects from the speaker abou...,0
1,this thing barely works you have to select 3rd...,0
2,love it,1
3,the second one which was refurbished model did...,0
4,like the fact that can get any kind of music f...,1
...,...,...
95,prime day pricing was good but the show is ver...,0
96,works great just like the new one that already...,1
97,,0
98,good sound works well,1


In [None]:

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_review,pred_label
544,The echo dot disconnects from the speaker abou...,0,the echo dot disconnects from the speaker abou...,0
361,This thing barely works. You have to select 3r...,0,this thing barely works you have to select 3rd...,0
2972,Love it,1,love it,1
566,The second one which was a refurbished model d...,0,the second one which was refurbished model did...,0
155,I like the fact that I can get any kind of mus...,1,like the fact that can get any kind of music f...,1
...,...,...,...,...
1414,"Prime day pricing was good, but the show is ve...",1,prime day pricing was good but the show is ver...,0
469,Works great just like the new one that i alrea...,1,works great just like the new one that already...,1
1112,,0,,0
92,Good sound works well,1,good sound works well,1


In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]


print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[52  1]
 [ 4 43]]

Accuracy: 0.95
