### necessary imports 

In [206]:
import pandas as pd
import re
import pathlib
import textwrap
import google.generativeai as genai 
from IPython.display import display
from IPython.display import Markdown
from dotenv import load_dotenv
import os
import time
import json
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import resample


In [207]:

data_set = pd.read_csv('../data/amazon_alexa.tsv', sep='\t')

data_set.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [208]:
data=data_set[['verified_reviews','feedback']]
data.columns = ['review','label']

data.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [209]:
# count occs of each label
label_counts = data.value_counts('label')
# Split the data into minority and majority class
minority_class = data[data['label'] == 0]
majority_class = data[data['label'] == 1]

# Downsample majority class
majority_class_downsampled = resample(majority_class, 
                                      replace=False,     
                                      n_samples=len(minority_class),  # match minority class size
                                      random_state=42)   # for reproducibility

# Combine the balanced dataset
data_balanced = pd.concat([minority_class, majority_class_downsampled])
print(data_balanced['label'].value_counts())


label
0    257
1    257
Name: count, dtype: int64


### data processing

In [210]:
def clean_text(text):
    # Check if the input is a string
    if not isinstance(text, str):
        return ""  # or return text if you want to leave non-string values unchanged
    
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove single chars
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip().lower()

    return text


In [211]:
data_balanced.head()


Unnamed: 0,review,label
46,"It's like Siri, in fact, Siri answers more acc...",0
111,Sound is terrible if u want good music too get...,0
141,Not much features.,0
162,"Stopped working after 2 weeks ,didn't follow c...",0
176,Sad joke. Worthless.,0


In [212]:
# extract review column as a list
reviews = data_balanced['review'].tolist()

#clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# add cleaned reviews as a new column to the dataframe
data_balanced['clean_review'] = cleaned_reviews



### data split

In [213]:
total_rows = len(data_balanced)
test_size = int(total_rows*0.95)

# randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

#get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

### sentinent w/ LLM


In [214]:
def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))




In [215]:
#API config 


load_dotenv()

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

if not GOOGLE_API_KEY:
    print("GOOGLE_API_KEY environment variable not set.")

genai.configure(api_key=GOOGLE_API_KEY)


In [216]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/learnlm-1.5-pro-experimental
models/gemini-exp-1114
models/gemini-exp-1121
models/gemini-exp-1206


In [217]:
model = genai.GenerativeModel('gemini-pro')

In [218]:
response= model.generate_content('What is the meaning of life?')
to_markdown(response.text)


> The meaning of life is a deeply personal and philosophical question that has been pondered by humans for centuries. There is no single, universally accepted answer, as different individuals may find meaning in various aspects of their existence. Some common perspectives on the meaning of life include:
> 
> **Purpose and Goals:** Some believe that life's meaning lies in fulfilling a predetermined purpose or achieving specific goals. This could entail pursuing a particular career, raising a family, or making significant contributions to society.
> 
> **Personal Fulfillment and Happiness:** Others emphasize the importance of personal fulfillment and happiness. They believe that the meaning of life is to live in accordance with one's values, passions, and interests, and to seek experiences that bring joy and satisfaction.
> 
> **Connection and Relationships:** Many find meaning in their relationships with others. They believe that life is about fostering meaningful connections with loved ones, friends, and the broader community.
> 
> **Growth and Learning:** Some view the meaning of life as a journey of continuous growth and learning. They believe that life is an opportunity to explore new experiences, acquire knowledge, and develop both personally and intellectually.
> 
> **Contribution to Something Greater:** Others believe that life's meaning lies in contributing to something greater than oneself. This could involve working towards social justice, environmental sustainability, or simply making a positive difference in the lives of others.
> 
> **Spirituality and Faith:** For some, the meaning of life is rooted in spirituality or faith. They believe that life is part of a larger cosmic or divine plan, and that their purpose is to connect with the sacred and live in harmony with the universe.
> 
> **Experiencing the Present Moment:** Many emphasize the importance of living in the present moment. They believe that meaning can be found in appreciating the everyday experiences, relationships, and beauty that surround us.
> 
> **Ultimately, the meaning of life is subjective and personal. There is no right or wrong answer, and it is up to each individual to discover what gives their life purpose, fulfillment, and significance.

### single api call

In [219]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label']=''

test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
1291,"item returned for repair ,receivded item back ...",0,item returned for repair receivded item back f...,
1015,The sound quality is good just wish alexa coul...,1,the sound quality is good just wish alexa coul...,
565,Worked like new. Will do it again.,1,worked like new will do it again,
1473,Do not understand all the excitement over thes...,0,do not understand all the excitement over thes...,
1200,Meh,0,meh,
387,It works great and sounds good!,1,it works great and sounds good,
1382,For the size of the screen this thing should b...,0,for the size of the screen this thing should b...,
1727,The tech isn’t there. It doesn’t understand mo...,0,the tech isn there it doesn understand most co...,
2964,Ask it to play Motown radio on Pandora and it ...,0,ask it to play motown radio on pandora and it ...,
2924,I jumped on the bandwagon by purchasing an Ech...,1,jumped on the bandwagon by purchasing an echo ...,


### convert df to json using to_json() method 

In [220]:
json_data = test_set_sample[['clean_review','pred_label']].to_json(orient='records')
print(json_data)

[{"clean_review":"item returned for repair receivded item back from repair 07 23 18 parts missing no power cord included please advise","pred_label":""},{"clean_review":"the sound quality is good just wish alexa could answer more questions","pred_label":""},{"clean_review":"worked like new will do it again","pred_label":""},{"clean_review":"do not understand all the excitement over these alexa devices or the google ones either it rarely gets what say correct does not control the dish receiver yes set it up correctly using the alexa app on my phone we got this for 129 which is good deal ti is definately not worth the 229 price though maybe in another month it will get better or either have one that is just deaf and dumb really can not recommend this item at the current time","pred_label":""},{"clean_review":"meh","pred_label":""},{"clean_review":"it works great and sounds good","pred_label":""},{"clean_review":"for the size of the screen this thing should be 25","pred_label":""},{"clean

In [221]:
prompt = f"""
You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
{json_data}
""" 
print(prompt)


You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
[{"clean_review":"item returned for repair receivded item back from repair 07 23 18 parts missing no power cord included please advise","pred_label":""},{"clean_review":"the sound quality is good just wish alexa could answer more questions","pred_label":""},{"clean_review":"worked like new will do it again","pred_label":""},{"clean_review":"do not understand all the excitement over these alexa devices or the google ones either it rarely gets what say correct does not control the dish receiver yes set it up co

In [222]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_review":"item returned for repair receivded item back from repair 07 23 18 parts missing no power cord included please advise","pred_label":0},{"clean_review":"the sound quality is good just wish alexa could answer more questions","pred_label":1},{"clean_review":"worked like new will do it again","pred_label":1},{"clean_review":"do not understand all the excitement over these alexa devices or the google ones either it rarely gets what say correct does not control the dish receiver yes set it up correctly using the alexa app on my phone we got this for 129 which is good deal ti is definately not worth the 229 price though maybe in another month it will get better or either have one that is just deaf and dumb really can not recommend this item at the current time","pred_label":0},{"clean_review":"meh","pred_label":0},{"clean_review":"it works great and sounds good","pred_label":1},{"clean_review":"for the size of the screen this thing should be 25","pred_label":0},{"clean_re

In [223]:

#clean the data by stripping the backticks
json_data = response.text.strip("`")

try:
    data_ = json.loads(json_data)
    df_sample = pd.DataFrame(data_)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    df_sample = pd.DataFrame()

df_sample

Unnamed: 0,clean_review,pred_label
0,item returned for repair receivded item back f...,0
1,the sound quality is good just wish alexa coul...,1
2,worked like new will do it again,1
3,do not understand all the excitement over thes...,0
4,meh,0
5,it works great and sounds good,1
6,for the size of the screen this thing should b...,0
7,the tech isn there it doesn understand most co...,0
8,ask it to play motown radio on pandora and it ...,0
9,jumped on the bandwagon by purchasing an echo ...,1


In [224]:
test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
1291,"item returned for repair ,receivded item back ...",0,item returned for repair receivded item back f...,0
1015,The sound quality is good just wish alexa coul...,1,the sound quality is good just wish alexa coul...,1
565,Worked like new. Will do it again.,1,worked like new will do it again,1
1473,Do not understand all the excitement over thes...,0,do not understand all the excitement over thes...,0
1200,Meh,0,meh,0
387,It works great and sounds good!,1,it works great and sounds good,1
1382,For the size of the screen this thing should b...,0,for the size of the screen this thing should b...,0
1727,The tech isn’t there. It doesn’t understand mo...,0,the tech isn there it doesn understand most co...,0
2964,Ask it to play Motown radio on Pandora and it ...,0,ask it to play motown radio on pandora and it ...,0
2924,I jumped on the bandwagon by purchasing an Ech...,1,jumped on the bandwagon by purchasing an echo ...,1


In [225]:

def evaluate_model(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    print("Confusion Matrix:\n", cm)
    print(f"Accuracy: {accuracy}")

In [226]:

y_true = test_set_sample["label"]
y_pred = test_set_sample['pred_label']

evaluate_model(y_true, y_pred)



Confusion Matrix:
 [[11  1]
 [ 0  8]]
Accuracy: 0.95


In [227]:

def get_completion(prompt, model="gemini-pro"):    
    # Create the model
    model = genai.GenerativeModel(model)
    
    # Generate the response
    response = model.generate_content(prompt)
    
    return response.text

In [228]:
#test
prompt = "Why is the sky blue?"

response = get_completion(prompt)
to_markdown(response)

> The sky appears blue due to a phenomenon called Rayleigh scattering. Here's the explanation:
> 
> * **Sunlight:** Sunlight is composed of all colors of the visible spectrum (red, orange, yellow, green, blue, indigo, and violet).
> 
> * **Wavelengths and Scattering:** When sunlight enters the Earth's atmosphere, it interacts with molecules of nitrogen and oxygen. These molecules are much smaller than the wavelengths of visible light.
> 
> * **Rayleigh Scattering:** Due to the size difference, the shorter, blue wavelengths of light are scattered more effectively than the longer, red wavelengths. This effect is known as Rayleigh scattering.
> 
> * **Blue Dominance:** As sunlight passes through the atmosphere, the blue wavelengths are scattered in all directions. This scattered blue light reaches our eyes from every direction, making the sky appear blue.
> 
> * **Sunsets and Sunrises:** At sunrise and sunset, the sunlight has to travel through a greater thickness of the atmosphere to reach our eyes. This results in more scattering of blue light. The other colors, especially red and orange, are less scattered and reach our eyes directly, giving the sky its characteristic colors during these times.
> 
> In summary, the sky appears blue because the atmosphere's molecules scatter blue wavelengths of sunlight more effectively than other wavelengths.

### Batching API Calls ( Single Shot)

In [229]:
test_set.shape

(488, 3)

In [230]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_review,pred_label
1499,it does what it needs to.,1,it does what it needs to,
1487,Very disappointed. Alexa doesn’t hear me and ...,0,very disappointed alexa doesn hear me and does...,
2915,Great product - understands my accent!,1,great product understands my accent,
327,The echo doesn’t come with a built in battery ...,1,the echo doesn come with built in battery so i...,
2665,I am having real difficulty working with the E...,0,am having real difficulty working with the ech...,
...,...,...,...,...
74,She doesn’t always listen,1,she doesn always listen,
2745,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...,
1671,Easy to set up. Plug it in then make some sele...,1,easy to set up plug it in then make some selec...,
2162,Was loving it but starting in June Hulu stoppe...,0,was loving it but starting in june hulu stoppe...,


In [231]:
batches = []
batch_size = 25
for i in range(0, len(test_set_total), batch_size):
    batches.append(test_set_total[i : i + batch_size]) # append batches instead of assigning

In [232]:


def process_batch(batch):    
    # Convert DataFrame to JSON
    json_data = batch[['clean_review','pred_label']].to_json(orient='records')
    
    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
    Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
    Customer reviews are provided between three backticks below.
    In your output, only return the Json code back as output - which is provided between three backticks.
    Your task is to update predicted labels under 'pred_label' in the Json code.
    Don't make any changes to Json code format, please.
    Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
    
    ```
    {json_data}
    ```
    """
    
    try:
        # Generate content using Gemini
        response = model.generate_content(prompt)
        
        # Add delay to manage rate limits
        time.sleep(5)
        
        # Return the text response
        return response.text.strip("`")   
    except Exception as e:
        print(f"Error : {e}")
        # Return original JSON data if API call fails
        return json_data
    


In [233]:
def process_batches(test_set, batch_size=25):
    batches = [test_set[i:i+batch_size]for i in range(0,len(test_set),batch_size)]
    responses = [process_batch(batch) for batch in batches]
    return pd.concat([pd.DataFrame(json.loads(response)) for response in responses], ignore_index=True)




In [None]:
df_total = process_batches(test_set_total)
test_set_total['pred_label']=df_total['pred_label'].values

In [None]:
y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]


print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[52  1]
 [ 4 43]]

Accuracy: 0.95
