In [1]:
import pandas as pd

data_set = pd.read_csv('../data/amazon_alexa.tsv', sep='\t')
data_set.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [2]:
data=data_set[['verified_reviews','feedback']]
data.columns = ['review','label']

data.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [3]:
# count occs of each label
label_counts = data.value_counts('label')


# get the nb of rows to drop from the majority class
rows_to_drop = label_counts.max()- label_counts.min()

#drop rows from the majority class
if rows_to_drop>0:
    data_majority = data[data['label']==1]
    data_balanced = data.drop(data_majority.sample(rows_to_drop).index)
else:
    data_balanced = data.copy()

#check the new class balance
print(data_balanced['label'].value_counts())


label
1    257
0    257
Name: count, dtype: int64


### data processing

In [4]:
import re

def clean_text(text):
    # Check if the input is a string
    if not isinstance(text, str):
        return ""  # or return text if you want to leave non-string values unchanged
    
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove single chars
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Lowercase text
    text = text.lower()

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Trim leading and trailing space
    text = text.strip()

    return text


In [5]:
data_balanced.head()


Unnamed: 0,review,label
3,I have had a lot of fun with this thing. My 4 ...,1
13,"Love, Love, Love!!",1
24,"I got a second unit for the bedroom, I was exp...",1
46,"It's like Siri, in fact, Siri answers more acc...",0
49,No different than Apple. To play a specific li...,1


In [6]:
# extract review column as a list
reviews = data_balanced['review'].tolist()

#clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# add cleaned reviews as a new column to the dataframe
data_balanced['clean_review'] = cleaned_reviews



### data split

In [7]:
total_rows = len(data_balanced)
test_size = int(total_rows*0.95)

# randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

#get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

### sentinent w/ LLM


In [8]:
import pathlib
import textwrap

import google.generativeai as genai 


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))




In [10]:
from dotenv import load_dotenv
import os

load_dotenv()

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

if GOOGLE_API_KEY is None:
    print("GOOGLE_API_KEY environment variable not set.")
else:
    genai.configure(api_key=GOOGLE_API_KEY)


In [11]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/learnlm-1.5-pro-experimental
models/gemini-exp-1114
models/gemini-exp-1121


In [12]:
model = genai.GenerativeModel('gemini-pro')

In [13]:
response= model.generate_content('What is the meaning of life?')
to_markdown(response.text)


> The meaning of life is a fundamental question that has been contemplated by philosophers, theologians, and individuals throughout history. There is no one definitive answer to this question, as it is a deeply personal and subjective matter. However, some common themes that emerge in discussions about the meaning of life include:
> 
> * **Purpose:** Many people believe that life has a purpose or mission, whether it is to fulfill a certain role, contribute to society, or achieve personal goals.
> * **Meaningful relationships:** Others find meaning in their relationships with family, friends, and loved ones.
> * **Pursuit of knowledge and enlightenment:** Some people believe that the meaning of life lies in the pursuit of knowledge, wisdom, and understanding.
> * **Making a difference:** Many find meaning in making a positive contribution to the world, whether through their work, activism, or simply by being kind and compassionate to others.
> * **Self-discovery and personal growth:** For some, the meaning of life lies in the journey of self-discovery and personal growth, as they strive to become the best version of themselves.
> 
> Ultimately, the meaning of life is a question that each individual must answer for themselves. There is no right or wrong answer, and what is meaningful to one person may not be meaningful to another. However, by reflecting on our own values, beliefs, and experiences, we can all strive to find meaning and purpose in our lives.

### single api call

In [14]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label']=''

test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,
1562,"I like everything, about this ! Im still learn...",1,like everything about this im still learning h...,
491,I've already returned it.,0,ve already returned it,
1188,Easy set up and to use. I bought two one for m...,1,easy set up and to use bought two one for my h...,
220,Cool,1,cool,
2073,,1,,
369,not working,0,not working,
2305,Had to return it as Hulu does not work on devi...,0,had to return it as hulu does not work on devi...,
544,The echo dot disconnects from the speaker abou...,0,the echo dot disconnects from the speaker abou...,
3132,It works great!!,1,it works great,


### convert df to json using to_json() method 

In [15]:
json_data = test_set_sample[['clean_review','pred_label']].to_json(orient='records')
print(json_data)

[{"clean_review":"bought this for myself and didn realize it had to be plugged in just to work it isn convenient to make them always have to connect to wall it should ve been more clear when bought it","pred_label":""},{"clean_review":"like everything about this im still learning how to use it","pred_label":""},{"clean_review":"ve already returned it","pred_label":""},{"clean_review":"easy set up and to use bought two one for my house and one for my 75 year old grandfather set it up for him gave him 15 minute lesson and he calls me at least once day it is great to be able to see him in between visits","pred_label":""},{"clean_review":"cool","pred_label":""},{"clean_review":"","pred_label":""},{"clean_review":"not working","pred_label":""},{"clean_review":"had to return it as hulu does not work on device it kept coming up with errors and hulu app not being able to open at times it was hit and miss and at times hulu will stop working right in the middle of show or movie plus fire stick i

In [16]:
prompt = f"""
You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
{json_data}
""" 
print(prompt)


You are an expert linguier, who is good at classifying customer review sentinence into Positive/Negative. 
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks. 
Your task is to update predicted labels under 'pred_label' in the Json code. 
Don't make any changes to Json code format, please. 


```
[{"clean_review":"bought this for myself and didn realize it had to be plugged in just to work it isn convenient to make them always have to connect to wall it should ve been more clear when bought it","pred_label":""},{"clean_review":"like everything about this im still learning how to use it","pred_label":""},{"clean_review":"ve already returned it","pred_label":""},{"clean_review":"easy set up and to use bought two one for my house and one for my 75 year old grandfather set it up for him gave him 15 minute

In [17]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_review":"bought this for myself and didn realize it had to be plugged in just to work it isn convenient to make them always have to connect to wall it should ve been more clear when bought it","pred_label":0},{"clean_review":"like everything about this im still learning how to use it","pred_label":1},{"clean_review":"ve already returned it","pred_label":0},{"clean_review":"easy set up and to use bought two one for my house and one for my 75 year old grandfather set it up for him gave him 15 minute lesson and he calls me at least once day it is great to be able to see him in between visits","pred_label":1},{"clean_review":"cool","pred_label":1},{"clean_review":"","pred_label":0},{"clean_review":"not working","pred_label":0},{"clean_review":"had to return it as hulu does not work on device it kept coming up with errors and hulu app not being able to open at times it was hit and miss and at times hulu will stop working right in the middle of show or movie plus fire stick is v

In [18]:
import json

#clean the data by stripping the backticks
json_data = response.text.strip("`")

#load the cleaned data and convert to Df
data_ = json.loads(json_data)
df_sample = pd.DataFrame(data_)

df_sample

Unnamed: 0,clean_review,pred_label
0,bought this for myself and didn realize it had...,0
1,like everything about this im still learning h...,1
2,ve already returned it,0
3,easy set up and to use bought two one for my h...,1
4,cool,1
5,,0
6,not working,0
7,had to return it as hulu does not work on devi...,0
8,the echo dot disconnects from the speaker abou...,0
9,it works great,1


In [19]:
test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_review,pred_label
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,0
1562,"I like everything, about this ! Im still learn...",1,like everything about this im still learning h...,1
491,I've already returned it.,0,ve already returned it,0
1188,Easy set up and to use. I bought two one for m...,1,easy set up and to use bought two one for my h...,1
220,Cool,1,cool,1
2073,,1,,0
369,not working,0,not working,0
2305,Had to return it as Hulu does not work on devi...,0,had to return it as hulu does not work on devi...,0
544,The echo dot disconnects from the speaker abou...,0,the echo dot disconnects from the speaker abou...,0
3132,It works great!!,1,it works great,1


In [20]:
#plotting confusion matrix for prediction
from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample['pred_label']

confusion_matrix(y_true, y_pred)

array([[ 9,  0],
       [ 1, 10]])

### openai config

In [21]:
import openai

OPEN_AI = os.getenv("OPEN_AI")
openai.api_key = OPEN_AI


In [22]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)

  return response.choices[0].message["content"]

In [23]:
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.