In [None]:

import pandas as pd
import torch
from tqdm.auto import tqdm
import time
from contextlib import contextmanager

In [None]:
items_df = pd.read_csv('/content/20191226-items.csv')
reviews_df = pd.read_csv('/content/reviews.csv')

In [None]:
# Check column names
print("Items DataFrame columns:", items_df.columns)
print("Reviews DataFrame columns:", reviews_df.columns)

Items DataFrame columns: Index(['asin', 'brand', 'title', 'url', 'image', 'rating', 'reviewUrl',
       'totalReviews', 'price', 'originalPrice'],
      dtype='object')
Reviews DataFrame columns: Index(['asin', 'name', 'rating', 'date', 'verified', 'title', 'body',
       'helpfulVotes'],
      dtype='object')


In [None]:
# Merge datasets based on a common key, for example 'item_id'
dataset_df = pd.merge(items_df, reviews_df, on='asin', how = 'inner')

In [None]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67986 entries, 0 to 67985
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   asin           67986 non-null  object 
 1   brand          67786 non-null  object 
 2   title_x        67986 non-null  object 
 3   url            67986 non-null  object 
 4   image          67986 non-null  object 
 5   rating_x       67986 non-null  float64
 6   reviewUrl      67986 non-null  object 
 7   totalReviews   67986 non-null  int64  
 8   price          67986 non-null  float64
 9   originalPrice  67986 non-null  float64
 10  name           67983 non-null  object 
 11  rating_y       67986 non-null  int64  
 12  date           67986 non-null  object 
 13  verified       67986 non-null  bool   
 14  title_y        67957 non-null  object 
 15  body           67960 non-null  object 
 16  helpfulVotes   27215 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(10)
memor

In [None]:
dataset_df.isna().sum()

asin                 0
brand              200
title_x              0
url                  0
image                0
rating_x             0
reviewUrl            0
totalReviews         0
price                0
originalPrice        0
name                 3
rating_y             0
date                 0
verified             0
title_y             29
body                26
helpfulVotes     40771
dtype: int64

In [None]:
dataset_df['brand'].unique()

array([nan, 'Motorola', 'Nokia', 'Samsung', 'HUAWEI', 'Sony', 'Apple',
       'Google', 'ASUS', 'OnePlus', 'Xiaomi'], dtype=object)

In [None]:
dataset_df['brand'].value_counts()

brand
Samsung     33629
Motorola     8880
Nokia        5915
Apple        5145
Xiaomi       4411
Google       3787
Sony         3196
HUAWEI       2225
OnePlus       347
ASUS          251
Name: count, dtype: int64

In [None]:
dataset_df[['asin', 'brand']]

Unnamed: 0,asin,brand
0,B0000SX2UC,
1,B0000SX2UC,
2,B0000SX2UC,
3,B0000SX2UC,
4,B0000SX2UC,
...,...,...
67981,B081H6STQQ,Sony
67982,B081H6STQQ,Sony
67983,B081H6STQQ,Sony
67984,B081TJFVCJ,Apple


In [None]:
# Check if 'asin' and 'brand' columns exist
if 'asin' in dataset_df.columns and 'brand' in dataset_df.columns:
    # Group by 'asin' and aggregate brands
    asin_brand_group = dataset_df.groupby('asin')['brand'].agg(['unique', 'nunique']).reset_index()
    asin_brand_group.columns = ['asin', 'unique_brands', 'num_unique_brands']

    # Display the results
    print("ASIN and their corresponding unique brands:")
    print(asin_brand_group)
else:
    print("The required columns 'asin' and 'brand' are not present in the dataset.")

ASIN and their corresponding unique brands:
           asin unique_brands  num_unique_brands
0    B0000SX2UC         [nan]                  0
1    B0009N5L7K    [Motorola]                  1
2    B000SKTZ0S    [Motorola]                  1
3    B001AO4OUC    [Motorola]                  1
4    B001DCJAJG    [Motorola]                  1
..          ...           ...                ...
715  B07ZPKZSSC       [Apple]                  1
716  B07ZQSGP53      [Xiaomi]                  1
717  B081H6STQQ        [Sony]                  1
718  B081TJFVCJ       [Apple]                  1
719  B0825BB7SG     [Samsung]                  1

[720 rows x 3 columns]


In [None]:
if 'asin' in dataset_df.columns and 'brand' in dataset_df.columns:
    # Update the brand name for the specific ASIN
    dataset_df.loc[dataset_df['asin'] == 'B0000SX2UC', 'brand'] = 'realme'

    # Verify the update
    updated_entry = dataset_df[dataset_df['asin'] == 'B0000SX2UC']
    print("Updated entry for ASIN 'B0000SX2UC':")
    print(updated_entry)
else:
    print("The required columns 'asin' and 'brand' are not present in the dataset.")

Updated entry for ASIN 'B0000SX2UC':
          asin   brand                                            title_x  \
0   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
1   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
2   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
3   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
4   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
5   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
6   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
7   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
8   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
9   B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
10  B0000SX2UC  realme  Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...   
11  B0000SX2UC  realme  Dual-Band / Tri

In [None]:
dataset_df['body'].duplicated().sum()/len(dataset_df)*100

12.024534462977671

In [None]:
dataset_df['body'].nunique(),dataset_df.shape[0]

(59810, 67986)

In [None]:
updated_entry

Unnamed: 0,asin,brand,title_x,url,image,rating_x,reviewUrl,totalReviews,price,originalPrice,name,rating_y,date,verified,title_y,body,helpfulVotes
0,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0
5,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,J. White,4,"September 25, 2005",False,Worked great for me,"Hello, I have this phone and used it until I d...",
6,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,the cell phone store owner,5,"April 16, 2004",False,Wanna cool Nokia? You have it here!,Cool. Cheap. Color: 3 words that describe the ...,2.0
7,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Matt,4,"April 3, 2004",False,Problem with 3588i universal headset,"The 3599i is overall a nice phone, except that...",2.0
8,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Charles Cook,5,"November 24, 2003",False,cool phone!!!!!!!!,"I've never owned a Nokia phone before, so this...",7.0
9,B0000SX2UC,realme,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Amazon Customer,3,"February 2, 2004",False,Pissed off-a little bit,ok well im in school and i need the text messa...,3.0


In [None]:
import tensorflow as tf
num_gpus_available = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpus_available)



Num GPUs Available:  0


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [None]:
# Example function to perform sentiment analysis on the dataset
def analyze_sentiment(text):
    result = sentiment_pipeline(text)
    return result[0]['label'], result[0]['score']

In [None]:
# Quantize the model
model_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [None]:
 # Display the updated dataset with sentiment analysis
    #print("Dataset with sentiment analysis:")
    #print(reviews_df[['body', 'sentiment', 'sentiment_score']])
#else:
    #print("The required column 'body' is not present in the dataset.")

In [None]:
few_shot_examples = """
Review: The battery life of this phone is excellent, but the camera quality is poor.
Aspect: battery
Sentiment: positive

Review: The performance is great, but the design feels outdated.
Aspect: design
Sentiment: nutral

Review: The picture quality and performance is not good.
Aspect: design
Sentiment: negative

Review: The camera takes amazing pictures, but the battery drains quickly.
Aspect: camera
Sentiment: positive
"""

# List of aspects to analyze
aspects = ['battery', 'camera', 'performance', 'design']

In [None]:
!pip install openai



In [None]:
pip install openai==0.28



In [None]:
openai.api_key = 'sk-proj-uGI9OyHSuuKvT9xNT3G1T3BlbkFJ2xN3ppWRH91hnMdMZLs1'

In [None]:
# Define few-shot examples
few_shot_examples = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "The battery life of this phone is excellent, but the camera quality is poor."},
    {"role": "assistant", "content": "The sentiment for the aspect 'battery' is positive."}
]

def analyze_aspect_sentiment(review, aspect):
    # Add the current review to the few-shot examples
    messages = few_shot_examples + [
        {"role": "user", "content": f"Review: {review}"},
        {"role": "user", "content": f"Analyze the sentiment for the aspect '{aspect}'."}
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=60
    )

    sentiment = completion.choices[0].message['content'].strip()
    return sentiment

review = "The battery life of this phone is excellent, but the camera quality is poor."
aspect = "battery"
sentiment = analyze_aspect_sentiment(review, aspect)
print(f"Sentiment for aspect '{aspect}': {sentiment}")

Sentiment for aspect 'battery': The sentiment for the aspect 'battery' is positive based on the review sentence provided.


In [None]:
pip install transformers




In [None]:
from transformers import pipeline
sentiment_analyzer = pipeline("sentiment-analysis")
def analyze_review_sentiments(reviews, aspect):
    sentiments = []
    for review in reviews:
        sentiment = analyze_aspect_sentiment(review, aspect)
        sentiments.append(sentiment_analyzer(sentiment)[0])

    return sentiments


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def compute_average_sentiment(sentiments):
    positive = sum(1 for s in sentiments if s['label'] == 'POSITIVE')
    neutral = sum(1 for s in sentiments if s['label'] == 'NEUTRAL')
    negative = sum(1 for s in sentiments if s['label'] == 'NEGATIVE')
    total = len(sentiments)

    return {
        "positive": positive / total,
        "neutral": neutral / total,
        "negative": negative / total
    }

In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["positive ", "nutral", "negetive"],
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'sequence': 'This is a course about the Transformers library',
 'labels': ['nutral', 'positive ', 'negetive'],
 'scores': [0.6058266758918762, 0.22849544882774353, 0.16567789018154144]}

In [1]:
#add the dataframe
def add_sentiment_to_dataframe(reviews, aspect):
    data = []
    for review in reviews:
        sentiment = analyze_aspect_sentiment(review, aspect)
        sentiment_result = sentiment_analyzer(sentiment)[0]
        data.append({
            "review": review,
            "aspect": aspect,
            "sentiment_label": sentiment_result['label'],
            "sentiment_score": sentiment_result['score']
        })

    df = pd.DataFrame(updated_entry)
    return df

In [None]:
# Example usage
reviews = [
    "The battery life of this phone is excellent, but the camera quality is poor.",
    "I love the battery life, but the screen is not very clear.",
    "The battery performance is terrible, although the design is sleek."
]
aspect = "battery"
sentiments = analyze_review_sentiments(reviews, aspect)
average_sentiment_result = compute_average_sentiment(sentiments)

print(f"Average sentiment for aspect '{aspect}': {average_sentiment_result}")

Average sentiment for aspect 'battery': {'positive': 0.6666666666666666, 'neutral': 0.0, 'negative': 0.3333333333333333}


In [None]:
reviews = [
    "The battery life of this phone is excellent, but the camera quality is poor.",
    "I love the battery life, but the screen is not very clear.",
    "The battery performance is terrible, although the design is sleek."
]
aspect = "battery"
df = add_sentiment_to_dataframe(reviews, aspect)

print(f"Average sentiment for aspect '{aspect}': {average_sentiment_result}")
print(df)

Average sentiment for aspect 'battery': {'positive': 0.6666666666666666, 'neutral': 0.0, 'negative': 0.3333333333333333}
                                              review   aspect sentiment_label  \
0  The battery life of this phone is excellent, b...  battery        POSITIVE   
1  I love the battery life, but the screen is not...  battery        POSITIVE   
2  The battery performance is terrible, although ...  battery        NEGATIVE   

   sentiment_score  
0         0.999837  
1         0.999768  
2         0.999659  
