In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path = "/content/drive/MyDrive/CSCML Final Project/AmazonReviewsUSApparel_5000.csv"
df5k = pd.read_csv(path)
df50k = pd.read_csv("/content/drive/MyDrive/CSCML Final Project/AmazonReviewsUSApparel.csv")

In [4]:
df50k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         50000 non-null  int64  
 1   marketplace        50000 non-null  object 
 2   customer_id        50000 non-null  int64  
 3   review_id          50000 non-null  object 
 4   product_id         50000 non-null  object 
 5   product_parent     50000 non-null  int64  
 6   product_title      50000 non-null  object 
 7   product_category   50000 non-null  object 
 8   star_rating        50000 non-null  float64
 9   helpful_votes      50000 non-null  float64
 10  total_votes        50000 non-null  float64
 11  vine               50000 non-null  object 
 12  verified_purchase  50000 non-null  object 
 13  review_headline    50000 non-null  object 
 14  review_body        50000 non-null  object 
 15  review_date        50000 non-null  object 
dtypes: float64(3), int64(3

In [5]:
# top 10 most reviewed products by poduct_id
checkvalues = df50k.groupby('product_id').size().reset_index().sort_values(by=[0],ascending=False)
print(checkvalues[:10])

       product_id   0
9090   B0045H0L1W  16
10051  B004M6UDC8  15
10050  B004M6UD46  13
10053  B004M6XUI2  13
13933  B006PGGJOE  13
28817  B00FBD2G0I  12
12265  B005GYGEWS  10
4829   B001PU9A9Q  10
29102  B00FJ5LI3O   9
19865  B009Y9QCCS   9


In [6]:
reviewById5k = pd.DataFrame(df5k.groupby('product_id').apply(lambda x: '. '.join(x['review_body'])), columns=['reviews'])
reviewById50k = pd.DataFrame(df50k.groupby('product_id').apply(lambda x: '. '.join(x['review_body'])), columns=['reviews'])

In [7]:
df50k.product_id.value_counts()

B0045H0L1W    16
B004M6UDC8    15
B006PGGJOE    13
B004M6XUI2    13
B004M6UD46    13
              ..
B00TCK9QNG     1
B001RNO46K     1
B00NB5RN88     1
B003E7OW4G     1
B009S59PS6     1
Name: product_id, Length: 46240, dtype: int64

In [8]:
reviewById50k

Unnamed: 0_level_0,reviews
product_id,Unnamed: 1_level_1
1608322254,"great product, easily donned, looks so cute, c..."
3979050432,Terrible! Like many other reviewers have state...
5555012349,My husband gets compliments every time he wear...
6094067339,Good for the price. Very usable for performance.
9773425614,Good quality.
...,...
B013BHFNQE,I bought this for my wife. She seem to enjoy ...
B013GZL4EQ,Color: The color is bright white and perfect o...
B013HBP5MQ,Much bigger than I thought a LG would be. It a...
B013U5OOYO,I purchased these for my husband and he has fo...


In [9]:
# df["text"] = df["review_headline"] + df["review_body"]

In [10]:
X_train_50k, X_test_50k = train_test_split(list(reviewById50k.reviews.values),test_size=0.2, random_state=123)
X_train_5k, X_test_5k = train_test_split(list(reviewById5k.reviews.values), test_size=0.2, random_state=123)

# Load & Train Model

In [11]:
%pip install git+https://github.com/MaartenGr/BERTopic huggingface_hub safetensors -qqq

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [12]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [13]:
representation_model_5k = KeyBERTInspired()
representation_model_50k = KeyBERTInspired()

In [14]:
topic_model_5k = BERTopic("english", verbose=True, nr_topics=40, representation_model=representation_model_5k)
topic_model_50k = BERTopic("english", verbose=True, nr_topics=40, representation_model=representation_model_50k)

In [15]:
topics_5k, probs_5k = topic_model_5k.fit_transform(X_train_5k)

2023-12-19 23:31:36,232 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-12-19 23:33:49,534 - BERTopic - Embedding - Completed ‚úì
2023-12-19 23:33:49,536 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-19 23:34:27,619 - BERTopic - Dimensionality - Completed ‚úì
2023-12-19 23:34:27,622 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-19 23:34:27,862 - BERTopic - Cluster - Completed ‚úì
2023-12-19 23:34:27,864 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-19 23:34:51,787 - BERTopic - Representation - Completed ‚úì
2023-12-19 23:34:51,790 - BERTopic - Topic reduction - Reducing number of topics
2023-12-19 23:35:07,935 - BERTopic - Topic reduction - Reduced number of topics from 67 to 40


In [16]:
topics_50k, probs_50k = topic_model_50k.fit_transform(X_train_50k)

2023-12-19 23:35:08,199 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1156 [00:00<?, ?it/s]

2023-12-19 23:55:53,755 - BERTopic - Embedding - Completed ‚úì
2023-12-19 23:55:53,758 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-19 23:57:09,076 - BERTopic - Dimensionality - Completed ‚úì
2023-12-19 23:57:09,080 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-19 23:57:14,555 - BERTopic - Cluster - Completed ‚úì
2023-12-19 23:57:14,559 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-19 23:58:37,708 - BERTopic - Representation - Completed ‚úì
2023-12-19 23:58:37,714 - BERTopic - Topic reduction - Reducing number of topics
2023-12-19 23:58:56,447 - BERTopic - Topic reduction - Reduced number of topics from 357 to 40


In [17]:
freq_5k = topic_model_5k.get_topic_info()
freq_5k.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1025,-1_size_wear_large_fit,"[size, wear, large, fit, waist, small, comfort...",[Got this for a cosplay I'm doing and it defin...
1,0,321,0_shirt_shirts_tshirt_tshirts,"[shirt, shirts, tshirt, tshirts, tee, washed, ...","[Love this shirt., I love this shirt!!!, I lik..."
2,1,277,1_dress_dresses_skirt_wear,"[dress, dresses, skirt, wear, robe, fabric, fi...","[I wanted to give this dress a great review, b..."
3,2,245,2_jeans_pants_denim_shorts,"[jeans, pants, denim, shorts, waist, wear, und...",[These jeans are VERY cozy (minus a couple itc...
4,3,198,3_socks_sock_shoes_boots,"[socks, sock, shoes, boots, comfortable, wear,...",[these socks are def thinner then I had expect...
5,4,192,4_jacket_jackets_sweater_coat,"[jacket, jackets, sweater, coat, sweatshirt, s...",[I purchase this sweater as a Christmas gift t...
6,5,179,5_fit_fits_great_comfortable,"[fit, fits, great, comfortable, perfectly, exc...","[Great fit, Great fit, Fits very good. I Like..."
7,6,140,6_hat_hats_cap_head,"[hat, hats, cap, head, brim, heads, wearing, w...",[the hat was more than what I expected I love ...
8,7,121,7_quality_pricey_durable_made,"[quality, pricey, durable, made, worth, excell...","[Very good quality., good quality for the pric..."
9,8,103,8_excellent_great_good_fantastic,"[excellent, great, good, fantastic, awesome, n...","[Excellent, Excellent, Excellent!!]"


In [61]:
freq_50k = topic_model_50k.get_topic_info()
freq_50k

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11068,-1_jeans_fit_wear_size,"[jeans, fit, wear, size, comfortable, shirt, s...",[I was nervous about buying this dress when I ...
1,0,10652,0_dress_bra_costume_wear,"[dress, bra, costume, wear, fit, comfortable, ...",[I ordered it on Saturday night and it was del...
2,1,4385,1_socks_sock_leggings_jeans,"[socks, sock, leggings, jeans, wear, shorts, p...",[I bought these because of the positive review...
3,2,1638,2_hat_hats_brim_cap,"[hat, hats, brim, cap, head, wear, made, wig, ...","[Great hat !, Very nice hat. Good quality hat...."
4,3,957,3_fit_fits_perfectly_thank,"[fit, fits, perfectly, thank, great, comfortab...","[Great fit, Great fit, great fit]"
5,4,921,4_excellent_good_great_very,"[excellent, good, great, very, fantastic, very...","[very good, Very good, perfect. very good]"
6,5,914,5_fabric_material_thin_cotton,"[fabric, material, thin, cotton, thick, soft, ...","[Right,but Fabric is thin. it is very nice i l..."
7,6,722,6_scarf_scarves_wear_neck,"[scarf, scarves, wear, neck, shawl, ties, gift...",[I absolutely love this scarf! it is so cute a...
8,7,689,7_quality_pricey_cheap_price,"[quality, pricey, cheap, price, worth, cheaply...","[Great quality, good price, Good quality and g..."
9,8,688,8_wallet_wallets_purse_pocket,"[wallet, wallets, purse, pocket, card, cards, ...",[I like it. Not sure if I need the clip for c...


# Test on different products

## 5k

In [19]:
test_topics_5k, test_prob_5k = topic_model_5k.transform(X_test_5k)

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2023-12-19 23:59:29,930 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2023-12-19 23:59:48,243 - BERTopic - Dimensionality - Completed ‚úì
2023-12-19 23:59:48,246 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2023-12-19 23:59:48,307 - BERTopic - Cluster - Completed ‚úì


In [20]:
# by each product
topic_collections = []
for example, topic in zip(X_test_5k, test_topics_5k):
    print(f"TEXT: {example}")
    topic_returned = topic_model_5k.get_topic_info(int(topic)).loc[0,'Representation']
    print(f"TOPIC: {topic_returned}")
    topic_collections += topic_returned
    topic_collections = list(set(topic_collections))
    print('--*--'*9)

TEXT: After waiting FOREVER, I received a size M instead of XS!!! The dress is way too big and not worth the hassle to return! Ill prob give it away :(
TOPIC: ['dress', 'dresses', 'skirt', 'wear', 'robe', 'fabric', 'fit', 'beautiful', 'wedding', 'comfortable']
--*----*----*----*----*----*----*----*----*--
TEXT: The size is right, and the print is nice. Nothing really to complain about, it is a T-shirt after all.
TOPIC: ['shirt', 'shirts', 'tshirt', 'tshirts', 'tee', 'washed', 'wear', 'great', 'love', 'cotton']
--*----*----*----*----*----*----*----*----*--
TEXT: I liked it very much so i reordered it!
TOPIC: ['gift', 'gifthe', 'loves', 'loved', 'thrilled', 'love', 'toys', 'christmas', 'great', 'xmas']
--*----*----*----*----*----*----*----*----*--
TEXT: ok, great for Bob Ross costume
TOPIC: ['wig', 'wigs', 'costume', 'hair', 'bangs', 'costumes', 'beard', 'ears', 'head', 'wear']
--*----*----*----*----*----*----*----*----*--
TEXT: This flag was exactly what I was looking for, it matched th

# 50k

In [21]:
test_topics_50k, test_prob_50k = topic_model_50k.transform(X_test_50k)

Batches:   0%|          | 0/289 [00:00<?, ?it/s]

2023-12-20 00:05:07,712 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
  self._set_arrayXarray(i, j, x)
2023-12-20 00:05:38,887 - BERTopic - Dimensionality - Completed ‚úì
2023-12-20 00:05:38,889 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2023-12-20 00:05:39,846 - BERTopic - Cluster - Completed ‚úì


In [22]:
# by each product
topic_collections_50k = []
for example, topic in zip(X_test_50k, test_topics_50k):
    print(f"TEXT: {example}")
    topic_returned = topic_model_50k.get_topic_info(int(topic)).loc[0,'Representation']
    print(f"TOPIC: {topic_returned}")
    topic_collections_50k += topic_returned
    topic_collections_50k = list(set(topic_collections_50k))
    print('--*--'*9)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TOPIC: ['jeans', 'fit', 'wear', 'size', 'comfortable', 'shirt', 'small', 'fits', 'fabric', 'large']
--*----*----*----*----*----*----*----*----*--
TEXT: Just don't use them a lot. I really like the fit and quality of these sock peds.
TOPIC: ['socks', 'sock', 'leggings', 'jeans', 'wear', 'shorts', 'pants', 'underwear', 'feet', 'wearing']
--*----*----*----*----*----*----*----*----*--
TEXT: I love the no iron,,,,just what I needed
TOPIC: ['ironing', 'iron', 'ironon', 'sew', 'fabric', 'ironbr', 'stitching', 'washed', 'cling', 'sleeve']
--*----*----*----*----*----*----*----*----*--
TEXT: These are great shorts. The material is nice. The colors, at least for my pair, are great. I like the cargo pockets for things like cell phones. The only problem I have with these shorts is that they are \\"relaxed\\" fit, and the web page does not indicate that.  I didn't find that out until I got them. Normally I wear a size 30 which is what 

## Palm

In [23]:
!pip install -q google-generativeai
import pprint
import google.generativeai as palm

In [24]:
palm.configure(api_key='AIzaSyCjluDc_N49pni5yzm2OKfG5bi2uHF1a2c')

In [25]:
models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]
model = models[0].name
print(model)

models/text-bison-001


In [33]:
def response_PaLM2(prompt):
  completion = palm.generate_text(
    model=model,
    prompt=prompt,
    temperature=0,
    max_output_tokens=800,
  )
  print(completion.result)

  return completion.result



In [27]:
reviewDict5k = reviewById5k.sample(25).to_dict()['reviews']

In [45]:
zippedXtestTopics = list(zip(X_test_50k, test_topics_50k))
random.shuffle(zippedXtestTopics)
XtestTopicSm = zippedXtestTopics[:100]

In [51]:
instruction1 = "Extract the 5 topic from the review. Each topic should be a single word. Seperate each topic by comma."
topicsByPalm = []
for x in XtestTopicSm:
  print("Reviews: ", x[0])
  input = "You will be given a review and an instruction. Apply the instruction on the review." +\
            "The review is " + x[0] + " The instruction is " + instruction1 + "."
  # print(type(prompt))
  response = response_PaLM2(input)
  topicsByPalm.append(response)
  print("-----------------------------------\n")

Reviews:  This same, and I honestly didnt even know where I had ordered it from because it is a SHIRT not a tunic.<br /><br />I am 5'2 and 115lbs. and quite petite and there is no was I could wear this as the model is wearing it in the photo.<br /><br />It doesn't even come close to covering half my butt. (which isnt very big I must add). After waiting FOREVER, I received a size M instead of XS!!! The dress is way too big and not worth the hassle to return! Ill prob give it away :(
None
-----------------------------------

Reviews:  I love the dress but it runs small for a 14.  I bought another size 14 and it fit perfectly. I am keeping the dress because I am in the process of losing weight so I intend to fit it soon.
fit, love, size, weight, dress
-----------------------------------

Reviews:  very happy with this buy, shorts fit better than expected and are very comfortable, overall I'd say they were worth the price and I'll probably be buying more from this seller... cept maybe more

# PalM2 Transform back

## Bertopics + Palm2


In [29]:
!pip install -U sentence-transformers



In [40]:
import random

In [30]:
from sentence_transformers import SentenceTransformer, util

sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [59]:
instruction2 = "Take the review as the reference, generate a short text (less than 30 words) based on the topics. "
cos_scores = []
result = []
for example, topic in XtestTopicSm:
  temp = {}
  print("Reviews: ", example)
  temp['reviews'] = example
  topic_returned = topic_model_50k.get_topic_info(int(topic)).loc[0,'Representation']
  temp['topics'] = topic_returned
  print("Topics: ", topic_returned)
  prompt = "You will be given a list of topics and an instruction. Apply the instruction on the topics." +\
            "The review is " + example +\
            " The topics are " + ", ".join(t for t in topic_returned) + ". The instruction is " + instruction2 + ". "
  response = response_PaLM2(prompt)

  embedding_1= sent_model.encode(example, convert_to_tensor=True)
  if response == None:
    response = ""
  embedding_2 = sent_model.encode(response, convert_to_tensor=True)

  score = util.pytorch_cos_sim(embedding_1, embedding_2)
  temp['score'] = score
  cos_scores.append(score)
  result.append(temp)
  print(score)
  print("-----------------------------------\n")


Reviews:  This same, and I honestly didnt even know where I had ordered it from because it is a SHIRT not a tunic.<br /><br />I am 5'2 and 115lbs. and quite petite and there is no was I could wear this as the model is wearing it in the photo.<br /><br />It doesn't even come close to covering half my butt. (which isnt very big I must add). After waiting FOREVER, I received a size M instead of XS!!! The dress is way too big and not worth the hassle to return! Ill prob give it away :(
Topics:  ['jeans', 'fit', 'wear', 'size', 'comfortable', 'shirt', 'small', 'fits', 'fabric', 'large']
None
tensor([[0.0286]])
-----------------------------------

Reviews:  I love the dress but it runs small for a 14.  I bought another size 14 and it fit perfectly. I am keeping the dress because I am in the process of losing weight so I intend to fit it soon.
Topics:  ['dress', 'bra', 'costume', 'wear', 'fit', 'comfortable', 'shirt', 'fabric', 'size', 'medium']
14 size dress is too small but comfortable to w

In [60]:
sum(cos_scores)/len(cos_scores)

tensor([[0.4511]])

## All Palm2

In [52]:
reviewsSm = [x[0] for x in XtestTopicSm]

In [57]:
instruction2 = "Take the review as the reference, generate a short text (less than 30 words) based on the topics. "
cos_scores_allpalm = []
results_allpalm = []
for example, topic in zip(reviewsSm, topicsByPalm):
  temp = {}
  print("Reviews: ", example)
  temp['reviews'] = example
  temp['topics'] = topic
  print("Topics: ", topic)
  prompt = "You will be given a list of topics and an instruction. Apply the instruction on the topics." +\
            "The review is " + example +\
            " The topics are " + str(topic) + ". The instruction is " + instruction2 + ". "
  response = response_PaLM2(prompt)

  embedding_1= sent_model.encode(example, convert_to_tensor=True)
  if response == None:
    response = ""
  embedding_2 = sent_model.encode(response, convert_to_tensor=True)

  score = util.pytorch_cos_sim(embedding_1, embedding_2)
  temp['score'] = score
  cos_scores_allpalm.append(score)
  results_allpalm.append(temp)
  print(score)
  print("-----------------------------------\n")


Reviews:  This same, and I honestly didnt even know where I had ordered it from because it is a SHIRT not a tunic.<br /><br />I am 5'2 and 115lbs. and quite petite and there is no was I could wear this as the model is wearing it in the photo.<br /><br />It doesn't even come close to covering half my butt. (which isnt very big I must add). After waiting FOREVER, I received a size M instead of XS!!! The dress is way too big and not worth the hassle to return! Ill prob give it away :(
Topics:  None
None
tensor([[0.0286]])
-----------------------------------

Reviews:  I love the dress but it runs small for a 14.  I bought another size 14 and it fit perfectly. I am keeping the dress because I am in the process of losing weight so I intend to fit it soon.
Topics:  fit, love, size, weight, dress
love the dress but it runs small for a 14. 
tensor([[0.9007]])
-----------------------------------

Reviews:  very happy with this buy, shorts fit better than expected and are very comfortable, overa

In [58]:
sum(cos_scores_allpalm) / len(cos_scores_allpalm)

tensor([[0.6718]])