In [32]:
import os
import sys
import torch
import faiss
import argparse
import numpy as np
import pandas as pd
import torch.nn.functional as F
import nlpaug.augmenter.word as naw
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
tqdm.pandas()

from util_modeling import get_model_objects
from util_data import get_formatted_dataset, get_num_labels
from adaptive_methods import get_paraphrase_augmentations

In [10]:
# Fetch the "boss_sentiment" splits and convert the train and validation splits to
# DataFrames, dropping the "__index_level_0__" column from each. The validation
# split is what the rest of the notebook refers to as `test_set`.
datasets = get_formatted_dataset("boss_sentiment")
train_set, test_set = (
    datasets[split_name].to_pandas().drop(columns=["__index_level_0__"])
    for split_name in ("train", "validation")
)
display(train_set.head())
display(test_set.head())

Unnamed: 0,text,label
0,One of my favorites,1
1,My favorite Coarse Sea Salt brand I know about...,1
2,"Love the top! It fits a little tight, so can b...",1
3,very nice & I like it for everything I used it...,1
4,Awesome product!,1


Unnamed: 0,text,label
0,Great tool for usage with cutting dies!,1
1,Perfect,1
2,Good replacement ink for my epson printer,1
3,Excellent product. Works as designed/described...,1
4,The lint eater is amazing! We recently bought ...,1


## Create Embeddings

In [6]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder come from the same checkpoint. The encoder is moved to
# `device` and put in eval mode.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-roberta-large")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-roberta-large")
model = model.to(device).eval()

In [29]:
# Load the precomputed training-set embeddings. map_location="cpu" lets a tensor
# that was saved from a GPU session load on a CPU-only machine too, matching the
# CPU fallback used when choosing `device`.
amazon_embeddings = torch.load(
    "notebooks/dynasent_analysis/amazon_embeddings.pt", map_location="cpu"
)

# There must be exactly one embedding per training row; fail early with a clear
# message instead of a generic length-mismatch error from pandas.
assert len(amazon_embeddings) == len(train_set), (
    f"expected {len(train_set)} embeddings, got {len(amazon_embeddings)}"
)

# Store each vector as a plain Python list so every cell holds one full embedding.
train_set["embedding"] = amazon_embeddings.tolist()
train_set.head()

Unnamed: 0,text,label,embedding
0,One of my favorites,1,"[0.6473956108093262, 0.13781151175498962, 0.37..."
1,My favorite Coarse Sea Salt brand I know about...,1,"[0.8231244087219238, -0.1266566663980484, -0.0..."
2,"Love the top! It fits a little tight, so can b...",1,"[0.6754499673843384, 0.36946073174476624, -0.0..."
3,very nice & I like it for everything I used it...,1,"[0.8606024980545044, -0.2657198905944824, -0.5..."
4,Awesome product!,1,"[0.7306466102600098, -0.3295357823371887, 0.24..."


In [31]:
def get_embedding(text):
    """Embed `text` with the global `model` and return its pooled output.

    Args:
        text: Input handed to the global `tokenizer`. Inputs longer than 512
            tokens are truncated.

    Returns:
        NumPy array containing the model's ``pooler_output``, copied to the CPU.
        The batch axis produced by the tokenizer is kept, so a single string
        yields an array whose first axis has length 1.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    # Inference only: disable autograd so no graph is built for each forward pass.
    # Building one and detaching it afterwards wastes memory and time on every row.
    with torch.no_grad():
        pooled_output = model(**tokens)["pooler_output"]
    return pooled_output.cpu().numpy()

# Embed every test text one at a time, with a tqdm progress bar.
test_set["embedding"] = test_set["text"].progress_apply(get_embedding)
test_set.head()

100%|██████████| 38904/38904 [10:17<00:00, 63.00it/s]


Unnamed: 0,text,label,embedding
0,Great tool for usage with cutting dies!,1,"[[0.16313213, -0.49149835, -0.4596042, 0.33255..."
1,Perfect,1,"[[-0.50077945, -0.3658858, 0.36897424, -0.5260..."
2,Good replacement ink for my epson printer,1,"[[0.002162329, -0.42244336, -0.508097, 0.25507..."
3,Excellent product. Works as designed/described...,1,"[[0.5475646, -0.49577963, -0.4613782, 0.066582..."
4,The lint eater is amazing! We recently bought ...,1,"[[0.72862387, 0.39312103, -0.14261693, -0.2165..."


In [43]:
# Stack the per-row arrays into one array. Each row's array is expected to carry a
# leading axis of length 1 (one text per call), which squeeze(1) removes so that
# the result holds one vector per row.
stacked_test_embeddings = np.stack(test_set["embedding"])
test_set_embeddings = torch.Tensor(stacked_test_embeddings).squeeze(1)
display(test_set_embeddings.shape)

# Persist the tensor so the embeddings can be reloaded without re-running the model.
torch.save(test_set_embeddings, "notebooks/dynasent_analysis/amazon_validation_embeddings.pt")

torch.Size([38904, 1024])

In [55]:
# For every label, index that label's L2-normalised training embeddings and find
# the training texts closest to the label centroid.
labels = train_set["label"].unique()
vector_stores = {}      # label -> faiss inner-product index over that label's embeddings
centroids = {}          # label -> mean of that label's normalised embeddings
centroid_examples = {}  # label -> texts nearest the centroid, best match first
k = 10    # maximum number of centroid examples kept per label
d = 1024  # embedding dimensionality expected by the index

for label in labels:
    label_instances = train_set[train_set["label"] == label]
    label_embeddings = np.stack(label_instances["embedding"].to_numpy()).astype(np.float32)

    # Normalised in place, so inner product equals cosine similarity between rows.
    faiss.normalize_L2(label_embeddings)
    vector_stores[label] = faiss.IndexFlatIP(d)
    vector_stores[label].add(label_embeddings)
    centroids[label] = label_embeddings.mean(axis=0)

    # Never request more neighbours than the label has rows: faiss pads missing
    # results with -1, and iloc[-1] would silently return the label's last row.
    n_neighbors = min(k, len(label_instances))

    # The centroid is not re-normalised, so these scores are cosine similarities
    # scaled by the centroid's norm. The ranking is unaffected by that scaling.
    cosine_sims, centroid_example_indices = vector_stores[label].search(
        centroids[label].reshape(1, -1), n_neighbors
    )

    label_texts = label_instances["text"].to_numpy()
    centroid_examples[label] = [
        label_texts[index]
        for index in centroid_example_indices[0]
        if index >= 0  # defensive: skip any padding ids
    ]

centroid_examples

{1: ['Really a very nice product. Works great and so easy to use, I really LOVE it!',
  'Great product and quality. I recommend this product.',
  'Really Great Product, soildly built, More than happy with perfomance, and I would ecomend this product to everyone who has a need....Its all Good!',
  'Excellent product, I love it.',
  'I like this product.',
  'good product. I would buy it again. Excellent',
  'very nice & I like it for everything I used it for! A+',
  'Great product, highly recommend.',
  'Works perfectly for what I need it for. Feels very well built. It also came with extra fuses which was a nice surprise. I would highly recommend this to anyone in need of such an item.',
  'Great product.'],
 2: ["A bit pricey for a piece of plastic for starters. I do like the feel of it and it does help me shoot more with ease. The complaints start with the poor casting on mine. I had to use a razor blade to shave excess material that rest on top of the buffer tube. It was an easy fix 

In [63]:
# Fixed seed so the shuffled CSVs are identical on every run.
SHUFFLE_SEED = 42


def pair_with_centroid_examples(source_set, examples_by_label):
    """Pair every text in `source_set` with each centroid example of its label.

    Args:
        source_set: DataFrame with "text" and "label" columns.
        examples_by_label: Mapping from label to a list of example texts.

    Returns:
        List of dicts with keys "text" (the source text), "label" (one centroid
        example for the row's label) and "class" (the row's original label).
        Each source row yields one dict per example.

    Raises:
        KeyError: If a label in `source_set` is missing from `examples_by_label`.
    """
    return [
        {"text": current_text, "label": example, "class": current_label}
        for current_text, current_label in tqdm(
            zip(source_set["text"], source_set["label"]), total=len(source_set)
        )
        for example in examples_by_label[current_label]
    ]


# Make sure the output directory exists before writing either CSV.
os.makedirs("datasets/corrupted", exist_ok=True)

new_train_set_records = pair_with_centroid_examples(train_set, centroid_examples)
rewrite_train_set = (
    pd.DataFrame(new_train_set_records)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
rewrite_train_set.to_csv("datasets/corrupted/boss_sentiment_train.csv", index=False)
display(rewrite_train_set)

new_test_set_records = pair_with_centroid_examples(test_set, centroid_examples)
rewrite_test_set = (
    pd.DataFrame(new_test_set_records)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
rewrite_test_set.to_csv("datasets/corrupted/boss_sentiment_test.csv", index=False)
display(rewrite_test_set)

100%|██████████| 29999/29999 [00:01<00:00, 27024.60it/s]


Unnamed: 0,text,label,class
0,I wasn't expecting much... But These Things Ar...,Do not buy.. I've been pleased with the normal...,0
1,These post bases are a great product for keepi...,Really a very nice product. Works great and so...,1
2,It was ok for the money.,"The ends are fairly sharp, but some were dulle...",2
3,This game is simply ok. It's not a great follo...,The AmazonBasics calculator is decent but not ...,2
4,Hands down the best tasting boxed pasta out th...,"Great product, highly recommend.",1
...,...,...,...
299985,Installed with a new gasket/insulator kit boug...,Works perfectly for what I need it for. Feels ...,1
299986,If you are a homemade bread baker and you like...,"Great product, highly recommend.",1
299987,not vey good. just ok,I was expecting this to be a little easier tha...,2
299988,This is pretty much the worst toaster oven we ...,"SB has terrible customer service, Never return...",0


100%|██████████| 38904/38904 [00:01<00:00, 26713.20it/s]


Unnamed: 0,text,label,class
0,I use this every day I would recommend this fo...,"Excellent product, I love it.",1
1,This was a gift for a one year old. It's a gre...,"Really Great Product, soildly built, More than...",1
2,Good fit for the iPhone 6s Plus.,Works perfectly for what I need it for. Feels ...,1
3,Great starter kit. A good purchase and it's a ...,"Great product, highly recommend.",1
4,"Lovely shade of pink. Not your Grandma's pink,...",Great product.,1
...,...,...,...
389035,This product was researched by a co worker for...,"Excellent product, I love it.",1
389036,As described! My 4 year old loves Legos and pl...,"Really Great Product, soildly built, More than...",1
389037,They work awesome. I am on a well and at 45lbs...,I like this product.,1
389038,"Best battery I ever had, Kicks ass...",I like this product.,1
