In [None]:
import os
import sys
import torch
import faiss
import argparse
import numpy as np
import pandas as pd
import torch.nn.functional as F
import nlpaug.augmenter.word as naw
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
tqdm.pandas()

from util_modeling import get_model_objects
from util_data import get_formatted_dataset, get_num_labels
from adaptive_methods import get_paraphrase_augmentations

In [None]:
# Load the BOSS sentiment benchmark and turn the train/validation splits into
# DataFrames, dropping the index column that the conversion carries along.
datasets = get_formatted_dataset("boss_sentiment")
train_set, test_set = (
    datasets[split_name].to_pandas().drop(columns=["__index_level_0__"])
    for split_name in ("train", "validation")
)
display(train_set.head())
display(test_set.head())

## Create Embeddings

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sentence encoder used for every embedding in this notebook; eval() switches
# off dropout so repeated calls on the same text give the same vector.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-roberta-large")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-roberta-large")
model = model.to(device).eval()

In [None]:
# Reload the cached training-set embeddings and attach one vector per row.
# map_location="cpu" keeps this cell working on a CPU-only host even if the
# file was saved from a CUDA tensor; the values are turned into plain Python
# lists on the next line, so nothing here needs the GPU.
# NOTE(review): torch.load unpickles the file -- only point it at files you trust.
amazon_embeddings = torch.load("notebooks/dynasent_analysis/amazon_embeddings.pt", map_location="cpu")
# Assigning a list to a column requires exactly one entry per train_set row;
# pandas raises ValueError on a length mismatch.
train_set["embedding"] = amazon_embeddings.tolist()
train_set.head()

In [None]:
def get_embedding(text):
    """Encode ``text`` with the notebook's sentence encoder.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The input passed straight to the tokenizer (truncated to 512 tokens).

    Returns:
        The model's ``pooler_output`` as a NumPy array on the CPU.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    # Inference only: without no_grad every call would record an autograd graph
    # that is thrown away immediately, wasting memory and time.
    with torch.no_grad():
        pooled = model(**tokens)["pooler_output"]
    return pooled.detach().cpu().numpy()

# Embed every validation text, one call per row. progress_apply is the
# tqdm-instrumented variant of Series.apply (enabled by tqdm.pandas() in the
# import cell), so the run shows a progress bar. Each entry of the new column
# holds the array returned by get_embedding for that row.
test_set["embedding"] = test_set["text"].progress_apply(get_embedding)
test_set.head()

In [None]:
# Stack the per-row arrays into a single tensor and drop axis 1 (the
# single-text batch axis each row carries), then cache the result on disk so
# the encoder does not have to be re-run.
test_set_embeddings = torch.Tensor(
    np.stack(test_set["embedding"].tolist())
).squeeze(dim=1)
display(test_set_embeddings.shape)
torch.save(test_set_embeddings, "notebooks/dynasent_analysis/amazon_validation_embeddings.pt")

In [None]:
# For every label: index its training embeddings in an exact inner-product
# FAISS index, compute the label centroid, and keep the texts of the k training
# examples closest to that centroid.
labels = train_set["label"].unique()
vector_stores = {}
centroids = {}
centroid_examples = {}
k = 10  # centroid-nearest examples to keep per label
d = 1024  # index dimensionality; must equal the width of the stored embeddings

for label in labels:
    label_instances = train_set[train_set["label"] == label]
    label_embeddings = np.stack(label_instances["embedding"].to_numpy()).astype(np.float32)

    # normalize_L2 works in place on this freshly stacked copy (train_set is not
    # modified); with unit-length rows the inner product ranks by cosine similarity.
    faiss.normalize_L2(label_embeddings)
    vector_stores[label] = faiss.IndexFlatIP(d)
    vector_stores[label].add(label_embeddings)
    centroids[label] = label_embeddings.mean(axis=0)

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with index -1, and iloc[-1] would silently return the label's
    # last row instead of a real neighbour.
    num_neighbors = min(k, len(label_instances))
    # The centroid itself is not re-normalised, so these scores are cosine
    # similarities scaled by the centroid's norm; the ranking is unaffected.
    cosine_sims, centroid_example_indices = vector_stores[label].search(
        centroids[label].reshape(1, -1), num_neighbors
    )
    centroid_examples[label] = [
        label_instances.iloc[index]["text"]
        for index in centroid_example_indices[0]
    ]

centroid_examples

In [None]:
def build_rewrite_records(frame, examples_by_label):
    """Pair every row of ``frame`` with each centroid example of its label.

    Args:
        frame: DataFrame with ``text`` and ``label`` columns.
        examples_by_label: Mapping from a label value to its list of centroid
            example texts.

    Returns:
        A list of dicts with keys ``text`` (the row's text), ``label`` (one
        centroid example text) and ``class`` (the row's original label).
    """
    records = []
    for _, row in tqdm(frame.iterrows(), total=len(frame)):
        current_label = row["label"]
        current_text = row["text"]
        for example in examples_by_label[current_label]:
            records.append({"text": current_text, "label": example, "class": current_label})
    return records


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("datasets/corrupted", exist_ok=True)

# A fixed random_state makes the shuffle (and therefore the written CSV)
# reproducible across kernel restarts.
new_train_set_records = build_rewrite_records(train_set, centroid_examples)
rewrite_train_set = pd.DataFrame(new_train_set_records).sample(frac=1, random_state=42).reset_index(drop=True)
rewrite_train_set.to_csv("datasets/corrupted/boss_sentiment_train.csv", index=False)
display(rewrite_train_set)

new_test_set_records = build_rewrite_records(test_set, centroid_examples)
rewrite_test_set = pd.DataFrame(new_test_set_records).sample(frac=1, random_state=42).reset_index(drop=True)
rewrite_test_set.to_csv("datasets/corrupted/boss_sentiment_test.csv", index=False)
display(rewrite_test_set)

## Dataset with Augmentations

In [None]:
# Tokenizer/model pair used to generate paraphrase augmentations below.
# NOTE(review): num_labels=-1 is passed through to get_model_objects as-is; its
# meaning is defined by that helper -- confirm there.
paraphrase_tokenizer, paraphrase_model = get_model_objects(
    "humarin/chatgpt_paraphraser_on_T5_base",
    num_labels=-1,
)

In [74]:
# Smoke test of the paraphraser on one review, using the same settings as the
# full augmentation runs below (4 paraphrases per text, temperature 0.3).
example_text = (
    "I use this every day I would recommend this for anyone who has special needs "
    "with thinning hair, it has made a huge difference in my daily life."
)
get_paraphrase_augmentations(
    example_text,
    paraphrase_tokenizer,
    paraphrase_model,
    paraphrase_model.device,
    num_return_sequences=4,
    temperature=0.3,
)

['This product is essential for me to use every day, and I highly recommend it to anyone with special needs who has thin hair.',
 'My daily routine involves this product and I highly recommend it to individuals with special needs who have thinning hair.',
 "It's a lifesaver for anyone with special needs and hair loss, as it has transformed my daily routine.",
 "I use this product every day. It's a lifesaver for people with special needs and hair loss."]

In [76]:
# Build the paraphrase-augmented training set: every training text and each of
# its 4 paraphrases is paired with every centroid example of the row's label.

# Create the output directory up front: to_csv does not create missing
# directories, and failing only after the hours-long loop would lose all of it.
os.makedirs("datasets/corrupted", exist_ok=True)

new_train_set_records = []
for _, row in tqdm(train_set.iterrows(), total=len(train_set)):
    current_label = row["label"]
    current_text = row["text"]
    augmentations = get_paraphrase_augmentations(current_text,
                             paraphrase_tokenizer,
                             paraphrase_model,
                             paraphrase_model.device,
                             num_return_sequences=4,
                             temperature=0.3)

    # Original text plus its paraphrases; built once per row instead of once
    # per centroid example.
    text_inputs = [current_text] + augmentations
    for example in centroid_examples[current_label]:
        for text_input in text_inputs:
            new_train_set_records.append({"text": text_input, "label": example, "class": current_label})

# A fixed random_state makes the shuffle reproducible across runs.
rewrite_train_set = pd.DataFrame(new_train_set_records).sample(frac=1, random_state=42).reset_index(drop=True)
# Written under datasets/corrupted/, matching the non-augmented CSVs
# (this path previously read "datasets/corruped/").
rewrite_train_set.to_csv("datasets/corrupted/boss_sentiment_augmented_train.csv", index=False)
display(rewrite_train_set)

100%|██████████| 29999/29999 [7:47:40<00:00,  1.07it/s]  


Unnamed: 0,text,label,class
0,I hadn't brewed tea in months and my son had n...,Works perfectly for what I need it for. Feels ...,1
1,Ended within a year.,EDIT: I'm leaving my original review below but...,0
2,"I am bursting with food cravings, and this is ...",I like this product.,1
3,The color change on my entire body was quite s...,"The ends are fairly sharp, but some were dulle...",2
4,Our mobility is compromised by the screws fall...,"I bought this product because of the ratings, ...",0
...,...,...,...
1499945,"It's not entirely overstated, but it'll be a b...",My expectations were probably a bit high since...,2
1499946,I have loved Steve since the mid 80's. I would...,"I am not terribly impressed with this case, gl...",2
1499947,The quality of things is not good.,"If I could have 0 stars I would, this was comp...",0
1499948,The availability of multiple size options in C...,Great product and quality. I recommend this pr...,1


In [77]:
# Build the paraphrase-augmented evaluation set: every validation text and each
# of its 4 paraphrases is paired with every centroid example of the row's label.

# Create the output directory up front: to_csv does not create missing
# directories, and failing only after the hours-long loop would lose all of it.
os.makedirs("datasets/corrupted", exist_ok=True)

new_test_set_records = []
for _, row in tqdm(test_set.iterrows(), total=len(test_set)):
    current_label = row["label"]
    current_text = row["text"]
    augmentations = get_paraphrase_augmentations(current_text,
                             paraphrase_tokenizer,
                             paraphrase_model,
                             paraphrase_model.device,
                             num_return_sequences=4,
                             temperature=0.3)

    # Original text plus its paraphrases; built once per row instead of once
    # per centroid example.
    text_inputs = [current_text] + augmentations
    for example in centroid_examples[current_label]:
        for text_input in text_inputs:
            new_test_set_records.append({"text": text_input, "label": example, "class": current_label})

# A fixed random_state makes the shuffle reproducible across runs.
rewrite_test_set = pd.DataFrame(new_test_set_records).sample(frac=1, random_state=42).reset_index(drop=True)
# Written under datasets/corrupted/, matching the non-augmented CSVs
# (this path previously read "datasets/corruped/").
rewrite_test_set.to_csv("datasets/corrupted/boss_sentiment_augmented_test.csv", index=False)
display(rewrite_test_set)

100%|██████████| 38904/38904 [8:47:49<00:00,  1.23it/s]   


Unnamed: 0,text,label,class
0,Good quality,Works perfectly for what I need it for. Feels ...,1
1,It's a wonderful invention.,Really a very nice product. Works great and so...,1
2,"Despite trying out other filament brands, I'll...",Really a very nice product. Works great and so...,1
3,Utilizing H & R Block (Tax Cut) for over 10 ye...,"Excellent product, I love it.",1
4,Staying steady is a great value. The price is ...,Works perfectly for what I need it for. Feels ...,1
...,...,...,...
1945195,Similar to my old experience of buying from be...,Really a very nice product. Works great and so...,1
1945196,Comprehending my expectations.,I like this product.,1
1945197,I acquired this item for a project that is rel...,very nice & I like it for everything I used it...,1
1945198,"The matte finish necessitates clear coats, inc...",Nothing special. Just like every other strap o...,2
