In [2]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, LlamaTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForQuestionAnswering


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_model_objects(model_name, num_labels, training=False):
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    is_seq2seq_lm = model_config.architectures[0].endswith("ForConditionalGeneration")
    is_qa_model = model_config.architectures[0].endswith("ForQuestionAnswering")
    is_llm = model_config.architectures[0].endswith("ForCausalLM")
    is_llama_based_model = is_llm and "llama" in model_name or "vicuna" in model_name

    tokenizer = LlamaTokenizer.from_pretrained(model_name) if is_llama_based_model else AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<s>" if tokenizer.pad_token in [None, ""] and str(tokenizer.eos_token) in [None, ""] else tokenizer.eos_token

    model = None
    device = "cuda" if torch.cuda.is_available() else "cpu"
    numerical_precision = torch.float32 if training else torch.float16
    if is_llm:
        num_billions = [int(entry[:-1]) for entry in model_name.split("-") if entry[0].isdigit() and entry.lower().endswith("b")]
        load_in_8bit = (len(num_billions) > 0 and num_billions[0] > 7) or training
        if load_in_8bit:
            print("Loading in 8-bit mode since the model has more than 7B parameters or we are training.")
            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, load_in_8bit=True, llm_int8_threshold=0, device_map="auto").eval()
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    elif is_qa_model:
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    elif is_seq2seq_lm:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, num_labels=num_labels).eval().to(device)
    return tokenizer, model

paraphrase_tokenizer, paraphrase_model = get_model_objects("humarin/chatgpt_paraphraser_on_T5_base", num_labels=-1)

In [4]:
def get_paraphrase_augmentations(
    question,
    paraphrase_tokenizer,
    paraphrase_model,
    device,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=3,
    repetition_penalty=100.0,
    diversity_penalty=100.0,
    no_repeat_ngram_size=10,
    temperature=0.7,
    max_length=128,
):
    input_ids = paraphrase_tokenizer(
        f"paraphrase: {question}",
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids.to(device)

    outputs = paraphrase_model.generate(
        input_ids,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )

    res = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

example_text = "demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . "
example_augs = get_paraphrase_augmentations(example_text, paraphrase_tokenizer, paraphrase_model, paraphrase_model.device)
example_augs

['Demonstrates that the director of Hollywood blockbuster hits like Patriot Games can still produce a small, personal film with an emotional element.',
 'Shows that the director of Hollywood blockbuster hits like Patriot Games can still produce a small, personal film with an emotional element.',
 'It demonstrates that the director of Hollywood blockbuster hits like Patriot Games can still make a small, personal film with an emotional depth.']

In [5]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def get_embeddings(tokenizer, model, left_text, right_text):
    encoded_input = tokenizer(left_text, right_text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])


def get_cosine_similarity(sentence_tokenizer, sentence_model, left_text, right_text):
    left_embedding = get_embeddings(sentence_tokenizer, sentence_model, left_text, right_text)
    right_embedding = get_embeddings(sentence_tokenizer, sentence_model, right_text, left_text)
    return F.cosine_similarity(left_embedding, right_embedding).item()


hf_model_path = "sentence-transformers/all-mpnet-base-v2"
sentence_tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
sentence_model = AutoModel.from_pretrained(hf_model_path)

In [6]:
for aug in example_augs:
    print(get_cosine_similarity(sentence_tokenizer, sentence_model, aug, example_text))

0.9837890863418579
0.987225353717804
0.9868114590644836


In [7]:
import nlpaug.augmenter.word as naw
word_augmenter = naw.ContextualWordEmbsAug(device="cuda", action="insert")
random_deleter = naw.RandomWordAug(action="delete", aug_p=0.30)

sub_augs = [word_augmenter.augment(random_deleter.augment(aug)) for aug in example_augs]
sub_augs = [aug[0] for aug in sub_augs if len(aug) > 0]
sub_augs


['demonstrates first that the of hollywood blockbuster series hits which patriot still to produce remain small, personal with emotional.',
 'also shows that just the director of transforming blockbuster hits into games can still make small, personal film with.',
 'demonstrates films that the creative director of many hollywood blockbuster hits patriot games can and still make, and personal film helps with.']

In [8]:
cosines = [get_cosine_similarity(sentence_tokenizer, sentence_model, aug, example_text) for aug in sub_augs]
aug_cosine_pairs = list(zip(sub_augs, cosines))
display(aug_cosine_pairs)

[('demonstrates first that the of hollywood blockbuster series hits which patriot still to produce remain small, personal with emotional.',
  0.9846742749214172),
 ('also shows that just the director of transforming blockbuster hits into games can still make small, personal film with.',
  0.9737756848335266),
 ('demonstrates films that the creative director of many hollywood blockbuster hits patriot games can and still make, and personal film helps with.',
  0.9829103946685791)]

In [9]:
max(enumerate(aug_cosine_pairs), key=lambda x: x[1])

(0,
 ('demonstrates first that the of hollywood blockbuster series hits which patriot still to produce remain small, personal with emotional.',
  0.9846742749214172))

## Create Corrupted IMDB

In [10]:
from datasets import load_dataset
from tqdm import tqdm
tqdm.pandas()

imdb_train_split = load_dataset("imdb", split="train").to_pandas()
imdb_train_split.rename(columns={"label": "class","text": "label"}, inplace=True)
imdb_train_split

Found cached dataset imdb (/home/kyle/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Unnamed: 0,label,class
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [13]:
def get_augmentation(current_text):
    paraphrases = get_paraphrase_augmentations(current_text, paraphrase_tokenizer, paraphrase_model, paraphrase_model.device)
    corrupted_paraphrases = [word_augmenter.augment(random_deleter.augment(aug)) for aug in example_augs]
    corrupted_paraphrases = [aug for aug in sub_augs if len(aug) > 0]
    corrupted_cosines = [get_cosine_similarity(sentence_tokenizer, sentence_model, current_text, aug) for aug in corrupted_paraphrases]
    corrupted_aug_cosine_pairs = list(zip(corrupted_paraphrases, corrupted_cosines))
    most_corrupted = max(enumerate(corrupted_aug_cosine_pairs), key=lambda x: x[1])[1]
    return most_corrupted


sample = imdb_train_split.sample(1000)
sample["text"] = sample.progress_apply(lambda row: get_augmentation(row["label"]), axis=1)
sample

100%|██████████| 1000/1000 [35:31<00:00,  2.13s/it]


Unnamed: 0,label,class,text
19762,Very good dramatic comedy about a playwright t...,1,(demonstrates first that the of hollywood bloc...
3902,"Forget Plan 9, this is the ultimate fiasco, a ...",0,(demonstrates first that the of hollywood bloc...
21922,Some movies you just know you're going to love...,1,(demonstrates first that the of hollywood bloc...
2869,The past creeps up on a rehab-addict when he r...,0,(demonstrates first that the of hollywood bloc...
9288,To the small minority seen here praising this ...,0,(demonstrates first that the of hollywood bloc...
...,...,...,...
13089,This is mostly a story about the growing relat...,1,(demonstrates first that the of hollywood bloc...
4560,This film has nothing whatever to do with the ...,0,(demonstrates first that the of hollywood bloc...
20587,The Dentist starts on the morning of Dr. Alan ...,1,(demonstrates first that the of hollywood bloc...
16399,I had no idea that Mr. Izzard was so damn funn...,1,(demonstrates first that the of hollywood bloc...
