In [1]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, LlamaTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _parse_billions(name_part):
    """Return the size encoded by a name fragment such as ``13b`` or ``1.3B``, else ``None``."""
    if not name_part or not name_part[0].isdigit() or not name_part.lower().endswith("b"):
        return None
    try:
        # float() rather than int() so fractional sizes ("1.3b") do not raise.
        return float(name_part[:-1])
    except ValueError:
        return None


def get_model_objects(model_name, num_labels, training=False):
    """Load the tokenizer and model that match a checkpoint's declared architecture.

    The first entry of the config's ``architectures`` list selects the model class:
    ``*ForCausalLM`` -> AutoModelForCausalLM, ``*ForQuestionAnswering`` ->
    AutoModelForQuestionAnswering, ``*ForConditionalGeneration`` -> AutoModelForSeq2SeqLM,
    anything else -> AutoModelForSequenceClassification.

    Args:
        model_name: Hub id or local path passed to ``from_pretrained``.
        num_labels: Number of labels; only used by the sequence-classification branch.
        training: When True, weights load as float32 (float16 otherwise) and causal LMs
            are loaded in 8-bit mode.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is always returned in ``eval()`` mode,
        even when ``training`` is True.
    """
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # `architectures` can be absent or None; fall back to an empty name so the
    # checks below select the sequence-classification branch instead of raising.
    architectures = getattr(model_config, "architectures", None) or [""]
    primary_architecture = architectures[0]
    is_seq2seq_lm = primary_architecture.endswith("ForConditionalGeneration")
    is_qa_model = primary_architecture.endswith("ForQuestionAnswering")
    is_llm = primary_architecture.endswith("ForCausalLM")
    # Parenthesised on purpose: `and` binds tighter than `or`, so without the
    # parentheses any name containing "vicuna" matched even for non-causal models.
    is_llama_based_model = is_llm and ("llama" in model_name or "vicuna" in model_name)

    tokenizer = LlamaTokenizer.from_pretrained(model_name) if is_llama_based_model else AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when there is one; otherwise fall back to "<s>".
        # (Comparing str(eos_token) against None/"" never matched, leaving pad_token unset.)
        has_eos_token = tokenizer.eos_token not in [None, ""]
        tokenizer.pad_token = tokenizer.eos_token if has_eos_token else "<s>"

    model = None
    device = "cuda" if torch.cuda.is_available() else "cpu"
    numerical_precision = torch.float32 if training else torch.float16
    if is_llm:
        # Infer the parameter count from "-"-separated name fragments such as "13b".
        parsed_sizes = (_parse_billions(entry) for entry in model_name.split("-"))
        num_billions = [size for size in parsed_sizes if size is not None]
        load_in_8bit = (len(num_billions) > 0 and num_billions[0] > 7) or training
        if load_in_8bit:
            print("Loading in 8-bit mode since the model has more than 7B parameters or we are training.")
            # device_map="auto" handles placement, so no explicit .to(device) here.
            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, load_in_8bit=True, llm_int8_threshold=0, device_map="auto").eval()
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    elif is_qa_model:
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    elif is_seq2seq_lm:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=numerical_precision).eval().to(device)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, num_labels=num_labels).eval().to(device)
    return tokenizer, model

# Load the paraphraser checkpoint used to generate augmentations below.
# `num_labels` is only read by the sequence-classification branch of
# `get_model_objects`, so -1 acts as a placeholder here (assuming this checkpoint
# declares a *ForConditionalGeneration architecture — TODO confirm).
paraphrase_tokenizer, paraphrase_model = get_model_objects("humarin/chatgpt_paraphraser_on_T5_base", num_labels=-1)

In [None]:
def get_paraphrase_augmentations(
    question,
    paraphrase_tokenizer,
    paraphrase_model,
    device,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=3,
    repetition_penalty=100.0,
    diversity_penalty=100.0,
    no_repeat_ngram_size=10,
    temperature=0.7,
    max_length=128,
):
    """Generate paraphrases of ``question`` with a seq2seq paraphrasing model.

    The text is prefixed with ``"paraphrase: "``, tokenized (truncated to
    ``max_length``), moved to ``device`` and passed to ``paraphrase_model.generate``
    together with the decoding options given as keyword arguments.

    Returns:
        The output of ``paraphrase_tokenizer.batch_decode`` on the generated
        sequences, with special tokens skipped.
    """
    prompt = f"paraphrase: {question}"
    encoding = paraphrase_tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoding.input_ids.to(device)

    # Decoding options forwarded verbatim to generate().
    generation_options = {
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "num_return_sequences": num_return_sequences,
        "no_repeat_ngram_size": no_repeat_ngram_size,
        "num_beams": num_beams,
        "num_beam_groups": num_beam_groups,
        "max_length": max_length,
        "diversity_penalty": diversity_penalty,
    }
    generated = paraphrase_model.generate(prompt_ids, **generation_options)

    return paraphrase_tokenizer.batch_decode(generated, skip_special_tokens=True)

# Single example sentence used to sanity-check the paraphraser.
example_text = "demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . "
example_augs = get_paraphrase_augmentations(
    question=example_text,
    paraphrase_tokenizer=paraphrase_tokenizer,
    paraphrase_model=paraphrase_model,
    device=paraphrase_model.device,
)
example_augs

In [None]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch
import torch.nn.functional as F

# Mean pooling: average token embeddings, counting only positions the attention mask keeps.
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    ``model_output[0]`` is taken as the token embeddings; the mask is broadcast to
    the same size and cast to float. The denominator is clamped to at least 1e-9
    so a row whose mask is all zeros does not divide by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = torch.sum(hidden_states * weights, 1)
    kept_token_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_total / kept_token_count


def get_embeddings(tokenizer, model, left_text, right_text=None):
    """Embed ``left_text`` (optionally paired with ``right_text``) via mean pooling.

    Args:
        tokenizer: Tokenizer called as ``tokenizer(left_text, right_text, ...)``.
        model: Model whose first output element holds the token embeddings.
        left_text: Text to embed.
        right_text: Optional second text. When given, the tokenizer receives both
            texts and the result is one embedding for the pair. Defaults to
            ``None`` (embed ``left_text`` on its own).

    Returns:
        The mean-pooled embedding tensor produced by ``mean_pooling``.
    """
    encoded_input = tokenizer(left_text, right_text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Keep the inputs on the model's device so this also works for a model moved to GPU.
    model_device = getattr(model, "device", None)
    if model_device is not None:
        encoded_input = encoded_input.to(model_device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])


def get_cosine_similarity(sentence_tokenizer, sentence_model, left_text, right_text):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded on its own. Embedding the (left, right) and (right, left)
    pairs instead would put both texts into both vectors, so the score would not
    measure how similar the two texts are to each other.

    Returns:
        The similarity as a Python float (via ``.item()``, so one text per side).
    """
    # Passing None as the second text makes the tokenizer encode a single sequence.
    left_embedding = get_embeddings(sentence_tokenizer, sentence_model, left_text, None)
    right_embedding = get_embeddings(sentence_tokenizer, sentence_model, right_text, None)
    return F.cosine_similarity(left_embedding, right_embedding).item()


# Checkpoint used to embed texts for the cosine-similarity checks.
hf_model_path = "sentence-transformers/all-mpnet-base-v2"
sentence_tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
# Loaded with AutoModel; no explicit device placement is done here.
sentence_model = AutoModel.from_pretrained(hf_model_path)

In [None]:
# Print the similarity score of each generated paraphrase against the original example sentence.
for aug in example_augs:
    print(get_cosine_similarity(sentence_tokenizer, sentence_model, aug, example_text))

In [None]:
import nlpaug.augmenter.word as naw

# Use the GPU only when one is available, matching how the models above pick
# their device; a hard-coded "cuda" fails on CPU-only machines.
augmenter_device = "cuda" if torch.cuda.is_available() else "cpu"
word_augmenter = naw.ContextualWordEmbsAug(device=augmenter_device, action="insert")
random_deleter = naw.RandomWordAug(action="delete", aug_p=0.30)

# Corrupt each paraphrase: randomly delete words, then insert contextual words.
sub_augs = [word_augmenter.augment(random_deleter.augment(aug)) for aug in example_augs]
# Each augment() result is indexed as a sequence; keep the first item of non-empty results.
sub_augs = [aug[0] for aug in sub_augs if len(aug) > 0]
sub_augs


In [None]:
# Score every corrupted paraphrase against the original example sentence.
aug_cosine_pairs = [
    (aug, get_cosine_similarity(sentence_tokenizer, sentence_model, aug, example_text))
    for aug in sub_augs
]
cosines = [cosine for _, cosine in aug_cosine_pairs]
display(aug_cosine_pairs)

In [None]:
# Each enumerated item is (index, (text, cosine)); rank on the cosine itself.
# Using x[1] as the key compared the (text, cosine) tuples, i.e. ordered by text first.
max(enumerate(aug_cosine_pairs), key=lambda x: x[1][1])

## Create Corrupted IMDB

In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

# The review text becomes "label" and the original label becomes "class".
imdb_train_split = (
    load_dataset("imdb", split="train")
    .to_pandas()
    .rename(columns={"label": "class", "text": "label"})
)
imdb_train_split

In [None]:
def get_augmentation(current_text):
    """Paraphrase ``current_text``, corrupt each paraphrase, and pick one by similarity.

    Uses the notebook-level ``paraphrase_tokenizer``/``paraphrase_model``,
    ``word_augmenter``/``random_deleter`` and ``sentence_tokenizer``/``sentence_model``.

    Returns:
        A ``(corrupted_text, cosine_similarity)`` pair: the corrupted paraphrase with
        the highest cosine similarity to ``current_text``.

    Raises:
        ValueError: If no corrupted paraphrase could be produced.
    """
    paraphrases = get_paraphrase_augmentations(current_text, paraphrase_tokenizer, paraphrase_model, paraphrase_model.device)
    # Corrupt the paraphrases of *this* text (not the notebook-level example lists).
    corrupted_paraphrases = [word_augmenter.augment(random_deleter.augment(aug)) for aug in paraphrases]
    # Each augment() result is indexed as a sequence; keep the first item of non-empty results.
    corrupted_paraphrases = [aug[0] for aug in corrupted_paraphrases if len(aug) > 0]
    if not corrupted_paraphrases:
        raise ValueError(f"No corrupted paraphrase could be generated for text: {current_text!r}")
    corrupted_cosines = [get_cosine_similarity(sentence_tokenizer, sentence_model, current_text, aug) for aug in corrupted_paraphrases]
    corrupted_aug_cosine_pairs = list(zip(corrupted_paraphrases, corrupted_cosines))
    # Rank on the cosine (second element); comparing whole pairs would order by text first.
    # NOTE(review): max() keeps the candidate most similar to the input. If "most
    # corrupted" is meant to be the least similar candidate, this should be min() — confirm.
    most_corrupted = max(corrupted_aug_cosine_pairs, key=lambda pair: pair[1])
    return most_corrupted


# Fixed seed so the same 1000 rows are drawn on every run.
sample = imdb_train_split.sample(1000, random_state=42)
# get_augmentation returns a (corrupted_text, cosine) pair per row; split it into two
# columns so "text" holds plain strings rather than tuples.
augmentation_pairs = sample["label"].progress_apply(get_augmentation)
sample["text"] = augmentation_pairs.map(lambda pair: pair[0])
sample["text_cosine"] = augmentation_pairs.map(lambda pair: pair[1])
sample

# Evaluate Model

In [9]:
# Location of the fine-tuned rewriter checkpoint. The default is a machine-specific
# absolute path; set the REWRITER_MODEL_PATH environment variable to point elsewhere.
rewriter_model_path = os.environ.get(
    "REWRITER_MODEL_PATH",
    "/home/kyle/repos/In-Context-Domain-Transfer-Improves-Out-of-Domain-Robustness/trained_models/training_1691034784_datasets_corruped_boss_sentiment20000_selected_models_boss_sentiment_bert.csv_t5-large/best_F1=0.6355934739112854",
)
rewriter_tokenizer, rewriter_model = get_model_objects(rewriter_model_path, 1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rewriter_model = rewriter_model.to(device)

In [16]:
example_text = "while initially only the very three - fold book, one is interesting to readers like menage. the book are already back to production."
# target = While this is not the best threefold it certainly is hot! The perfect story for first-time Menage readers. The second book is already waiting for me.
tokenized_input = rewriter_tokenizer.encode(example_text, return_tensors="pt")
tokenized_input = tokenized_input.to(rewriter_model.device)

# No sampling/length options (do_sample, temperature, max_new_tokens, early_stopping)
# are passed; generate() runs with its defaults and returns a dict-style output.
outputs = rewriter_model.generate(tokenized_input, return_dict_in_generate=True)

decoded_outputs = rewriter_tokenizer.decode(
    outputs.sequences[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
decoded_outputs

'The book is very three fold and it is very interesting to read.'