## Generate Training and Validation Adversarial Datasets

In [None]:
! pip install textattack

In [None]:
import textattack

def apply_attack_to_premise(dataset, attack):
    """Attack the premise of every example and keep the successful perturbations.

    Args:
        dataset: Iterable of ``(premise, hypothesis, label)`` triples.
        attack: Object exposing ``attack(text, label)``; only the premise text
            and the label are passed to it.

    Returns:
        A list of ``(original_premise, perturbed_premise, original_hypothesis,
        label)`` tuples, one per ``SuccessfulAttackResult`` produced.
    """
    adversarial_rows = []
    for premise, hypothesis, gold_label in dataset:
        outcome = attack.attack(premise, gold_label)

        # The attack may hand back one result or an iterable of them;
        # normalise to an iterable so both cases share the same filter.
        candidates = outcome if hasattr(outcome, '__iter__') else (outcome,)
        for candidate in candidates:
            if not isinstance(candidate, textattack.attack_results.SuccessfulAttackResult):
                continue
            adversarial_rows.append(
                (premise, candidate.perturbed_text(), hypothesis, gold_label)
            )
    return adversarial_rows

def apply_attack_to_hypothesis(dataset, attack):
    """Attack the hypothesis of every example and keep the successful perturbations.

    Args:
        dataset: Iterable of ``(premise, hypothesis, label)`` triples.
        attack: Object exposing ``attack(text, label)``; only the hypothesis
            text and the label are passed to it.

    Returns:
        A list of ``(original_premise, original_hypothesis,
        perturbed_hypothesis, label)`` tuples, one per
        ``SuccessfulAttackResult`` produced.
    """
    adversarial_rows = []
    for premise, hypothesis, gold_label in dataset:
        outcome = attack.attack(hypothesis, gold_label)

        # The attack may hand back one result or an iterable of them;
        # normalise to an iterable so both cases share the same filter.
        candidates = outcome if hasattr(outcome, '__iter__') else (outcome,)
        for candidate in candidates:
            if not isinstance(candidate, textattack.attack_results.SuccessfulAttackResult):
                continue
            adversarial_rows.append(
                (premise, hypothesis, candidate.perturbed_text(), gold_label)
            )
    return adversarial_rows


In [None]:
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder
from textattack.constraints.grammaticality import PartOfSpeech
from textattack.transformations import WordSwapEmbedding
from textattack.search_methods import GreedySearch
from textattack.goal_functions import UntargetedClassification
from textattack import Attack

# Custom attack recipe tailored to our specific problem
class CustomRecipe(Attack):
    """Attack that swaps words for embedding neighbours under a greedy search.

    The goal is untargeted misclassification of ``model``. Candidate swaps are
    filtered by the repeat-modification, stopword, sentence-encoder
    (threshold 0.8) and part-of-speech constraints.
    """

    def __init__(self, model):
        """Assemble the attack components around ``model``.

        Args:
            model: The model handed to ``UntargetedClassification``.
        """
        # Up to 10 replacement candidates are considered per word.
        word_swap = WordSwapEmbedding(max_candidates=10)

        filters = [
            RepeatModification(),
            StopwordModification(),
            UniversalSentenceEncoder(threshold=0.8),
            PartOfSpeech(),
        ]

        greedy = GreedySearch()
        objective = UntargetedClassification(model)

        super().__init__(
            goal_function=objective,
            constraints=filters,
            transformation=word_swap,
            search_method=greedy,
        )


In [None]:
from transformers import AutoModel, AutoTokenizer
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.datasets import Dataset
from transformers import ElectraForSequenceClassification

# Checkpoint directory; both the model weights and the tokenizer are loaded
# from this same path.
model_path = '/content/NLP_FP/nlp_fp/trained_model/checkpoint/checkpoint-206000'
model = ElectraForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


# Switch the model to evaluation mode before it is attacked.
model.eval()

# Wrap the model and tokenizer so TextAttack can query them.
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Build the custom attack recipe around the wrapped model.
attack = CustomRecipe(model_wrapper)

class CustomTextAttackDataset(Dataset):
    """Thin ``Dataset`` subclass that exposes an in-memory sequence by index.

    NOTE(review): the base-class initializer is never called here; confirm
    nothing downstream relies on attributes it would normally set.
    """

    def __init__(self, data):
        """Store ``data`` as-is; it must support ``len()`` and indexing."""
        self.data = data

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stored example at position ``idx`` unchanged."""
        return self.data[idx]

In [None]:
import json
import subprocess
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Rows per chunk, and the total number of rows of df_ad to process.
num_examples = 1000
total_examples = 10000


def _run_git(*args):
    """Run ``git <args>`` and return True when it exits with status 0.

    ``subprocess.run`` does not raise on a non-zero exit status by default,
    so the status is checked here and a failure is reported on stdout.
    Failures are reported rather than raised so that a git problem does not
    abort the attack loop after a chunk has already been written to disk.
    """
    completed = subprocess.run(["git", *args])
    succeeded = completed.returncode == 0
    if not succeeded:
        print(f"git {' '.join(args)} failed with exit status {completed.returncode}")
    return succeeded


# The commit identity has to be configured BEFORE the first `git commit`;
# on a machine without one, every commit in the loop below would fail.
_run_git("config", "--global", "user.email", "joliefang@utexas.edu")
_run_git("config", "--global", "user.name", "JoFangUTA")

all_committed = True

for i in range(0, total_examples, num_examples):
    chunk_number = i // num_examples + 1

    # (premise, hypothesis, label) triples for the current slice of df_ad.
    limited_combined_data = [(row['premise'], row['hypothesis'], row['label'])
                             for _, row in df_ad.iloc[i:i+num_examples].iterrows()]

    custom_dataset = CustomTextAttackDataset(limited_combined_data)
    transformed_premise_data = apply_attack_to_premise(custom_dataset, attack)

    # Keep only the adversarial premise; the original premise is dropped.
    structured_premise_data = [
        {
            'premise': adv_premise,
            'hypothesis': orig_hypothesis,
            'label': label
        }
        for _orig_premise, adv_premise, orig_hypothesis, label in transformed_premise_data
    ]

    file_name = f'transformed_premise_data_{chunk_number}.json'

    with open(file_name, 'w') as outfile:
        json.dump(structured_premise_data, outfile, indent=4)

    # Only claim the chunk was committed when both git steps succeeded.
    committed = (
        _run_git("add", file_name)
        and _run_git("commit", "-m", f"Add processed chunk {chunk_number}")
    )
    if committed:
        print(f"Processed, saved, and committed chunk {chunk_number}")
    else:
        all_committed = False
        print(f"Processed and saved chunk {chunk_number}, but it was NOT committed")

pushed = _run_git("push", "origin", "main")

if pushed and all_committed:
    print("All chunks processed, saved, committed, and pushed.")
else:
    print("All chunks processed and saved, but some git steps failed; see messages above.")

In [None]:
import json
import os
import logging


logging.basicConfig(level=logging.INFO)

# Chunk files are numbered 1..num_files inside `directory`.
num_files = 50
directory = "/content/NLP_FP/nlp_fp/"
file_names = [f'{directory}transformed_hypothesis_data_{i}.json' for i in range(1, num_files + 1)]
combined_data = []
missing_files = []

for chunk_path in file_names:
    # Absent chunks are only recorded; they are reported once at the end.
    if not os.path.exists(chunk_path):
        missing_files.append(chunk_path)
        continue

    try:
        with open(chunk_path, 'r') as source:
            records = json.load(source)
    except json.JSONDecodeError:
        # A malformed chunk is logged and skipped; the remaining chunks still load.
        logging.error(f"Error decoding JSON from {chunk_path}")
    else:
        combined_data.extend(records)

# Write everything that was read into a single combined file.
output_file = f'{directory}new_combined_transformed_hypothesis_data.json'
try:
    with open(output_file, 'w') as sink:
        json.dump(combined_data, sink, indent=4)
    logging.info(f"All files have been combined into '{output_file}'")
except Exception as write_error:
    logging.error(f"Error writing to combined file: {write_error}")

if missing_files:
    logging.warning(f"The following files were not found: {', '.join(missing_files)}")

In [3]:
# split dataset into training and validation
import json
from sklearn.model_selection import train_test_split

# Load the full cleaned dataset from disk.
data_path = '/Users/joliefang/Downloads/adver_Data/cleaned_addata/cleaned_final_data.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Hold out 20% for validation; the fixed seed keeps the split reproducible.
train_data, validation_data = train_test_split(data, test_size=0.2, random_state=42)

# Write each split to its own JSON file (training first, then validation).
for split_path, split_records in (
    ('/Users/joliefang/Downloads/adver_Data/cleaned_addata/train_data.json', train_data),
    ('/Users/joliefang/Downloads/adver_Data/cleaned_addata/validation_data.json', validation_data),
):
    with open(split_path, 'w') as file:
        json.dump(split_records, file)

# Report the size of the full dataset and of each split.
print(f"Total records: {len(data)}")
print(f"Training records: {len(train_data)}")
print(f"Validation records: {len(validation_data)}")

Total records: 16482
Training records: 13185
Validation records: 3297


In [3]:
# python3 run.py --do_eval --task nli --dataset snli --model ./trained_model/checkpoint-206000   --output_dir ./eval_output/snli/
!python3 /Users/joliefang/Downloads/adver_Data/fp_run/run.py --do_eval --task nli --dataset /Users/joliefang/Downloads/adver_Data/adversarial_data/validation_data.json   --model /Users/joliefang/Downloads/checkpoint-206000  --output_dir /Users/joliefang/Downloads/adver_Data/adversarial_data/eval_output

2023-12-03 13:29:44.948741: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading data files: 100%|███████████████████| 1/1 [00:00<00:00, 8338.58it/s]
Extracting data files: 100%|█████████████████████| 1/1 [00:00<00:00, 194.41it/s]
Generating train split: 3297 examples [00:00, 101474.35 examples/s]
Preprocessing data... (this takes a little bit, should only happen once per dataset)
Map (num_proc=2): 100%|████████████| 3297/3297 [00:00<00:00, 6022.77 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████

{"eval_loss": 0.8307645916938782, "eval_accuracy": 0.7992113828659058, "eval_runtime": 129.7323, "eval_samples_per_second": 25.414, "eval_steps_per_second": 3.183}

## Fine-tune on the Adversarial Dataset

In [2]:
!python3 /Users/joliefang/Downloads/adver_Data/fp_run/run.py --do_train --task nli --dataset ./train_data.json --output_dir /Users/joliefang/Downloads/adver_Data/output  --model /Users/joliefang/Downloads/checkpoint-206000

2023-12-03 11:16:03.919620: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading data files: 100%|███████████████████| 1/1 [00:00<00:00, 5433.04it/s]
Extracting data files: 100%|█████████████████████| 1/1 [00:00<00:00, 155.13it/s]
Generating train split: 13185 examples [00:00, 111314.65 examples/s]
Preprocessing data... (this takes a little bit, should only happen once per dataset)
Map (num_proc=2): 100%|██████████| 13185/13185 [00:03<00:00, 4343.04 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
  0%|                                                  | 0/4947 [00:00<?, ?it/s]You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text fo

In [4]:
!python3 /Users/joliefang/Downloads/adver_Data/fp_run/run.py --do_eval --task nli --dataset /Users/joliefang/Downloads/adver_Data/adversarial_data/validation_data.json   --model /Users/joliefang/Downloads/adver_Data/output/checkpoint-4500 --output_dir /Users/joliefang/Downloads/adver_Data/finetune_ad_eval_output

2023-12-03 13:36:03.871163: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Preprocessing data... (this takes a little bit, should only happen once per dataset)
Map (num_proc=2): 100%|████████████| 3297/3297 [00:00<00:00, 5451.67 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████| 413/413 [02:10<00:00,  3.17it/s]
Evaluation results:
{'eval_loss': 0.46119940280914307, 'eval_accuracy': 0.9126478433609009, 'eval_runtime': 131.047, 'eval_samples_per_second': 25.159, 'eval_steps_per

{"eval_loss": 0.46119940280914307, "eval_accuracy": 0.9126478433609009, "eval_runtime": 131.047, "eval_samples_per_second": 25.159, "eval_steps_per_second": 3.152}

In [6]:
!python3 /Users/joliefang/Downloads/adver_Data/fp_run/run.py --do_eval --task nli --dataset snli  --model /Users/joliefang/Downloads/adver_Data/output/checkpoint-4500 --output_dir /Users/joliefang/Downloads/adver_Data/finetune_og_snli_test

2023-12-03 13:43:00.310875: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Preprocessing data... (this takes a little bit, should only happen once per dataset)
Map (num_proc=2): 100%|████████████| 9842/9842 [00:01<00:00, 5151.32 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|███████████████████████████████████████| 1231/1231 [06:16<00:00,  3.27it/s]
Evaluation results:
{'eval_loss': 0.8638477921485901, 'eval_accuracy': 0.8518593907356262, 'eval_runtime': 376.954, 'eval_samples_per_second': 26.109, 'eval_steps_per_

In [9]:
from tabulate import tabulate
# create data
# One row per model: name, then the two accuracy figures.
data = [
    ["Pretrain", 89.47, 45.41],
    ["Contrast-Finetune", 86.40, 94.00],
]

# Column headers, in the same order as the values in each row.
col_names = ["Model", "accuracy on SNLI", "accuracy on SNLI-contrast"]

# Render the table first, then print it.
rendered_table = tabulate(data, headers=col_names, tablefmt="fancy_grid")
print(rendered_table)

╒═══════════════════╤════════════════════╤═════════════════════════════╕
│ Model             │   accuracy on SNLI │   accuracy on SNLI-contrast │
╞═══════════════════╪════════════════════╪═════════════════════════════╡
│ Pretrain          │              89.47 │                       45.41 │
├───────────────────┼────────────────────┼─────────────────────────────┤
│ Contrast-Finetune │              86.4  │                       94    │
╘═══════════════════╧════════════════════╧═════════════════════════════╛


In [10]:
!python3 /Users/joliefang/Downloads/adver_Data/fp_run/run.py --do_eval --task nli --dataset /Users/joliefang/Downloads/adver_Data/adversarial_data/contrast_validation.jsonl  --model /Users/joliefang/Downloads/adver_Data/output/checkpoint-4500 --output_dir /Users/joliefang/Downloads/adver_Data/contrast_eval

2023-12-03 21:53:17.039846: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading data files: 100%|███████████████████| 1/1 [00:00<00:00, 7570.95it/s]
Extracting data files: 100%|█████████████████████| 1/1 [00:00<00:00, 224.46it/s]
Generating train split: 4204 examples [00:00, 150019.60 examples/s]
Preprocessing data... (this takes a little bit, should only happen once per dataset)
Map (num_proc=2): 100%|████████████| 4204/4204 [00:01<00:00, 3552.97 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████