## Prepare SNLI and Contrast Data

In [21]:
# Load the SNLI dataset through the `datasets` library.
import datasets
dataset = datasets.load_dataset('snli')
# Bare last expression: the notebook renders the returned object's summary
# (its splits, feature names and row counts).
dataset


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

The original SNLI training set contains 550,152 examples.
The original SNLI validation set contains 10,000 examples.

In [31]:
import pandas as pd
# Integer ids assigned to the gold labels; the validation cell further down
# reuses this mapping.
labelMap = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

train_raw = pd.read_csv(
    'LIT_auto-gen-contrast-set/generate/snli_aug/train.tsv',
    sep='\t',
)

# Drop the rows whose captionID is 'original', renumber the remainder from 0,
# keep the four columns of interest, encode the labels as integers and rename
# the columns to premise / hypothesis / label / type.
contrast_train = (
    train_raw
    .loc[train_raw['captionID'] != 'original']
    .reset_index()
    .loc[:, ['sentence1', 'sentence2', 'gold_label', 'captionID']]
    .replace({"gold_label": labelMap})
    .rename(columns={
        'sentence1': 'premise',
        'sentence2': 'hypothesis',
        'gold_label': 'label',
        'captionID': 'type',
    })
)
contrast_train

Unnamed: 0,premise,hypothesis,label,type
0,It is a man who is lying on an air mattress.,It is the man which is asleep.,1,*;it cleft: ARG1;it cleft: ARG1
1,Her wind instrument is played by a woman along...,A band is belonged to by the woman.,1,*;passive: ARG2;passive: ARG2
2,It is a woman who plays her wind instrument al...,It is the woman who belongs to a band.,1,*;it cleft: ARG1;it cleft: ARG1
3,A girl with a ponytail in a black soccer unifo...,A player will get ready to kick the ball.,1,entailment;past simple;future simple
4,A girl with a ponytail in a black soccer unifo...,A player gets ready to kick the ball.,1,entailment;may;original
...,...,...,...,...
189109,Children may play in a water fountain.,The children may be wet.,1,entailment;may;may
189110,It is children who play in a water fountain.,It is the children who are wet.,0,*;it cleft: ARG1;it cleft: ARG1
189111,These may be very well groomed animals.,The animal's fur looked well kept,1,entailment;may;original
189112,These may be very well groomed animals.,The animal's fur may look well-kept.,1,entailment;may;may


In [32]:
from pathlib import Path

output_path = "contrast/train.jsonl"

# A fresh checkout may not have the contrast/ directory yet; create it so the
# write below does not fail with FileNotFoundError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write one JSON record per line (JSON Lines) straight to disk instead of
# building the whole serialized string in memory first.
contrast_train.to_json(output_path, orient='records', lines=True)

In [33]:
eval_raw = pd.read_csv(
    'LIT_auto-gen-contrast-set/generate/snli_aug/validation.tsv',
    sep='\t',
)

# Same preparation as the training split: drop the rows whose captionID is
# 'original', renumber from 0, keep the four columns of interest, encode the
# labels with labelMap (defined in the training cell) and rename the columns
# to premise / hypothesis / label / type.
contrast_eval = (
    eval_raw
    .loc[eval_raw['captionID'] != 'original']
    .reset_index()
    .loc[:, ['sentence1', 'sentence2', 'gold_label', 'captionID']]
    .replace({"gold_label": labelMap})
    .rename(columns={
        'sentence1': 'premise',
        'sentence2': 'hypothesis',
        'gold_label': 'label',
        'captionID': 'type',
    })
)
contrast_eval

Unnamed: 0,premise,hypothesis,label,type
0,As people walk by it is a woman who is eating ...,It is a woman who is eating lunch while on her...,1,*;it cleft: ARG1;it cleft: ARG1
1,It is the pitcher wearing black who heaves the...,It is the Yankees who are at practice.,1,*;it cleft: ARG1;it cleft: ARG1
2,A group of Chinese people may be relaxing in t...,A group of Americans are in water.,1,contradiction;may;original
3,A group of Chinese people may be relaxing in t...,A group of Americans may be in water.,1,contradiction;may;may
4,It is a group of Chinese people who is relaxin...,It is a group of Americans which is in water.,2,*;it cleft: ARG1;it cleft: ARG1
...,...,...,...,...
4199,A woman may observe an antique car rusted.,A woman may look at an old car.,1,entailment;may;may
4200,A rusted antique car is observed by a woman.,An old car is looked at by a woman.,0,*;passive: ARG2;passive: ARG2
4201,It is a woman who observes an antique car rusted.,It is a woman who looks at an old car.,0,*;it cleft: ARG1;it cleft: ARG1
4202,It is a woman who observed an antique car rusted.,It is a woman who will look at an old car.,1,entailment;past simple+it cleft: ARG1;future s...


In [34]:
from pathlib import Path

output_path = "contrast/validation.jsonl"

# A fresh checkout may not have the contrast/ directory yet; create it so the
# write below does not fail with FileNotFoundError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write one JSON record per line (JSON Lines) straight to disk instead of
# building the whole serialized string in memory first.
contrast_eval.to_json(output_path, orient='records', lines=True)

The contrast training set contains 189,113 examples (rows indexed 0–189,112).
The contrast validation set contains 4,203 examples (rows indexed 0–4,202).

## Running Evaluation on the SNLI and Contrast Validation Datasets with the Pretrained Model

In [36]:
# python3 run.py --do_eval --task nli --dataset snli --model ./trained_model/checkpoint-206000   --output_dir ./eval_output/snli/
# python3 run.py --do_eval --task nli --dataset contrast/validation.jsonl   --model ./trained_model/checkpoint-206000   --output_dir ./eval_output/contrast/

Evaluation results for the pretrained checkpoint (before fine-tuning on the contrast set): <br><br>
SNLI: <br>
{'eval_loss': 0.3796271085739136, 'eval_accuracy': 0.8947368264198303, 'eval_runtime': 15.2123, 'eval_samples_per_second': 646.977, 'eval_steps_per_second': 80.921} <br><br>
Contrast: <br>
{'eval_loss': 2.7356245517730713, 'eval_accuracy': 0.454091340303421, 'eval_runtime': 7.0502, 'eval_samples_per_second': 596.296, 'eval_steps_per_second': 74.608}

### TODO: need to categorize error types and visualize them

In [None]:
# Much work needs to be done here

## Finetune on Contrast Training Dataset

In [None]:
# python3 run.py --do_train --task nli --dataset contrast/train.jsonl --output_dir ./trained_model/  --model trained_model/checkpoint-206000

Evaluation results after fine-tuning on the contrast training set: <br><br>
SNLI: <br>
{'eval_loss': 0.5675817131996155, 'eval_accuracy': 0.8639504313468933, 'eval_runtime': 15.3651, 'eval_samples_per_second': 640.543, 'eval_steps_per_second': 80.117} <br><br>
Contrast: <br>
{'eval_loss': 0.24433456361293793, 'eval_accuracy': 0.940057098865509, 'eval_runtime': 6.8735, 'eval_samples_per_second': 611.623, 'eval_steps_per_second': 76.526}

## Additional Work:
- Generate Adversarial Dataset and Finetune (risk)
- Blend Original Training Dataset, Contrast Dataset, Adversarial Dataset, and Finetune
