## Balance Datasets Between Original Labels

Sample **by MNLI index** from list of minimal pair edits of mask-filled sentence pairs until classes are balanced.

With replacement **by MNLI index** and without replacement **by Generated Sentence Pairs**

In [67]:
files = !ls MNLI/new
files

['5000_fine-tuned_data-slices_gold-label_beam_mnli_cond_pairs_tagged.csv',
 '5000_fine-tuned_data-slices_gold-label_diverse-beam_mnli_cond_pairs_tagged.csv',
 '5000_fine-tuned_data-slices_model-label_beam_mnli_cond_pairs_tagged.csv',
 '5000_fine-tuned_data-slices_model-label_diverse-beam_mnli_cond_pairs_tagged.csv',
 'sample_5000_fine-tuned_data-slices_gold-label_beam_mnli_cond_pairs_tagged.csv',
 'sample_5000_fine-tuned_data-slices_gold-label_diverse-beam_mnli_cond_pairs_tagged.csv',
 'sample_5000_fine-tuned_data-slices_model-label_beam_mnli_cond_pairs_tagged.csv',
 'tagged_5000_fine-tuned_data-slices_gold-label_beam_mnli_cond_pairs_tagged.csv',
 'tagged_5000_fine-tuned_data-slices_gold-label_diverse-beam_mnli_cond_pairs_tagged.csv',
 'tagged_5000_fine-tuned_data-slices_model-label_beam_mnli_cond_pairs_tagged.csv']

In [68]:
# read data
import pandas as pd
import os
from ast import literal_eval as make_tuple
from collections import Counter
import matplotlib.pyplot as plt
import glob

# combine separate generations quickly
# extension = 'csv'
# all_filenames = [i for i in glob.glob('../'+ '*.{}'.format(extension))]
# combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])

# df = combined_csv

file_num = 3

df = pd.read_csv('./MNLI/new/' + files[file_num])
df.shape

(2898, 25)

In [69]:
# delete mask-fills where there is no new token
df = df[df['mask-filled'] != df['hypothesis']]

# cut examples that don't change tokens 
# same_token = [(make_tuple(token_pairs)[0] == make_tuple(token_pairs)[1]) for token_pairs in df['token_changes']]
# df['same-token'] = same_token
# df = df[df['same-token'] == False]

# cut the examples that reached threshold
df = df[df['depth'] <= 9]

# cut duplicate mask-filled sentences. only need one prem+hypo+hypo_mask for each pair.
df = df.drop_duplicates(subset='mask-filled', keep="last") 
df.shape

(2898, 25)

In [70]:
# # for each data instance (original prem+hypo pair) gather top-k examples by similarity 
# topk = 5
# print(len(df['line-num'].unique()))
# df = df.groupby('line-num', as_index=False).apply(lambda x: x.nlargest(topk, 'Bert-Score'))
# df.shape

In [71]:
# Generate Labels first
df.loc[(df['label-changed'] == True) & (df['new-label-prob'] >= 0.8), 'flip-class'] = 'Certain-Flip'
df.loc[(df['label-changed'] == True) & (df['new-label-prob'] < 0.8), 'flip-class'] = 'Uncertain-Flip'
df.loc[(df['label-changed'] == False) & (df['new-label-prob'] >= 0.8), 'flip-class'] = 'Certain-Same'
df.loc[(df['label-changed'] == False) & (df['new-label-prob'] < 0.8), 'flip-class'] = 'Uncertain-Same'

# sort by Bert-Score and flip class
df = df.sort_values(by=['flip-class','Bert-Score'], ascending=False)

### Label "interesting" RTE data slices using slice function (SF) from Slice-based Learning/Polyjuice paper

In [79]:
# find "interesting" RTE data slices, add as filter
# use slices defined in paper. simple text matching.

temporal_prepositions = ["after", "before", "past"]
comparative_words = ["more", "less", "better", "worse", "bigger", "smaller"]
quantifiers = ["all", "some", "none"]
negation_words = ["no", "not", "none", "no one", "nobody", "nothing", "neither", "nowhere", "never", "hardly", "scarcely", "barely", "doesnt", "isnt", "wasnt", "shouldnt", "wouldnt", "couldnt", "wont", "cant", "dont"]

# if any slice words are in premise+hypothesis
total_SF = temporal_prepositions + comparative_words + quantifiers + negation_words

df['slice'] = (df['premise'] + df['hypothesis']).apply(lambda x: slice_function(total_SF, x))

NameError: name 'slice_function' is not defined

In [73]:
# outputting
df.to_csv('./MNLI/new/cleaned_' + files[file_num])

In [74]:
# for each Flip Class, get top 200 examples by Bert-Score, then sample 25 each.
sample_df = pd.DataFrame()

sample_df = sample_df.append(df[df['flip-class'] == 'Certain-Flip'].nlargest(200, 'Bert-Score').sample(25))
sample_df = sample_df.append(df[df['flip-class'] == 'Uncertain-Flip'].nlargest(200, 'Bert-Score').sample(25))
sample_df = sample_df.append(df[df['flip-class'] == 'Certain-Same'].nsmallest(200, 'Bert-Score').sample(25))
sample_df = sample_df.append(df[df['flip-class'] == 'Uncertain-Same'].nsmallest(200, 'Bert-Score').sample(25))

sample_df.to_csv('./MNLI/new/sample_' + files[file_num])

## Getting Sentence-Pair Classification Task

Simply counting the direction of label flip.

In [75]:
from ast import literal_eval as make_tuple
from collections import Counter

CS_data = df[df['label-changed'] == True]

# word changes that shift label
# words = [make_tuple(w1) for w1 in CS_data['token_changes']]
# change_count = Counter(words)

# count of masked words
# filled_words = [make_tuple(w1)[1] for w1 in CS_data['token_changes']]
# filled_count = Counter(filled_words)

# count of label switches in contrast set
switches = zip(df['orig-label'], df['new-label'])
switch_changes = Counter(switches)
print(switch_changes)

Counter({('contradiction', 'contradiction'): 903, ('neutral', 'neutral'): 820, ('entailment', 'entailment'): 445, ('contradiction', 'entailment'): 317, ('entailment', 'contradiction'): 124, ('contradiction', 'neutral'): 120, ('neutral', 'contradiction'): 78, ('neutral', 'entailment'): 54, ('entailment', 'neutral'): 37})


In [76]:
Counter(df['new-label'])

Counter({'contradiction': 1105, 'neutral': 977, 'entailment': 816})

In [77]:
Counter(df['orig-label'])

Counter({'contradiction': 1340, 'neutral': 952, 'entailment': 606})

In [78]:
switches = zip(df.loc[df['flip-class'] == 'Certain-Flip', 'orig-label'], df.loc[df['flip-class'] == 'Certain-Flip', 'new-label'])
switch_changes = Counter(switches)
print(switch_changes)

Counter({('contradiction', 'entailment'): 279, ('entailment', 'contradiction'): 109, ('contradiction', 'neutral'): 68, ('neutral', 'contradiction'): 65, ('neutral', 'entailment'): 32, ('entailment', 'neutral'): 19})
