In [2]:
import os,json,random,csv,math,string,ast

## single seed shared by every random number generator used in this notebook
SEED = 2020

## The cache location has to be exported BEFORE `transformers` is imported:
## the library resolves TRANSFORMERS_CACHE once at import time, so setting it
## afterwards does not change where models are stored.
os.environ['TRANSFORMERS_CACHE'] = 'D:\\python_pkg_data\\huggingface\\transformers'
## NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this only
## reaches child processes; export it before launching the kernel if hash
## ordering of the current process matters.
os.environ['PYTHONHASHSEED'] = str(SEED)

import torch,nltk
import numpy as np
from tqdm import tqdm_notebook
from dotted_dict import DottedDict
nltk.data.path.append('D:\\python_pkg_data\\nltk_data')
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from termcolor import colored
from transformers import AutoTokenizer,pipeline

## seed python, numpy and torch (CPU + every GPU) and force deterministic cuDNN kernels
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
def Make_masked_text(token_text,replace_id,tokenizer):
    """Detokenize `token_text` with one position replaced by the mask token.

    Args:
        token_text: list of word tokens.
        replace_id: index of the token to hide.
        tokenizer: object exposing a `mask_token` attribute.

    Returns:
        The detokenized string with `tokenizer.mask_token` at `replace_id`.

    Raises:
        IndexError: if `replace_id` is outside `token_text`.
    """
    ## work on a copy so the caller's token list is never modified
    masked_tokens = list(token_text)
    masked_tokens[replace_id] = tokenizer.mask_token
    return TreebankWordDetokenizer().detokenize(masked_tokens)

def Get_synonyms(mask_text,unmasker,ban_list):
    """Collect the fill-in predictions for `mask_text` that are not banned.

    Args:
        mask_text: text handed to `unmasker` as-is.
        unmasker: callable returning an iterable of dicts that each carry a
            'token_str' entry.
        ban_list: iterable of words that must not be returned; matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        List of whitespace-stripped predictions, in the order produced by
        `unmasker`, that are non-empty and not in `ban_list`.
    """
    ## normalise the ban list once: the prediction is compared in lower case, so
    ## the banned words must be lower-cased too (otherwise 'Even' never matches),
    ## and a set avoids rescanning a long word list for every prediction
    banned = {term.strip().lower() for term in ban_list}

    synonym_list = []
    for item in unmasker(mask_text):
        candidate = item['token_str'].strip()
        ## an empty prediction would silently delete the token it replaces
        if candidate and candidate.lower() not in banned:
            synonym_list.append(candidate)

    return synonym_list

def Generate_fales_rationales(text,unmasker,tokenizer,false_rationale_span,ban_list,num_candidates=5):
    """Build augmented copies of `text` with the false-rationale tokens replaced.

    Each listed token position is masked once, the fill-in predictions are
    cached, and every candidate text draws one replacement per position at
    random from that cache.

    Args:
        text: raw input text; it is split with `word_tokenize`.
        unmasker: callable forwarded to `Get_synonyms`.
        tokenizer: object exposing `mask_token`, forwarded to `Make_masked_text`.
        false_rationale_span: iterable of token indices (into the tokenized
            text) that may be replaced.
        ban_list: list of words that must never be used as a replacement.
        num_candidates: number of augmented texts to produce.

    Returns:
        List of `num_candidates` detokenized strings. A position keeps its
        original token when no replacement is available for it.
    """
    token_text = word_tokenize(text)

    ## ensure the tokens being replaced are not punctuation: a token counts as
    ## punctuation when every character is one. A plain `token in string.punctuation`
    ## is a substring test and lets tokens such as "``", "''" or "..." through.
    punctuation_chars = set(string.punctuation)
    punc_ids = {
        idx for idx,token in enumerate(token_text)
        if all(char in punctuation_chars for char in token)
    }
    ## sorted so replacements are drawn in a fixed left-to-right order
    false_rationale_ids = sorted(set(false_rationale_span)-punc_ids)

    ## replace_id -> predictions, filled the first time a position is visited
    synonyms = {}
    candidates = []

    for _round in tqdm_notebook(range(num_candidates)):
        token_candidate = token_text.copy()

        for replace_id in false_rationale_ids:
            if replace_id not in synonyms:
                mask_text = Make_masked_text(token_text.copy(),replace_id,tokenizer)
                original_token = token_text[replace_id]
                ## ban the original token in both spellings so the model cannot
                ## hand the very same word back as its own "replacement"
                synonyms[replace_id] = Get_synonyms(
                    mask_text,
                    unmasker,
                    list(ban_list)+[original_token,original_token.lower()],
                )

            if synonyms[replace_id]:
                token_candidate[replace_id] = random.choice(synonyms[replace_id])

        candidates.append(TreebankWordDetokenizer().detokenize(token_candidate))

    return candidates




def _highlight_tokens(tokens,positions):
    """Detokenize `tokens`, colouring blue every token whose index is in `positions`."""
    return TreebankWordDetokenizer().detokenize([
        colored(term,'blue') if idx in positions else term
        for idx,term in enumerate(tokens)
    ])


def Visualise_rationales(original,rationale_spans,rationale_pos,visualise_all=False):
    """Render `original` with rationale tokens coloured blue.

    Args:
        original: raw text; it is split with `word_tokenize`.
        rationale_spans: iterable of index collections, one per span; only
            read when `visualise_all` is False.
        rationale_pos: collection of token indices; only read when
            `visualise_all` is True.
        visualise_all: True for a single string with every position in
            `rationale_pos` highlighted, False for one string per span.

    Returns:
        A string when `visualise_all` is True, otherwise a list of strings
        (one per entry of `rationale_spans`).
    """
    ## tokenize once; the token list is the same for every span
    tokens = word_tokenize(original)

    if visualise_all:
        return _highlight_tokens(tokens,rationale_pos)

    return [_highlight_tokens(tokens,span) for span in rationale_spans]
    


In [5]:
## notebook configuration
args = dict(
    ## input files
    ori_train_dir='./datasets/IMDb/orig/train.tsv',
    positive_dir='./datasets/positive-words.txt',
    negative_dir='./datasets/negative-words.txt',
    ## random seed for subsampling training set
    train_random_seed=2019,
    ## number of examples per class for initial training set
    num_per_class=25,
    ## directory for saving models
    save_dir='./SF_results/IMDb_step0_sf_trainer',
    num_per_example=7,
)

## Training data augmented by false rationales

In [32]:
from transformers import BartForConditionalGeneration, BartTokenizer

## import train data: doc id -> {'text', 'label'}
IMDb_data = {}

## newline='' leaves line-ending handling to the csv module, as its documentation requires
with open(args['ori_train_dir'],newline='',errors='ignore') as train_file:
    for row in csv.reader(train_file, delimiter="\t"):
        if len(row)>0:
            ## columns as read here: row[0] label string, row[1] text, row[2] document id
            ## NOTE(review): every label other than 'Negative' maps to 1, a header
            ## row included -- confirm the file carries no header line
            IMDb_data[row[2]] = {'text':row[1],'label':0 if row[0] == 'Negative' else 1}

In [33]:
## path of the tsv file holding the false rationale positions
false_rationales_dir = "{}_{}_{}_{}/false_rationles.tsv".format(
    args['save_dir'],
    args['train_random_seed'],
    args['num_per_class'],
    args['num_per_example'],
)

In [34]:
## doc id -> python literal parsed from the second column
false_rationales = {}
## newline='' leaves line-ending handling to the csv module, as its documentation requires
with open(false_rationales_dir,'r',newline='',errors='ignore') as rationale_file:
    rationale_rows = csv.reader(rationale_file, delimiter='\t')
    ## the first row is skipped (treated as a header)
    next(rationale_rows, None)
    for row in rationale_rows:
        ## ignore blank lines, as the training-data loader does
        if not row:
            continue
        false_rationales[row[0]] = ast.literal_eval(row[1])

## word lists, one entry per line of each file
with open(args['positive_dir'],'r') as lexicon_file:
    positive_terms = lexicon_file.read().splitlines()

with open(args['negative_dir'],'r') as lexicon_file:
    negative_terms = lexicon_file.read().splitlines()

# Visualise false rationale spans

In [35]:
doc_keys = list(false_rationales.keys())

## ids of the documents that could not be rendered
exceptions= []
for doc_id in doc_keys:

    try:
        ## select text
        original = IMDb_data[doc_id]['text']
        false_rationale_span = false_rationales[doc_id]

        ## with visualise_all=True only the flat position list is read, so no
        ## spans are passed (the IPython `_` placeholder depended on whatever
        ## the previous cell happened to output)
        highlighted = Visualise_rationales(original,None,false_rationale_span,visualise_all=True)
        print(f'doc_id:{doc_id}')
        print(highlighted)
        print('*'*100)
    except Exception as error:
        ## best effort: remember the document and say why it was skipped
        exceptions.append(doc_id)
        print(f'doc_id:{doc_id} could not be visualised: {error!r}')

doc_id:8518
If there's one thing you can count on Disney to do, it's their uncanny ability to take a story and tell it again and again and again [34m.[0m [34mEven[0m watching the commercial for Lady and the Tramp II was a horrible experience . Disney's going to ruin one of their most awesome classics ever . [34mIt[0m even had that spaghetti meatball scene . It's been done before! And that's what I say to this sorry direct to video (the entire concept should be banned). Everything is just a rehash of the original movie and even several of Bluth's really bad movies . The penguin and walrus duo (I've even forgotten their names) are just a really poor carbon copy of Timon and Pumbaa . Morgana is another Ursula . She even repeats practically all her old lines . The songs are pathetic, really abysmal . I've never heard songs so bad from them before until now . And the dialogue is atrocious . It's pathetic and simplistic . On the plus side, at least they took the time to make the animat

This move was [34mon[0m [34mTV[0m [34mlast[0m [34mnight[0m . I [34mguess[0m as a time filler, because it sucked bad! The movie is just an excuse to show some tits and ass at the start and somewhere about half way . [34m([0m Not bad tits and ass though) [34m.[0m [34mBut[0m the story is too ridiculous for words . The "wolf", if that is what you can call it, is hardly shown fully save his teeth . When it is fully in view, you can clearly see they had some interns working on the CGI, because the wolf runs like he's running in a treadmill, and the CGI fur looks like it's been waxed, all shiny:) The [34mmovie[0m [34mis[0m full of gore and blood, and you can easily spot who is going to get killed/slashed/eaten next . Even if you like these kind of splatter movies you will be disappointed, they didn't do a good job at it . Don't even get me started on the actors...Very corny lines and the girls scream at everything about every 5 seconds . [34mBut[0m then again, if someone

had to see this [34mcos[0m it looked like a great scary premise- prisoners finding magic book, oo err! claustrophobic terror ensues, etc . but there didn't seem to be a story to go along with the great idea . [34mrather[0m than [34mchilling/physcological[0m [34mhorror[0m [34m,[0m it relied on effects out in the open- fire and OTT body horror-, which didn't scare at all if your over 12 . The logic at the [34mend[0m is ridiculous, with characters being killed off for nothing other than bodycount . waste of good characters- which were the best thing about this film . [34mobviously[0m low budget, which doesn't spoil it, the film really goes nowhere, and- icant believe im going to say this- it [34mneeds[0m a Hollywood remake . you simply loose interest in this version . definitely not in the same league as other french films coming out in the last few years like crimson rivers [34mwhich[0m were at least watchable/entertaining, malefique isn't watchable to the end to be hon

In [36]:
## run on the first GPU when CUDA is available, otherwise fall back to CPU (device=-1)
unmasker_device = 0 if torch.cuda.is_available() else -1
## fill-mask pipeline returning the 15 best predictions per masked position
unmasker = pipeline('fill-mask', model='roberta-base',top_k=15,device=unmasker_device)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [37]:
## worked example: false rationale augmentation by RoBERTa for the first document
key_id = doc_keys[0]
## sentiment words are never allowed as replacements (use an empty list to lift the ban)
ban_list = positive_terms + negative_terms
text = IMDb_data[key_id]['text']
candidates = Generate_fales_rationales(
    text=text,
    unmasker=unmasker,
    tokenizer=tokenizer,
    false_rationale_span=false_rationales[key_id],
    ban_list=ban_list,
    num_candidates=7,
)

print(candidates[:3])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

["If there's one thing you can count on Disney to do, it's their uncanny ability to take a story and tell it again and again and again . Just watching the commercial for Lady and the Tramp II was a horrible experience . Disney's going to ruin one of their most awesome classics ever . Never even had that spaghetti meatball scene . It's been done before! And that's what I say to this sorry direct to video (the entire concept should be banned). Everything is just a rehash of the original movie and even several of Bluth's really bad movies . The penguin and walrus duo (I've even forgotten their names) are just a really poor carbon copy of Timon and Pumbaa . Morgana is another Ursula . She even repeats practically all her old lines . The songs are pathetic, really abysmal . I've never heard songs so bad from them before until now . And the dialogue is atrocious . It's pathetic and simplistic . On the plus side, at least they took the time to make the animation somewhat decent . All of the u

In [38]:
## false rationale augmentation by RoBERTa for every document:
## doc id -> {'candidates': augmented texts, 'label': original label}
augmented_data = {}
for doc_id in tqdm_notebook(doc_keys):
    source_example = IMDb_data[doc_id]
    ori_text, ori_label = source_example['text'], source_example['label']

    false_rationale_span = false_rationales[doc_id]

    candidates = Generate_fales_rationales(
        ori_text,
        unmasker,
        tokenizer,
        false_rationale_span,
        ban_list,
        num_candidates=16,
    )

    augmented_data[doc_id] = dict(candidates=candidates, label=ori_label)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/100 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [6]:
## json file holding the augmented examples of this run configuration
output_dir = "{}_{}_{}_{}/false_rationales_augmented_step1.json".format(
    args['save_dir'],
    args['train_random_seed'],
    args['num_per_class'],
    args['num_per_example'],
)

In [7]:
## Reuse the augmented examples cached on disk. When no cache exists yet, persist
## the freshly generated `augmented_data` instead (delete the file to regenerate).
if os.path.exists(output_dir):
    with open(output_dir, "r") as cache_file:
        augmented_data = json.load(cache_file)
else:
    ## serialise before opening, so a failure never leaves a truncated cache file behind
    serialised = json.dumps(augmented_data)
    with open(output_dir, "w") as cache_file:
        cache_file.write(serialised)