# Claim Generation, SciFact


In [1]:
import os
import subprocess
import transformers
from datasets import load_dataset, load_metric, DatasetDict, load_from_disk, Dataset
from tqdm import tqdm
import json
import pandas as pd
import random
import copy
import hashlib

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# When True, each gold evidence set is replaced by the same number of sentences
# drawn at random from the same abstract; when False, the gold sentences are kept.
randomize_from_fileseed=True

In [4]:
#Path of the local multivers clone (input data is read from here)
base_path_multivers='/homes/bussotti/feverous_work/multivers/'

#Path for the output of the experiments (datasets, run_auto.sh, generated claims)
base_path='/homes/bussotti/feverous_work/feverousdata/'

#Corpus file, relative to base_path_multivers; read line by line as JSON
corpus_path='data_train/target/scifact_10/corpus.jsonl'

# Loading the SciFact data

In [7]:
# Index the corpus by document id: corpus[doc_id] -> {'title': ..., 'abstract': ...}.
corpus=dict()

# The context manager closes the file even when a line fails to parse, and the
# explicit encoding keeps the decoding independent of the machine's locale.
with open(base_path_multivers+corpus_path, encoding='utf-8') as corpusjsonf:
    #Converting the corpus to a dictionary (one JSON document per line)
    for line in corpusjsonf:
        elt=json.loads(line)
        corpus[elt['doc_id']]={'title':elt['title'],'abstract':elt['abstract']}

In [8]:
#Getting the seed claims to be used for training (one JSON claim per line)
train_corp_sci_path='data_train/target/scifact_10_original/claims_train.jsonl'
train_corp_sci=[]
# The context manager closes the file even when a line fails to parse, and the
# explicit encoding keeps the decoding independent of the machine's locale.
with open(base_path_multivers+train_corp_sci_path, encoding='utf-8') as train_corp_sci_f:
    for line in train_corp_sci_f:
        train_corp_sci.append(json.loads(line))

In [9]:
# Keep only the claims that come with at least one evidence entry.
train_corp_sci_filtered=[
    claim for claim in train_corp_sci
    if len(claim['evidence'])>0
]

In [16]:
# Split the training claims between SUPPORT and CONTRADICT and count, for each
# claim, how many evidence sets it has (summed over all of its documents).
train_corp_sci_filtered_pos=[]
train_corp_sci_filtered_neg=[]
nbevidencestats=dict()
for elt in train_corp_sci_filtered:
    # Flatten the per-document evidence lists, keeping the document order.
    evidence_sets=[evinp for ev in elt['evidence'] for evinp in elt['evidence'][ev]]
    nbevhere=len(evidence_sets)

    # '' = no label seen yet, 'error' = the evidence sets disagree on the label.
    labelforelt=''
    for evinp in evidence_sets:
        if labelforelt=='error':
            break
        if labelforelt=='':
            labelforelt=evinp['label']
        elif labelforelt!=evinp['label']:
            print('problem')
            labelforelt='error'

    nbevidencestats[nbevhere]=nbevidencestats.get(nbevhere,0)+1
    elt['nbevhere']=nbevhere

    # Claims with any other label (including 'error') land in neither list.
    if labelforelt=='CONTRADICT':
        train_corp_sci_filtered_neg.append(elt)
    elif labelforelt=='SUPPORT':
        train_corp_sci_filtered_pos.append(elt)

In [17]:
nbevidencestats

{1: 355, 2: 200, 3: 79, 4: 35, 5: 8, 9: 2, 7: 3, 11: 2, 6: 5, 8: 4}

In [18]:
len(train_corp_sci_filtered)

693

In [19]:
len(train_corp_sci_filtered_pos)

456

In [21]:
len(train_corp_sci_filtered_neg)

237

In [20]:
def get_sentence_sci(doc_id,sentence_nb):
    """Return the entry at position ``sentence_nb`` of the abstract of document ``doc_id``."""
    abstract=corpus[doc_id]['abstract']
    return abstract[sentence_nb]

def get_nb_sentences_sci(doc_id):
    """Return the number of entries in the abstract of document ``doc_id``."""
    abstract=corpus[doc_id]['abstract']
    return len(abstract)

def get_title_sci(doc_id):
    """Return the title stored in the corpus for document ``doc_id``."""
    document=corpus[doc_id]
    return document['title']

In [24]:
respos,resneg=[],[]
allevspos,allevsneg=[],[]
res_for_modelpos,res_for_modelneg=[],[]
#We build our evidence set for claim generation, once for the SUPPORT seeds
#and once for the CONTRADICT seeds
for data,res,allevs,res_for_model in (
    (train_corp_sci_filtered_pos,respos,allevspos,res_for_modelpos),
    (train_corp_sci_filtered_neg,resneg,allevsneg,res_for_modelneg),
):
    for elttmp in data:
        # Work on a copy so the seed claim itself is left untouched.
        elt=copy.deepcopy(elttmp)
        evfortxtgen=[]
        titles=[]
        for page,evidence_sets in elt['evidence'].items():
            doc_id=int(page)
            titles.append(get_title_sci(doc_id))
            nbsentencestot=get_nb_sentences_sci(doc_id)
            for evidence_set in evidence_sets:
                #If we have to randomize, we select random evidence from the document,
                #with the same number of sentences as the original evidence set
                if randomize_from_fileseed:
                    evidence_set['sentences']=random.sample(
                        list(range(nbsentencestot)),
                        len(evidence_set['sentences']),
                    )
                #from sentence id to text
                evfortxtgen.extend(
                    get_sentence_sci(doc_id,evtoadd) for evtoadd in evidence_set['sentences']
                )

        # The claim text is a placeholder until the generator output is merged in.
        elt['claim']='TODO'
        elt['title']=' | '.join(titles)
        elt['evfortxtgen']=evfortxtgen
        res_for_model.append({"evidences_txt":evfortxtgen, "claim":'', "title":elt['title']})
        res.append(elt)
        allevs.append(evfortxtgen)
                

In [31]:
# Keep only the first 237 entries of each list. 237 is the size of the
# CONTRADICT split reported above, so this presumably balances the two
# classes - TODO confirm and derive the number from the data instead.
res_for_modelpos=res_for_modelpos[:237]
res_for_modelneg=res_for_modelneg[:237]
respos=respos[:237]
resneg=resneg[:237]

In [34]:
# Turn the model inputs (evidences_txt / claim / title records) into datasets,
# going through a pandas DataFrame.
dspos=Dataset.from_pandas(pd.DataFrame(data=res_for_modelpos))
dsneg=Dataset.from_pandas(pd.DataFrame(data=res_for_modelneg))

In [35]:
# One-row placeholder dataset with empty fields, used below for the
# validation and test splits.
emptyds=Dataset.from_pandas(pd.DataFrame(data=[{"evidences_txt":'', "claim":'', "title":''}]))

In [37]:
# The records to generate claims for go in the "train" split; the validation
# and test splits only hold the one-row placeholder dataset.
placeholder_splits={"validation": emptyds, "test": emptyds}
datasetpos = DatasetDict({"train": dspos, **placeholder_splits})
datasetneg = DatasetDict({"train": dsneg, **placeholder_splits})

# Saved under base_path with the names the test executions pass as 'dsnamelocal'.
datasetpos.save_to_disk(base_path+'dstopredictfromrandomnewpos')
datasetneg.save_to_disk(base_path+'dstopredictfromrandomnewneg')

Saving the dataset (0/1 shards):   0%|          | 0/237 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/237 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

In [38]:
nb_samples_train=237 #Number of samples that make_sh passes to main.py as --predict_samples_from_train

In [39]:
def generate_file_name_from_params(params):
    """Build the readable, unique name of a train/inference run from its parameters.

    Every ``key``/``value`` pair is rendered as ``key_value`` (with '/' turned into
    'Z' and '_' into 'T' inside the value) and the pieces are joined with '_'
    behind a 'ds_' prefix. Empty-string values and the keys 'output',
    'localmodel' and 'outputdir' are left out of the name.
    """
    skipped_keys=('output','localmodel','outputdir')
    name_parts=[]
    for key in params.keys():
        value=params[key]
        if value=='' or key in skipped_keys:
            continue
        name_parts.append(key+'_'+value.replace('/','Z').replace('_','T'))
    return 'ds_'+'_'.join(name_parts)


def generate_hash_file_name_from_params(params):
    """Return the SHA-1 hex digest of the readable run name.

    The readable name from generate_file_name_from_params can be very long,
    so this fixed-length digest of its UTF-8 encoding is used instead.
    """
    readable_name=generate_file_name_from_params(params)
    return hashlib.sha1(readable_name.encode('utf-8')).hexdigest()

def make_sh(params):
    """Build the lines of the bash script that runs main.py for one execution.

    Reads 'localmodel', 'modeltouse', 'dsnamelocal', 'outputdir' and 'step' from
    ``params``; 'pathtxt', 'maxnb' and 'change_subset' are optional and ignored
    when missing or equal to ''. The module-level ``nb_samples_train`` is passed
    as --predict_samples_from_train.

    Returns the script as a list of lines (without newlines). When 'step' is
    neither 'train' nor 'test', 'error 0' is printed and the script is returned
    without its final --do_train/--do_predict line.
    """
    line_continuation=' \\'
    # Models produced by a previous execution live under /data/.
    model_prefix='/data/' if params['localmodel'] else ''
    command_parts=[
        '/home/bussotti/.conda/envs/feverous2/bin/python main.py',
        '--model_name_or_path '+model_prefix+params['modeltouse'],
        '--dataset_name_local '+params['dsnamelocal'],
        '--log_level error',
        '--only_train',
        '--output_dir \'/data/'+params['outputdir']+'\'',
        '--per_device_train_batch_size 16',
        '--per_device_eval_batch_size 8',
        '--gradient_accumulation_steps 1',
        '--max_source_length 512',
        '--min_target_length 30',
        '--max_target_length 64',
        '--generation_max_length 64',
        '--num_train_epochs 20',
        '--learning_rate 1e-4',
        '--save_strategy epoch',
        '--evaluation_strategy epoch',
        '--fp16',
        '--load_best_model_at_end',
        '--predict_with_generate',
        '--overwrite_output_dir',
        '--metric_for_best_model eval_rouge1',
        '--save_total_limit 1',
        '--num_beams 5',
        '--generation_num_beams 2',
        '--group_by_length',
        '--sortish_sampler',
        '--weight_decay 0.01',
        '--label_smoothing_factor 0.1',
        '--include_inputs_for_metrics',
        '--overwrite_cache',
        '--gradient_checkpointing',
        '--remove_unused_columns',
        '--max_eval_samples 200',
        '--predict_samples_from_train '+str(nb_samples_train),
    ]

    # Optional arguments, only emitted when the parameter is present and non-empty.
    if params.get('pathtxt','')!='':
        command_parts.append('--from_path /data/'+params['pathtxt'])
    if params.get('maxnb','')!='':
        command_parts.append('--max_train_samples '+params['maxnb'])
    if params.get('change_subset','')!='':
        command_parts.append('--change_subset')

    # Every argument line ends with a shell line continuation; the step flag closes the command.
    script_lines=['#!/bin/bash']+[part+line_continuation for part in command_parts]

    step_flags={'train':'--do_train','test':'--do_predict'}
    if params['step'] in step_flags:
        script_lines.append(step_flags[params['step']])
    else:
        print('error 0')

    return script_lines
    
    
def prepare_run_sh(sh_txt,folder_used):
    """Write the script lines to ``base_path + 'run_auto.sh'`` and execute the script.

    Parameters:
        sh_txt: iterable of script lines (without newlines), as built by make_sh.
        folder_used: not used by this function; kept so existing calls stay valid.

    Returns:
        (outputs, errors): two lists with one entry per executed command
        (chmod, then the script itself). stderr is redirected into stdout, so
        each entry of ``errors`` is None and the combined output is in ``outputs``.
    """
    # The context manager guarantees the script is flushed and closed before it is run.
    with open(base_path+'run_auto.sh','w') as f:
        for elt in sh_txt:
            f.write(elt+'\n')
    outputs=[]
    errors=[]
    list_commands=[
        ["chmod a+wrx run_auto.sh",base_path],
        ["./run_auto.sh",base_path],

    ]
    for elt in list_commands:
        print('##########')
        print(elt)
        bashCommand = elt[0]
        cwd = elt[1]
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE,stderr=subprocess.STDOUT, cwd=cwd)
        output, error = process.communicate()
        outputs+=[output]
        errors+=[error]

        print(output)
        print(error)
    # Return the collected list: the original returned only the last command's `error`.
    return outputs,errors
    


In [40]:
def pipeline_training_claimgenerator(list_execs):
    """Run, in order, every training/inference execution described in ``list_execs``.

    ``list_execs`` is a list of parameter dicts and is modified in place:
      * an integer 'modeltouse' is replaced by the 'outputdir' of the execution it
        points to (negative values are relative to the current position);
      * an integer 'pathtxt' is replaced, the same way, by that execution's
        'outputdir' + '/predictions.txt';
      * 'outputdir' is set to the hashed run name, or to the readable run name
        when results already exist under that older naming.

    An execution is skipped when its result file already exists under /data/
    ('predictions.txt' for step 'test', 'pytorch_model.bin' for step 'train');
    otherwise the script from make_sh is run through prepare_run_sh.

    Returns:
        (outposname, outnegname, total_time): paths of the predictions of the
        executions tagged 'output': 'pos' / 'neg' ('' when there is none) and the
        sum of the runtimes read from the result files. Returns -1 when a step is
        unknown ('error 1') or an execution did not produce its result file
        ('error 3').
    """
    # Per step: (file proving the step is done, results json, runtime key in that json).
    step_files={
        'test':('predictions.txt','predict_results.json','predict_runtime'),
        'train':('pytorch_model.bin','all_results.json','train_runtime'),
    }
    outputs,errors=[],[]
    outposname,outnegname='',''
    total_time=0
    for nb_exec,params in enumerate(list_execs):
        # Resolve references to other executions (absolute index, or relative when negative).
        if type(params['modeltouse']) is int:
            ref=params['modeltouse']
            if ref<0:
                ref=nb_exec+ref
            params['modeltouse']=list_execs[ref]['outputdir']

        if type(params['pathtxt']) is int:
            ref=params['pathtxt']
            if ref<0:
                ref=nb_exec+ref
            params['pathtxt']=list_execs[ref]['outputdir']+'/predictions.txt'

        params['outputdir']=generate_hash_file_name_from_params(params)
        finalname=params['outputdir']
        oldfinalname=generate_file_name_from_params(params)
        print(oldfinalname + ' hash : '+str(finalname))
        if 'output' in params:
            print('èèèèèè output type:'+params['output'])

        step=params['step']
        if step not in step_files:
            print('error 1')
            return -1
        done_file,results_file,runtime_key=step_files[step]

        if os.path.exists('/data/'+finalname+'/'+done_file):
            ####This execution is already done, ok
            print(finalname+' already done ')
        elif os.path.exists('/data/'+oldfinalname+'/'+done_file):
            ####Already done under the old (non-hashed) folder name
            print(finalname+' already done, old input format '+oldfinalname)
            params['outputdir']=oldfinalname
            finalname=oldfinalname
        else:
            sh_txt=make_sh(params)
            output,error=prepare_run_sh(sh_txt, finalname)
            outputs+=[params,output]
            # Record what this run returned; the original appended `errors` to itself.
            errors+=[params,error]
            if not os.path.exists('/data/'+finalname+'/'+done_file):
                ####problem
                print('error 3')
                return -1

        with open('/data/'+finalname+'/'+results_file) as f:
            obj_tmp=json.load(f)
        total_time+=obj_tmp[runtime_key]

        # Only inference executions can provide the final positive/negative claims.
        if step=='test':
            if params.get('output')=='pos':
                outposname='/data/'+finalname+'/predictions.txt'
            if params.get('output')=='neg':
                outnegname='/data/'+finalname+'/predictions.txt'

    print('list_execs')
    print(list_execs)
    print('outputs')
    print(outputs)
    print('errors')
    print(errors)
    return outposname,outnegname, total_time
    

In [41]:
# Unique subset name for this experiment. It is passed as 'change_subset' to the
# inference executions and therefore enters their (hashed) output folder name,
# so a new value makes the pipeline run inference again instead of reusing
# predictions stored under an identical name.
subsetname='sci_claims_randomevbartneg_cold'

In [42]:
# One entry per experiment configuration. Each entry holds the configuration
# values plus 'listexecs', the ordered train/test executions that
# pipeline_training_claimgenerator runs. Inside an execution, an integer
# 'modeltouse' or 'pathtxt' refers to another execution of the same list
# (absolute index, or relative to the current one when negative).
dico_possibilities=[]


#Warm fever_ft_all True, nb_to_train_on_feverous_all 100
#Cold fever_ft_all False, nb_to_train_on_feverous_all 0
# NOTE(review): the values below are the warm setting, while subsetname, the output
# folder name and the recorded cell output all correspond to the cold setting -
# make sure they are set consistently before running.

for fever_ft_all in [True]: #Cold: set this to False, Warm: set to True
    #True,False
    #Run a first training on Fever
    for nb_to_train_on_feverous_all in['100']: #0 =Cold, Warm set to 100
        # ['0','10','100','']:#''==all
        for positive_model in ['bartlarge']:
            #bartlarge or 'bartlarge-xsum'
            positive_model_real={'bartlarge':'facebook/bart-large','bartlarge-xsum':'facebook/bart-large-xsum'}[positive_model]
            for negative_model in ['bartbase-WikiFactEnglish']:
                #'bartbase-WikiFactEnglish','bartlarge+bartneg','bartlarge'
                negative_model_real={'bartlarge':'facebook/bart-large','bartbase-WikiFactEnglish':'minwhoo/bart-base-negative-claim-generation','bartlarge+bartneg':'minwhoo/bart-base-negative-claim-generation'}[negative_model]
                # Skip the model combinations that are not part of the experiments.
                if negative_model in ['bartlarge','bartlarge+bartneg'] and not positive_model=='bartlarge':
                    continue
                if fever_ft_all==True and positive_model=='bartlarge-xsum':
                    continue
                if negative_model=='bartbase-WikiFactEnglish' and not positive_model=='bartlarge':
                    continue
                tmp=[]
                # Executions 0 and 1: first training on Fever (positive, then negative model).
                if fever_ft_all:
                    tmp+=[
                        {'modeltouse':positive_model_real,'dsnamelocal':'fever_traindev_10k_pos','pathtxt':'','maxnb':'','step':'train', 'localmodel':False},
                        {'modeltouse':negative_model_real,'dsnamelocal':'fever_traindev_10k_neg','pathtxt':'','maxnb':'','step':'train', 'localmodel':False},
                    ]
                # Positive claims: optional training on feverous_pos, then inference.
                if not nb_to_train_on_feverous_all=='0':
                    tmp+=[
                        {'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'feverous_pos','pathtxt':'','maxnb':nb_to_train_on_feverous_all,'step':'train', 'localmodel': fever_ft_all},
                        {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewpos','pathtxt':'','maxnb':'','step':'test','output':'pos','localmodel':True, 'change_subset':subsetname}]
                else:
                    tmp+=[
                        {'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'dstopredictfromrandomnewpos','pathtxt':'','maxnb':'','step':'test','output':'pos','localmodel':fever_ft_all, 'change_subset':subsetname}]
                
                # Negative claims.
                if negative_model=='bartlarge+bartneg':
                    if not nb_to_train_on_feverous_all=='0':
                        tmp+=[{'modeltouse':-2,'dsnamelocal':'raw_dataset_random_neg','pathtxt':'','maxnb':'','step':'test','localmodel':True},
                            {'modeltouse':1 if fever_ft_all else negative_model_real,'pathtxt':'','dsnamelocal':'feverous_neg','maxnb':nb_to_train_on_feverous_all,'step':'train','localmodel':fever_ft_all},
                            {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewneg','maxnb':'','pathtxt':-2,'step':'test','output':'neg','localmodel':True, 'change_subset':subsetname} ]
                    else:
                        # 'pathtxt' used to appear twice in the second dict ('' then -1); only the
                        # last value was kept. It is now written once, at the position of its
                        # first occurrence, so the key order (and the run name derived from it)
                        # does not change.
                        tmp+=[{'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'raw_dataset_random_neg','pathtxt':'','maxnb':'','step':'test','localmodel':fever_ft_all},
                            {'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':-1,'maxnb':'','step':'test','output':'neg','localmodel':fever_ft_all, 'change_subset':subsetname} ]
                    
                else:
                    if not nb_to_train_on_feverous_all=='0':
                        tmp+=[{'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'feverous_neg','pathtxt':'','maxnb':nb_to_train_on_feverous_all,'step':'train','localmodel':fever_ft_all},
                            {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':'','maxnb':'','step':'test','output':'neg','localmodel':True, 'change_subset':subsetname}]
                    else:
                        tmp+=[{'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':'','maxnb':'','step':'test','output':'neg','localmodel':fever_ft_all, 'change_subset':subsetname}]
               
                # Store an independent copy of the execution list with the configuration.
                tmp2=copy.deepcopy(tmp)
                dico_possibilities+=[{'feverftall':fever_ft_all,'nbtotrainonfeverousall':nb_to_train_on_feverous_all,'positivemodel':positive_model,'negativemodel':negative_model,'listexecs':tmp2}]
    


In [50]:

#Output folder name (created under base_path)
testablefolder='jf251023_txt_scifact_randomev_bartneg_cold'

#For each experiment
for nb_exec, elt in enumerate(dico_possibilities):
    print(f'UUUUUUUUUUUUUUUUUUUUUUUUUUUU{nb_exec}')
    print('$$$$$$$$$$$$$$$$$$$')
    print(elt)
    print('$$$$$$$$$$$$$$$$$$$')
    #We run the whole pipeline of executions for this experiment
    # We get the paths to the predictions and the total time
    pipeline_result=pipeline_training_claimgenerator(elt['listexecs'])
    if pipeline_result==-1:
        # The pipeline signals a failure with -1; stop with an explicit message
        # instead of failing on the tuple unpacking below.
        raise RuntimeError('claim generation pipeline failed for experiment '+str(nb_exec))
    outposname,outnegname, total_time=pipeline_result

    print(elt)

    nc=outposname
    ncn=outnegname
    pos_same_seed_neg=False

    # One generated claim per line; the context managers close the prediction files.
    with open(nc) as f_newclaims:
        new_claims_pos=[line.replace('\n','') for line in f_newclaims]
    with open(ncn) as f_newclaims_neg:
        new_claims_neg=[line.replace('\n','') for line in f_newclaims_neg]

    #we create the input file in the scifact format
    outputfolder=base_path+'/'+testablefolder
    os.makedirs(outputfolder, exist_ok=True)

    # Claim number nb of each prediction file goes with record number nb of the
    # matching result list; the records are updated in place.
    for dstom,claimstouse,label in [[respos,new_claims_pos,'SUPPORTS'],[resneg,new_claims_neg,'REFUTES']]:
        for nb,elt_c in enumerate(claimstouse):
            record=dstom[nb]
            # Remember what the record held before its first overwrite. The original
            # test ('claim' in claimstouse) looked inside the list of generated claims
            # rather than at the record; the second condition keeps the first value
            # when several experiments reuse the same records.
            if 'claim' in record and 'original_claim' not in record:
                record['original_claim']=record['claim']
            record['claim']=elt_c
            record['label']=label
    resf=respos+resneg

    if elt['nbtotrainonfeverousall']=='':
        elt['nbtotrainonfeverousall']='all'

    #We create a parameters file containing information on the claims generated
    params_to_write=copy.deepcopy(elt)

    params_to_write['pos_len']=len([x for x in resf if x['label']=='SUPPORTS'])
    params_to_write['total_len']=params_to_write['pos_len'] if pos_same_seed_neg else len(resf)
    params_to_write['neg_len']=len([x for x in resf if x['label']=='REFUTES'])
    params_to_write['randomize_from_fileseed']=randomize_from_fileseed
    # randomize_strategy / randomize_nbevidence are not defined in the visible part of
    # this notebook: record them when they exist, None otherwise, instead of raising
    # a NameError on a fresh kernel. TODO: define them in the configuration cell.
    params_to_write['randomize_strategy']=globals().get('randomize_strategy')
    params_to_write['randomize_nbevidence']=globals().get('randomize_nbevidence')
    params_to_write['outposname']=outposname
    params_to_write['outnegname']=outnegname

    base_fn='ftfever_'+('1' if elt['feverftall'] else '0') + '_feveroustrainnb_'+str(elt['nbtotrainonfeverousall'])+'_'+elt['positivemodel']+'_'+str(params_to_write['pos_len'])+'_pos_'+elt['negativemodel']+'_'+str(params_to_write['neg_len'])
    # An existing parameters file means this experiment was already exported.
    if os.path.exists(outputfolder+'/'+base_fn+'_params.json'):
        print('file already done')
    else:
        params_to_write['total_time']=total_time
        params_to_write['supportnli']='NA'
        params_to_write['refutesnli']='NA'

        with open(outputfolder+'/'+base_fn+'_params.json','w') as f4:
            json.dump(params_to_write,f4)

        # One JSON record per line, positive claims first.
        with open(outputfolder+'/'+base_fn+'.jsonl','w') as f4:
            for eltb in resf:
                f4.write(json.dumps(eltb)+'\n')
    
   
    

UUUUUUUUUUUUUUUUUUUUUUUUUUUU0
$$$$$$$$$$$$$$$$$$$
{'feverftall': False, 'nbtotrainonfeverousall': '0', 'positivemodel': 'bartlarge', 'negativemodel': 'bartbase-WikiFactEnglish', 'listexecs': [{'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewpos', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'pos', 'localmodel': False, 'change_subset': 'sci_claims_randomevbartneg_cold'}, {'modeltouse': 'minwhoo/bart-base-negative-claim-generation', 'dsnamelocal': 'dstopredictfromrandomnewneg', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'neg', 'localmodel': False, 'change_subset': 'sci_claims_randomevbartneg_cold'}]}
$$$$$$$$$$$$$$$$$$$
ds_modeltouse_facebookZbart-large_dsnamelocal_dstopredictfromrandomnewpos_step_test_change_subset_sciTclaimsTrandomevbartnegTcold hash : 0f06f86ef8b8bf257adae5a12d39d9d28ba4a2a2
èèèèèè output type:pos
##########
['chmod a+wrx run_auto.sh', '/homes/bussotti/feverous_work/feverousdata/']
b''
None
##########
['./run_auto.sh', 