# Training an Article Title Generation Model with T5

In [1]:
import os
import subprocess
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from tqdm import tqdm
import json
import transformers
from datasets import load_dataset, load_metric, Dataset, DatasetDict, load_from_disk
import traceback
from database.feverous_db import FeverousDB
from utils.wiki_page import WikiPage
from tqdm import tqdm
import copy
import random
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import transformers
import pandas as pd
import hashlib

In [None]:
#Path for output of experiments
base_path=''
#The path for the database
db_path='/data/jf/feverous_wikiv1.db'

#The file containing human claims
inputjsonlfile=base_path+'feverous_train_challenges_sentencesandtable.jsonl'
#For table + text.

#inputjsonlfile=base_path+'feverous_train_challenges_sentencesonly.jsonl'
#For text only


#To get random evidence (instead of gold one), set to True
randomize_from_fileseed=True
#The randomize strategy, random or similarembedding
randomize_strategy='randomtbtxt'#'randomtxt'#'random_similarembedsentences'
#The number of evidence, gold or a distribution
randomize_nbevidence='gold'#{'sentencesnb':[1,1,2,2,2,3,3,4,5]}#,'cellsnb':[2,3,3,4,4,4,5,5,6,6,7,8]}#'gold'
#When modifying the input data, change the subset in order to regenerate new claims/models and consider them different
change_subset='210923goldevtbtxt'
namecomp=change_subset

#Name for the output folder
testablefolder='210923goldevtbtxt'

###Use the same seed examples than in another set
input_fileseed=False
#The claim file that you want to use to restrain the input seed
file_seed=None#base_path+'jf030723originalcorrected/ftfever_1_feveroustrainnb_100_bartlarge_1500_pos_bartbase-WikiFactEnglish_1500.jsonl'#'jf210623/ftfever_1_feveroustrainnb_100_bartlarge_1500_pos_bartbase-WikiFactEnglish_1500.jsonl'


In [2]:
def replace_text2(text):
    #A lot of evidence in the dataset contain reference in the form [[reference|text]]. We want to give the claim generator only the text part
    #We use regular expression for this
    pattern = r'\[\[.*?\|(.*?)\]\]'
    result = re.sub(pattern, r'\1', text)
    return result

In [43]:
model_name = "t5-large-totto"
#The directory were is located the unzipped t5large
model_dir = "/data/jf/t5-large-totto/"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [44]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model     = model.to(device)

In [8]:

obj=[]
original_params=''

if input_fileseed:
    #We restrain to the fileseed provided
    f=open(file_seed)
    for line in f:
        
        objtmp=json.loads(line)
        if 'original_claim' in objtmp.keys():
            objtmp['claim']=objtmp['original_claim']
        obj+=[objtmp]
    f.close()
    f=open(file_seed.replace('.jsonl','_params.json'))
    original_params=f.readline()
    f.close()
else:        


    f=open(inputjsonlfile)

    for line in f:
        obj+=[json.loads(line)]

    f.close()

print(len(obj))

18756


In [162]:
#To reverse and get new subset

#obj=obj[::-1]
#reversesubset=True


reversesubset=False

In [9]:

# Get the current working directory
cwd = os.getcwd()
os.chdir(base_path+"feverous")

# Print the current working directory
print("Current working directory: {0}".format(cwd))



db=FeverousDB(db_path)


def get_context_nit(claim):
    #Get context from a claim
    ctxt=[]
    title=''
    section=''
    section_nb=-1
    
    for elt in claim['evidence']:
        for elt2 in elt['context']:
            for elt3 in elt['context'][elt2]:
                if 'cell_' in elt3 :
                    continue 
                page_to_request=str(elt3.split('_')[0])
                page_json = db.get_doc_json(page_to_request)
                wiki_page = WikiPage(page_to_request, page_json)

                ev='_'.join(elt3.split('_')[1:])
                res_txt=wiki_page.get_element_by_id(elt3)
                
                wiki_tables = wiki_page.get_tables()
                if not ev=='title':
                    res_txt=str(wiki_page.page_items[ev])
                    if 'section' in ev:
                        section=str(wiki_page.page_items[ev])
                        #print(ev)
                        section_nb=ev.split('_')[1]
                else:
                    title=page_to_request
                    res_txt=page_to_request
                ctxt+=[res_txt]

    return list(set(ctxt)),title,section,section_nb

Current working directory: /homes/bussotti/feverous_work/t5trainingexp


In [10]:
def get_table(page_name, table_nb):
    #Get table given name and page number
    page_json = db.get_doc_json(page_name)
    wiki_page = WikiPage(page_name, page_json)

    wiki_tables = wiki_page.get_tables() #return list of all Wiki Tables

    return [[str(y) for y in x.get_row_cells()] for x in wiki_tables[table_nb].get_rows()]

In [11]:
def get_table_ids(page_name, table_nb):
    #Get ids of all cells
    page_json = db.get_doc_json(page_name)
    wiki_page = WikiPage(page_name, page_json)

    wiki_tables = wiki_page.get_tables() #return list of all Wiki Tables

    return [[y.get_id() for y in x.get_row_cells()] for x in wiki_tables[table_nb].get_rows()]

In [12]:
def get_cells_and_headers(page_name, table_nb):
    #Dictionary with headers of each cells
    table_id=get_table_ids(page_name, table_nb)
    dico_cells={}
    for row_id,row in enumerate(table_id):
        for col_id,col in enumerate(row):
            if not 'header_cell' in col:
                row_h=[page_name+'_'+x for x in row if 'header_cell' in x]
                col_h=[page_name+'_'+x[col_id] for x in table_id if 'header_cell' in x[col_id]]
                dico_cells[page_name+'_'+col]=[page_name+'_title']+row_h+col_h
    return dico_cells
            

In [13]:
def get_sentences(page_name):
    #Get all ids sentences of a page, ids and their context (title)
    page_json = db.get_doc_json(page_name)
    return {page_name+'_'+x:[page_name+'_title'] for x in page_json['order'] if 'sentence_' in x}

In [14]:
def get_sentences_value_list(page_name):
    #Get all values sentences of a page
    page_json = db.get_doc_json(page_name)
    return [replace_text2(page_json[x]) for x in page_json['order'] if 'sentence_' in x]

In [15]:
def get_pos(txt):
    #Get position of a cell in a table given its id
    return [int(x) for x in txt.split('_')[-2:]]

def get_tablenb(txt):
    #Get table number given an id
    return [int(x) for x in txt.split('_')[-3:]][0]

def get_title(txt):
    #Give title of a page given an evidence
    return [x for x in txt.split('_')][0]

In [18]:
def compare_embeddings(inputtb2txt,treesec):
    # Compare the input for table to text generation, to the tree of all sentences to get similar sentences
    
    best_sections_using_embeds=[]
    best_sentences_using_embeds=[]
    inputs1 = tokenizer(inputtb2txt, max_length=max_input_length, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings1 = model.encoder(input_ids=inputs1["input_ids"]).last_hidden_state[:, 0, :]
    for section in treesec.keys():
        if not section=='no_section':
            input2=treesec[section]['value']
        else:
            input2='General information'
        inputs2 = tokenizer(input2, max_length=max_input_length, truncation=True, return_tensors="pt")

        # Generate embeddings
        with torch.no_grad():
            embeddings2 = model.encoder(input_ids=inputs2["input_ids"]).last_hidden_state[:, 0, :]

        # Calculate cosine similarity
        similarity = cosine_similarity(embeddings1, embeddings2)
        cosinesimilsectionembed=similarity.item()
        treesec[section]['cosinesimilsectionembed']=cosinesimilsectionembed
        best_sections_using_embeds+=[[section,cosinesimilsectionembed]]
        for sentencenb,sentence in enumerate(treesec[section]['sentences']):
            sentence_val=sentence['value']
            sentence_id=sentence['id']
            inputs2 = tokenizer(sentence_val, max_length=max_input_length, truncation=True, return_tensors="pt")

            # Generate embeddings
            with torch.no_grad():
                embeddings2 = model.encoder(input_ids=inputs2["input_ids"]).last_hidden_state[:, 0, :]
            similarity = cosine_similarity(embeddings1, embeddings2)
            cosinesimilsentenceembed=similarity.item()
            best_sentences_using_embeds+=[[sentence_id,cosinesimilsentenceembed]]
            treesec[section]['sentences'][sentencenb]['cosinesimilsentenceembed']=cosinesimilsentenceembed
    elt=dict()
    elt['treesecupdated']=treesec
    best_sentences_using_embeds=sorted(best_sentences_using_embeds,key=lambda x: x[1],reverse=True)
    best_sections_using_embeds=sorted(best_sections_using_embeds,key=lambda x: x[1],reverse=True)
    elt['best_sentences_using_embeds']=best_sentences_using_embeds
    elt['best_sections_using_embeds']=best_sections_using_embeds
    return elt


In [19]:
def get_all_sections(page_name):
    #Return all the sections from a page
    page_json = db.get_doc_json(page_name)
    wiki_page = WikiPage(page_name, page_json)
    sentences_ids=[x for x in page_json['order'] if 'section_' in x]
    dico_sentence={x:page_json[x]['value'] for x in sentences_ids}
    return dico_sentence
    
def get_all_sentences(page_name):
    #Return all the sentences from a page
    page_json = db.get_doc_json(page_name)
    wiki_page = WikiPage(page_name, page_json)
    sentences_ids=[x for x in page_json['order'] if 'sentence_' in x]
    dico_sentence={x:replace_text2(page_json[x]) for x in sentences_ids}
    return dico_sentence
    
    
def get_tree_sec_sentences(page_name):
    #Create a tree containing all the sections of a page and their sentences
    page_json = db.get_doc_json(page_name)
    wiki_page = WikiPage(page_name, page_json)
    dicosec=dict()
    currentsection='no_section'
    currentlevel=None
    dicosec[currentsection]={'value':'Out of sections / Introduction / Summary','sentences':[]}
    ###No dealing of level
    for elt in page_json['order']:
        if 'section_' in elt:
            currentsection=elt
            dicosec[currentsection]={'value':replace_text2(page_json[elt]['value']),'sentences':[]}

        if 'sentence_' in elt:
            dicosec[currentsection]['sentences']+=[{'id':elt,'value':replace_text2(page_json[elt])}]
        
    return dicosec
    
def get_section_sentence(sentenceid,treesec):
    if not sentenceid.startswith('sentence_'):
        sentenceid=sentenceid.partition('_')
    for section in treesec.keys():
        for sentence in treesec[section]['sentences']:
            if sentence['id']==sentenceid:
                return section
    return -1
    

{'section_0': 'Early life and career',
 'section_1': 'Education',
 'section_2': 'Family and personal life',
 'section_3': 'Religious views',
 'section_4': 'Law career',
 'section_5': 'Community organizer and Harvard Law School',
 'section_6': 'University of Chicago Law School and civil rights attorney',
 'section_7': 'Legislative career',
 'section_8': 'Illinois State Senator (1997–2004)',
 'section_9': '2004 U.S. Senate campaign',
 'section_10': 'U.S. Senator from Illinois (2005–08)',
 'section_11': 'Legislation',
 'section_12': 'Committees',
 'section_13': 'Presidential campaigns',
 'section_14': '2008',
 'section_15': '2012',
 'section_16': 'President (2009–2017)',
 'section_17': 'First 100 days',
 'section_18': 'Domestic policy',
 'section_19': 'LGBT rights',
 'section_20': 'White House advisory and oversight groups',
 'section_21': 'Economic policy',
 'section_22': 'Environmental policy',
 'section_23': 'Health care reform',
 'section_24': 'Energy policy',
 'section_25': 'Gun cont

In [20]:
#For token computation purposes
model=model.to('cpu')

NameError: name 'model' is not defined

In [181]:
idstopop=[]
cntpos=0
cntneg=0
cnttostop=2000
#We randomize from file seed
if randomize_from_fileseed:
    if randomize_strategy in ['randomtbtxt','randomtxt','random_falcon_section'] :
        
        for idelttmp,elt_tmp in tqdm(enumerate(obj)):
            #for each input claim
            if elt_tmp['label'] not in ['SUPPORTS','REFUTES']:
                #print('WEIRD')
                idstopop+=[idelttmp]
                continue
            if cntpos>cnttostop and elt_tmp['label']=='SUPPORTS':
                idstopop+=[idelttmp]
                continue
            if cntneg>cnttostop and elt_tmp['label']=='REFUTES':
                idstopop+=[idelttmp]
                continue
            
            
            new_evidences=[]
            try:
                #we define the number of evidence to select,  in table, text or both
                if randomize_nbevidence=='gold':
                    nb_to_use_sentences=len([x for x in elt_tmp['evidence'][0]['content'] if '_sentence_' in x])
                    if randomize_strategy in ['randomtbtxt']:
                        nb_to_use_evidencecells=len([x for x in elt_tmp['evidence'][0]['content'] if '_cell_' in x and not 'header_cell_' in x])
                elif type(randomize_nbevidence)==dict:
                    nb_to_use_sentences=random.choice(randomize_nbevidence['sentencesnb'])
                    if randomize_strategy in ['randomtbtxt']:
                        nb_to_use_evidencecells=random.choice(randomize_nbevidence['cellsnb'])
                    if randomize_strategy in ['randomtbtxt']:
                        table_nb=-1
                        title=''
                        for evcell in elt_tmp['evidence'][0]['content']:
                            if len(title)>0 and table_nb>-1:
                                break
                            title=get_title(evcell)
                            if '_cell' in evcell:
                                table_nb=get_tablenb(evcell)
                        dico_all_cells=get_cells_and_headers(title, table_nb)
                    else:
                        dico_all_cells=dict()

                    #We select cells/text randomly using the right number
                    dico_all_sentences=get_sentences(title)
                    dico_all=dico_all_sentences|dico_all_cells
                    nb_to_use_sentences=min(nb_to_use_sentences,len(list(dico_all_sentences.keys())))
                    new_sentences_ids=random.sample(list(dico_all_sentences.keys()),nb_to_use_sentences)
                    if randomize_strategy in ['randomtbtxt']:
                        nb_to_use_evidencecells=min(nb_to_use_evidencecells,len(list(dico_all_cells.keys())))
                        new_cells_ids=random.sample(list(dico_all_cells.keys()),nb_to_use_evidencecells)
                    else:
                        new_cells_ids=[]
                    obj[idelttmp]['evidence'][0]={'content':new_sentences_ids+new_cells_ids,'context':{x:dico_all[x] for x in (new_sentences_ids+new_cells_ids)}}
                    obj[idelttmp]['nb_to_use_sentences']=nb_to_use_sentences
                    last_modified=idelttmp
                    if elt_tmp['label']=='SUPPORTS':
                        cntpos+=1
                    if elt_tmp['label']=='REFUTES':
                        cntneg+=1
            except Exception as e :
                idstopop+=[idelttmp]
                
    elif randomize_strategy in ['random_similarembedsentences']:
        #In case the selection of evidence has to be done using similar embeddings of sentences
        for idelttmp,elt_tmp in tqdm(enumerate(obj)):

            if elt_tmp['label'] not in ['SUPPORTS','REFUTES']:
                idstopop+=[idelttmp]
                continue
            if cntpos>cnttostop and elt_tmp['label']=='SUPPORTS':
                idstopop+=[idelttmp]
                continue
            if cntneg>cnttostop and elt_tmp['label']=='REFUTES':
                idstopop+=[idelttmp]
                continue
            
            
            new_evidences=[]
            try:
                if randomize_nbevidence=='gold':
                    nb_to_use_sentences=len([x for x in elt_tmp['evidence'][0]['content'] if '_sentence_' in x])
                        
                    _,title,section,_=get_context_nit(elt_tmp)
                    dico_all_sentences=get_sentences(title)
                    dico_all=dico_all_sentences
                    
                   
                    
                    #we get the tree of sentences
                    treesec=get_tree_sec_sentences(title)
                    #We select one random sentence that will be our anchor to select other sentences with similar embeddings
                    randsent=random.choice(get_sentences_value_list(title))
                    #we get the best sentences for the chosen sentence
                    resfct=compare_embeddings(randsent,treesec)
                    obj[idelttmp]['treesecupdated']=resfct['treesecupdated']
                    obj[idelttmp]['best_sentences_using_embeds']=resfct['best_sentences_using_embeds']
                    obj[idelttmp]['best_sections_using_embeds']=resfct['best_sections_using_embeds']   
                    
                    new_sentences_ids=[x[0] for x in obj[idelttmp]['best_sentences_using_embeds'][:nb_to_use_sentences]]
                    new_sentences_ids=[title+'_'+x for x in new_sentences_ids]
                    obj[idelttmp]['evidence'][0]={'content':new_sentences_ids,'context':{x:dico_all[x] for x in (new_sentences_ids)}}
                    obj[idelttmp]['nb_to_use_sentences']=nb_to_use_sentences
                    last_modified=idelttmp
                    if elt_tmp['label']=='SUPPORTS':
                        cntpos+=1
                    if elt_tmp['label']=='REFUTES':
                        cntneg+=1
            except Exception as e :
                print('exception')
                print(e)
                idstopop+=[idelttmp]
                
                
            
                
    elif randomize_strategy in ['random_alignsentencestotablelin','random_alignsectiontotablelin']:
        #In this setting, we select sentences close to the claim generated from the selected evidence from the table
        
        for idelttmp,elt_tmp in tqdm(enumerate(obj)):

            if elt_tmp['label'] not in ['SUPPORTS','REFUTES']:
                #print('WEIRD')
                idstopop+=[idelttmp]
                continue
            if cntpos>cnttostop and elt_tmp['label']=='SUPPORTS':
                idstopop+=[idelttmp]
                continue
            if cntneg>cnttostop and elt_tmp['label']=='REFUTES':
                idstopop+=[idelttmp]
                continue
            
            
            new_evidences=[]
            try:
                if randomize_nbevidence=='gold':
                    nb_to_use_sentences=len([x for x in elt_tmp['evidence'][0]['content'] if '_sentence_' in x])
                    nb_to_use_evidencecells=len([x for x in elt_tmp['evidence'][0]['content'] if '_cell_' in x and not 'header_cell_' in x])
                    table_nb=-1
                    title=''
                    for evcell in elt_tmp['evidence'][0]['content']:
                        if len(title)>0 and table_nb>-1:
                            break
                        title=get_title(evcell)
                        if '_cell' in evcell:
                            table_nb=get_tablenb(evcell)
                    dico_all_cells=get_cells_and_headers(title, table_nb)
                    dico_all_sentences=get_sentences(title)
                    dico_all=dico_all_sentences|dico_all_cells
                    new_cells_ids=random.sample(list(dico_all_cells.keys()),nb_to_use_evidencecells)
                    cells_to_keep=[[int(u) for u in x.split('_')[-2:]] for x in new_cells_ids]
                
                    _,title,section,_=get_context_nit(elt_tmp)
                    obj[idelttmp]['original_table']=get_table(title, table_nb)
                    cols_to_keep=list(set([x[1] for x in cells_to_keep]))
                    rows_to_keep=list(set([x[0] for x in cells_to_keep]))
                    allrowsused=[]
                    allcolsused=[]
                    final_txt="<title>"+title+"</title>"+"<section>"+section+"</section>"+"<table>"
                    #We linerize the table for table 2 text t5 generation (and first embeddings comparison)
                    for row_id,row in enumerate(obj[idelttmp]['original_table']):
                        rowtxt=''
                        for col_id,col in enumerate(row):
                            if [row_id,col_id] in cells_to_keep:
                                row_headers=[x.replace('[H]','') for x in row[:col_id] if '[H]' in x][:3]
                                col_headers=[x[col_id].replace('[H]','') for x in obj[idelttmp]['original_table'][:row_id] if len(x)>col_id and '[H]' in x[col_id]][:3]
                                if len(row_headers)>0:
                                    row_headers='<row_header>'+'</row_header><row_header>'.join(row_headers)+'</row_header>'
                                else:
                                    row_headers=''
                                if len(col_headers)>0:
                                    col_headers='<col_header>'+'</col_header><col_header>'.join(col_headers)+'</col_header>'
                                else:
                                    col_headers=''
                                rowtxt+='<cell>'+col+row_headers+col_headers+'</cell>'
                            elif row_id in rows_to_keep and col_id in cols_to_keep:
                                rowtxt+='<cell>'+'</cell>'
                        if not rowtxt=='':
                            final_txt+='<row>'+rowtxt+'</row>'

                    final_txt+='</table>'


                    obj[idelttmp]['input_model_tb2txt']="table2text: "+final_txt
                    
                    
                    
                    
                    treesec=get_tree_sec_sentences(title)
                    resfct=compare_embeddings(obj[idelttmp]['input_model_tb2txt'],treesec)
                    obj[idelttmp]['treesecupdated']=resfct['treesecupdated']
                    obj[idelttmp]['best_sentences_using_embeds']=resfct['best_sentences_using_embeds']
                    obj[idelttmp]['best_sections_using_embeds']=resfct['best_sections_using_embeds']   
                    if randomize_strategy =='random_alignsentencestotablelin':
                        new_sentences_ids=[x[0] for x in obj[idelttmp]['best_sentences_using_embeds'][:nb_to_use_sentences]]
                    
                    new_sentences_ids=[title+'_'+x for x in new_sentences_ids]
                    obj[idelttmp]['evidence'][0]={'content':new_sentences_ids+new_cells_ids,'context':{x:dico_all[x] for x in (new_sentences_ids+new_cells_ids)}}
                    obj[idelttmp]['nb_to_use_sentences']=nb_to_use_sentences
                    last_modified=idelttmp
                    if elt_tmp['label']=='SUPPORTS':
                        cntpos+=1
                    if elt_tmp['label']=='REFUTES':
                        cntneg+=1
            except Exception as e :
                print('exception')
                print(e)
                idstopop+=[idelttmp]


18756it [00:00, 64280.84it/s]


In [21]:
#Intermediate save
if randomize_strategy in ['random_similarembedsentences']:
        f=open(base_path+'_savenew_alignedsentence_randomtype_finished_'+namecomp+'_'+randomize_strategy+('_reversesubset' if reversesubset else '')+'.json','w')
        json.dump(obj,f)
        f.close()

In [186]:
if randomize_strategy in ['random_alignsentencestotablelin','random_alignsectiontotablelin']:
        f=open(base_path+'_savenew_align_randomtype_finished_'+namecomp+'_'+randomize_strategy+('_reversesubset' if reversesubset else '')+'.json','w')
        json.dump(obj,f)
        f.close()

In [189]:
#Popping failed index
for idtopop in idstopop[::-1]:
    obj.pop(idtopop)
    

In [193]:
if randomize_strategy in ['random_alignsentencestotablelin','random_alignsectiontotablelin']:
        f=open(base_path+'_savenew_align_randomtype_finishedclear_'+namecomp+'_'+randomize_strategy+('_reversesubset' if reversesubset else '')+'.json','w')
        json.dump(obj,f)
        f.close()
    

In [195]:
if randomize_strategy in ['random_similarembedsentences']:
        f=open(base_path+'_savenew_alignedsentence_randomtype_finishedclear_'+namecomp+'_'+randomize_strategy+('_reversesubset' if reversesubset else '')+'.json','w')
        json.dump(obj,f)
        f.close()
    

In [113]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model     = model.to(device)

In [196]:
#Convert tables to text

stats={'ok':0,'titlepb':0,'tablenbpb':0,'error':0}

if randomize_strategy in ['random_alignsentencestotablelin','random_alignsectiontotablelin','randomtbtxt','random_falcon_section']:
    res=[]
    lenmaxtor=1500
    current_i=0
    for elt_tmp in tqdm(obj):
        current_i+=1
        if len([x for x in res if x['label']=='SUPPORTS'])>=lenmaxtor and len([x for x in res if x['label']=='REFUTES'])>=lenmaxtor:
            break
        if len([x for x in res if x['label']=='SUPPORTS'])>=lenmaxtor and elt_tmp['label']=='SUPPORTS':
            continue
        if len([x for x in res if x['label']=='REFUTES'])>=lenmaxtor and elt_tmp['label']=='REFUTES':
            continue
        if current_i%500==0:
            f=open(base_path+'_savenew_table2txt_'+str(current_i)+('_randomtype_'+randomize_strategy+'_' if randomize_from_fileseed else '')+('_reversesubset' if reversesubset else '')+'.json','w')
            json.dump(res,f)
            f.close()
        try:
            elt=copy.deepcopy(elt_tmp)
            _,title,section,_=get_context_nit(elt)
            cells_to_keep=[]
            table_nb=-1
            pb=False
            for evcell in elt['evidence'][0]['content']:
                titletmp=get_title(evcell)
                if not(titletmp==title) :
                        print(titletmp)
                        print(title)
                        print('gg')
                        stats['titlepb']+=1
                        pb=True
                        break
                if '_cell' in evcell:
                    cells_to_keep+=[get_pos(evcell)]
                    tablenbtmp=get_tablenb(evcell)
                    if not(tablenbtmp==table_nb) and table_nb >-1:
                        stats['tablenbpb']+=1
                        pb=True
                        break
                    table_nb=tablenbtmp
            if pb:
                continue
            else:
                stats['ok']+=1
            elt['original_table']=get_table(title, table_nb)
            cols_to_keep=list(set([x[1] for x in cells_to_keep]))
            rows_to_keep=list(set([x[0] for x in cells_to_keep]))
            allrowsused=[]
            allcolsused=[]
            final_txt="<title>"+title+"</title>"+"<section>"+section+"</section>"+"<table>"

            for row_id,row in enumerate(elt['original_table']):
                rowtxt=''
                for col_id,col in enumerate(row):
                    if [row_id,col_id] in cells_to_keep:
                        #header in row
                        row_headers=[x.replace('[H]','') for x in row[:col_id] if '[H]' in x][:3]
                        col_headers=[x[col_id].replace('[H]','') for x in elt['original_table'][:row_id] if len(x)>col_id and '[H]' in x[col_id]][:3]
                        if len(row_headers)>0:
                            row_headers='<row_header>'+'</row_header><row_header>'.join(row_headers)+'</row_header>'
                        else:
                            row_headers=''
                        if len(col_headers)>0:
                            col_headers='<col_header>'+'</col_header><col_header>'.join(col_headers)+'</col_header>'
                        else:
                            col_headers=''
                        rowtxt+='<cell>'+col+row_headers+col_headers+'</cell>'
                    elif row_id in rows_to_keep and col_id in cols_to_keep:
                        rowtxt+='<cell>'+'</cell>'
                if not rowtxt=='':
                    final_txt+='<row>'+rowtxt+'</row>'

            final_txt+='</table>'
            elt['input_model_tb2txt']="table2text: "+final_txt
            inputs = tokenizer([elt['input_model_tb2txt']], max_length=max_input_length, truncation=True, return_tensors="pt").to(device)
            output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
            decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
            predicted_claim = nltk.sent_tokenize(decoded_output.strip())[0]
            elt['generatedtxtfromtable']=predicted_claim
            elt['title']=title
            res+=[elt]
        except Exception as e:
            stats['error']+=1
            print('error')
            print(e)
else:
    res=obj


In [197]:

if  randomize_strategy in ['random_alignsentencestotablelin','random_alignsectiontotablelin','randomtbtxt','random_falcon_section']:
    f=open(base_path+'_savenew_table2txt_final_'+namecomp+'_'+('_randomtype_'+randomize_strategy+'_' if randomize_from_fileseed else '')+('_reversesubset' if reversesubset else '')+'.json','w')
    json.dump(res,f)
    f.close()
    

In [29]:
def get_evidence(evidence_id):
    if '_sentence_' in evidence_id:
        try:
            page_to_request=str(evidence_id.split('_')[0])
            page_json = db.get_doc_json(page_to_request)
            wiki_page = WikiPage(page_to_request, page_json)
            sentence_to_request=int(evidence_id.split('_')[-1])
            return str(wiki_page.get_sentences()[sentence_to_request])
        except Exception as e:
            print(e)
            print(traceback.print_exc())
            return 'ERROR'
    elif '_cell_' in evidence_id:
        try:
            evidence_id=evidence_id.replace('header_cell','cell')
            page_to_request=str(evidence_id.split('_')[0])
            table_to_request=int(evidence_id.split('_')[2])
            row_to_request=int(evidence_id.split('_')[3])
            col_to_request=int(evidence_id.split('_')[4])
            page_json = db.get_doc_json(page_to_request)
            wiki_page = WikiPage(page_to_request, page_json)

            wiki_tables = wiki_page.get_tables() #return list of all Wiki Tables

            wiki_table_0 = wiki_tables[table_to_request]
            wiki_table_0_rows = wiki_table_0.get_rows() #return list of WikiRows

            cells_row_0 = wiki_table_0_rows[row_to_request].get_row_cells()#return list with WikiCells for row 0
            return str(cells_row_0[col_to_request])
        except Exception as e:
            print(e)
            print(traceback.print_exc())
            return 'ERROR'
    

In [30]:
### We convert our evidence for the BartLarge dataset format in order to run claim generation

res2pos=[]
res2neg=[]
res_for_modelpos=[]
res_for_modelneg=[]
for elt_tmp_2 in tqdm(res):
    elt_tmp=copy.deepcopy(elt_tmp_2)
    ##Retrieve each evidence_sentence_txt
    ### The list of values for evidence start by the verbalized table if available
    evidence_list=[elt_tmp['generatedtxtfromtable']] if 'generatedtxtfromtable' in elt_tmp.keys() else []
    for elt_ev in elt_tmp['evidence'][0]['content']:
        
        if '_sentence_' in elt_ev:
            ##For each evidence in list we get the values
            evidence_list+=[get_evidence(elt_ev)]
            
    if 'ERROR' in evidence_list:
        continue
    if not 'title' in elt_tmp.keys():
        title= elt_tmp['evidence'][0]['content'][0].split('_')[0]
    else:
        title=elt_tmp['title']
        
    if elt_tmp['label']=='SUPPORTS':
        
        res_for_modelpos+=[{"evidences_txt":evidence_list, "claim":elt_tmp['claim'], "title":title}]
        res2pos+=[elt_tmp]
    elif elt_tmp['label']=='REFUTES':
        
        res_for_modelneg+=[{"evidences_txt":evidence_list, "claim":elt_tmp['claim'], "title":title}]
        res2neg+=[elt_tmp]


100%|██████████| 2998/2998 [00:04<00:00, 609.36it/s]


In [None]:
#The number of examples to generate
nb_ex_to_use=5000

In [31]:
res_for_modelpos=res_for_modelpos[:nb_ex_to_use]
res_for_modelneg=res_for_modelneg[:nb_ex_to_use]
res2pos=res2pos[:nb_ex_to_use]
res2neg=res2neg[:nb_ex_to_use]

In [34]:
#We convert our evidence to the huggingface format to use it for claim generation
dspos=Dataset.from_pandas(pd.DataFrame(data=res_for_modelpos))
dsneg=Dataset.from_pandas(pd.DataFrame(data=res_for_modelneg))

In [35]:
emptyds=Dataset.from_pandas(pd.DataFrame(data=[{"evidences_txt":'', "claim":'', "title":''}]))

In [36]:

datasetpos = DatasetDict({
    "train": dspos,
     "validation": emptyds,
     "test": emptyds,
})
datasetneg = DatasetDict({
    "train": dsneg,
     "validation": emptyds,
     "test": emptyds,
})
datasetpos.save_to_disk(base_path+'dstopredictfromrandomnewpos')
datasetneg.save_to_disk(base_path+'dstopredictfromrandomnewneg')


Saving the dataset (0/1 shards):   0%|          | 0/1499 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1499 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

In [37]:
def generate_file_name_from_params(params):
    #When we finetune our models or run inference on them,we give them a unique name based on parameters
    return 'ds_'+'_'.join([x+'_'+params[x].replace('/','Z').replace('_','T') for x in params.keys() if not(params[x]=='') and not(x in ['output','localmodel','outputdir'])])


def generate_hash_file_name_from_params(params):
    #As this name can be very long we hash it
    hash_object = hashlib.sha1(generate_file_name_from_params(params).encode('utf-8'))
    hex_dig = hash_object.hexdigest()
    return str(hex_dig)
    
    
def make_sh(params):
    #We create a script for running train/inference on our target model
    base_str=[\
        '#!/bin/bash', \
        '/home/bussotti/.conda/envs/feverous2/bin/python main.py \\' , \
        '--model_name_or_path '+ ('/data/' if params['localmodel'] else '')+ params['modeltouse'] +' \\' , \
        '--dataset_name_local '+ params['dsnamelocal'] +' \\' , \
        '--log_level error \\' , \
        '--only_train \\' , \
        '--output_dir \'/data/'+ params['outputdir'] +'\' \\' , \
        '--per_device_train_batch_size 16 \\' , \
        '--per_device_eval_batch_size 8 \\' , \
        '--gradient_accumulation_steps 1 \\' , \
        '--max_source_length 512 \\' , \
        '--min_target_length 30 \\' , \
        '--max_target_length 64 \\' , \
        '--generation_max_length 64 \\' , \
        '--num_train_epochs 20 \\' , \
        '--learning_rate 1e-4 \\' , \
        '--save_strategy epoch \\' , \
        '--evaluation_strategy epoch \\' , \
        '--fp16 \\' , \
        '--load_best_model_at_end \\' , \
        '--predict_with_generate \\' , \
        '--overwrite_output_dir \\' , \
        '--metric_for_best_model eval_rouge1 \\' , \
        '--save_total_limit 1 \\' , \
        '--num_beams 5 \\' , \
        '--generation_num_beams 2 \\' , \
        '--group_by_length \\' , \
        '--sortish_sampler \\' , \
        '--weight_decay 0.01 \\' , \
        '--label_smoothing_factor 0.1 \\' , \
        '--include_inputs_for_metrics \\' , \
        '--overwrite_cache \\' , \
        '--gradient_checkpointing \\' , \
        '--remove_unused_columns \\' , \
        '--max_eval_samples 200 \\' , \
        '--predict_samples_from_train 1499 \\'] ##############MODIFIED PREVIOUSLY 5000
    if 'pathtxt' in params.keys() and not params['pathtxt']=='':
        base_str+=['--from_path /data/'+params['pathtxt']+' \\']#feverous_random_posfromnegexs_feverfeverous100trained/predictions.txt
    if 'maxnb' in params.keys() and not params['maxnb']=='':
        base_str+=['--max_train_samples '+params['maxnb']+' \\']
        
    if 'change_subset' in params.keys() and not params['change_subset']=='':
        base_str+=['--change_subset \\']
    
    if params['step']=='train':
        base_str+=['--do_train']
    elif params['step']=='test':
        base_str+=['--do_predict']
    else:
        print('error 0')
    
    return base_str
    
    
def prepare_run_sh(sh_txt,folder_used):
    #From make_sh, we need to make the script runnable and run it
    f=open(base_path+'run_auto.sh','w')
    for elt in sh_txt:
        f.write(elt+'\n')
    f.close()
    outputs=[]
    errors=[]
    list_commands=[
        ["chmod a+wrx run_auto.sh",base_path],
        ["./run_auto.sh",base_path]
    ]
    for elt in list_commands:
        print('##########')
        print(elt)
        bashCommand = elt[0]
        cwd = elt[1]
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE,stderr=subprocess.STDOUT, cwd=cwd)
        output, error = process.communicate()
        outputs+=[output]
        errors+=[error]
        
        print(output)
        print(error)
    return outputs,error
    


In [38]:
def pipeline_training_claimgenerator(list_execs):
    #Function that run all training/inference for a list_execs object containing details about each train/inference

    outputs,errors=[],[]
    outposname,outnegname='',''
    total_time=0
    for nb_exec in range(len(list_execs)):
        if type(list_execs[nb_exec]['modeltouse'])==int:
            if list_execs[nb_exec]['modeltouse']<0:
                list_execs[nb_exec]['modeltouse']=nb_exec+list_execs[nb_exec]['modeltouse']
            list_execs[nb_exec]['modeltouse']=list_execs[list_execs[nb_exec]['modeltouse']]['outputdir']
            
            
        if type(list_execs[nb_exec]['pathtxt'])==int:
            if list_execs[nb_exec]['pathtxt']<0:
                list_execs[nb_exec]['pathtxt']=nb_exec+list_execs[nb_exec]['pathtxt']
            list_execs[nb_exec]['pathtxt']=list_execs[list_execs[nb_exec]['pathtxt']]['outputdir']+'/predictions.txt'
            
            
        
        list_execs[nb_exec]['outputdir']=generate_hash_file_name_from_params(list_execs[nb_exec])
        finalname=list_execs[nb_exec]['outputdir']
        oldfinalname=generate_file_name_from_params(list_execs[nb_exec])
        print(oldfinalname + ' hash : '+str(finalname))
        if 'output' in  list_execs[nb_exec].keys():
            print('èèèèèè output type:'+list_execs[nb_exec]['output'])
        if list_execs[nb_exec]['step']=='test' :
            if os.path.exists('/data/'+finalname+'/predictions.txt'):
                ####This model is already trained, ok
                print(finalname+' already done ')
            elif os.path.exists('/data/'+oldfinalname+'/predictions.txt'):
                ####This model is already trained, ok
                print(finalname+' already done, old input format '+oldfinalname)
                list_execs[nb_exec]['outputdir']=oldfinalname
                finalname=oldfinalname
            else :
                sh_txt=make_sh(list_execs[nb_exec])
                output,error=prepare_run_sh(sh_txt, finalname)
                outputs+=[list_execs[nb_exec],output]
                errors+=[list_execs[nb_exec],errors]
                if not os.path.exists('/data/'+finalname+'/predictions.txt'):
                    ####problem
                    print('error 3')
                    return -1
            f=open('/data/'+finalname+'/predict_results.json')
            obj_tmp=json.load(f)
            f.close()
            total_time+=obj_tmp['predict_runtime']
            if 'output' in  list_execs[nb_exec].keys() and list_execs[nb_exec]['output']=='pos':
                outposname='/data/'+finalname+'/predictions.txt'
            if 'output' in  list_execs[nb_exec].keys() and list_execs[nb_exec]['output']=='neg':
                outnegname='/data/'+finalname+'/predictions.txt'
            
        elif list_execs[nb_exec]['step']=='train':
            if os.path.exists('/data/'+finalname+'/pytorch_model.bin'):
                ####This model is already trained, ok
                print(finalname+' already done ')
            elif os.path.exists('/data/'+oldfinalname+'/pytorch_model.bin'):
                ####This model is already trained, ok
                print(finalname+' already done, old input format '+oldfinalname)
                list_execs[nb_exec]['outputdir']=oldfinalname
                finalname=oldfinalname
            else:
                
                sh_txt=make_sh(list_execs[nb_exec])
                output,error=prepare_run_sh(sh_txt, finalname )
                outputs+=[list_execs[nb_exec],output]
                errors+=[list_execs[nb_exec],errors]
                if not os.path.exists('/data/'+finalname+'/pytorch_model.bin'):
                    print('error 3')
                    return -1
            f=open('/data/'+finalname+'/all_results.json')
            obj_tmp=json.load(f)
            f.close()
            total_time+=obj_tmp['train_runtime']
            
        else:
            print('error 1')
            return -1
    print('list_execs')
    print(list_execs)
    print('outputs')
    print(outputs)
    print('errors')
    print(errors)
    return outposname,outnegname, total_time
    

In [210]:

dico_possibilities=[]

#Warm fever_ft_all True, nb_to_train_on_feverous_all 100
#Cold fever_ft_all False, nb_to_train_on_feverous_all 0

for fever_ft_all in [True]:#Cold set this to False, Warm set to True
    #True,False
    #Run a First training on  Fever
    for nb_to_train_on_feverous_all in ['100']:#0 =Cold, Warm set to 100
        # ['0','10','100','']:#''==all
        for positive_model in ['bartlarge']:
             #bartlarge or 'bartlarge-xsum'
            positive_model_real={'bartlarge':'facebook/bart-large','bartlarge-xsum':'facebook/bart-large-xsum'}[positive_model]
            for negative_model in ['bartbase-WikiFactEnglish']:
                #'bartbase-WikiFactEnglish','bartlarge+bartneg','bartlarge'
                negative_model_real={'bartlarge':'facebook/bart-large','bartbase-WikiFactEnglish':'minwhoo/bart-base-negative-claim-generation','bartlarge+bartneg':'minwhoo/bart-base-negative-claim-generation'}[negative_model]
                if negative_model in ['bartlarge','bartlarge+bartneg'] and not positive_model=='bartlarge':
                    continue
                if fever_ft_all==True and positive_model=='bartlarge-xsum':
                    continue
                if negative_model=='bartbase-WikiFactEnglish' and not positive_model=='bartlarge':
                    continue
                tmp=[]
                if fever_ft_all:
                    tmp+=[
                        {'modeltouse':positive_model_real,'dsnamelocal':'fever_traindev_10k_pos','pathtxt':'','maxnb':'','step':'train', 'localmodel':False},
                        {'modeltouse':negative_model_real,'dsnamelocal':'fever_traindev_10k_neg','pathtxt':'','maxnb':'','step':'train', 'localmodel':False},
                    ]
                if not nb_to_train_on_feverous_all=='0':
                    tmp+=[
                        {'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'feverous_pos','pathtxt':'','maxnb':nb_to_train_on_feverous_all,'step':'train', 'localmodel': fever_ft_all},
                        {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewpos','pathtxt':'','maxnb':'','step':'test','output':'pos','localmodel':True, 'change_subset':change_subset}]
                else:
                    tmp+=[
                        {'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'dstopredictfromrandomnewpos','pathtxt':'','maxnb':'','step':'test','output':'pos','localmodel':fever_ft_all, 'change_subset':change_subset}]
                
                if negative_model=='bartlarge+bartneg':
                    if not nb_to_train_on_feverous_all=='0':
                        tmp+=[{'modeltouse':-2,'dsnamelocal':'raw_dataset_random_neg','pathtxt':'','maxnb':'','step':'test','localmodel':True},
                            {'modeltouse':1 if fever_ft_all else negative_model_real,'pathtxt':'','dsnamelocal':'feverous_neg','maxnb':nb_to_train_on_feverous_all,'step':'train','localmodel':fever_ft_all},
                            {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewneg','maxnb':'','pathtxt':-2,'step':'test','output':'neg','localmodel':True, 'change_subset':change_subset} ]
                    else:
                        tmp+=[{'modeltouse':0 if fever_ft_all else positive_model_real,'dsnamelocal':'raw_dataset_random_neg','pathtxt':'','maxnb':'','step':'test','localmodel':fever_ft_all},
                            {'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':'','maxnb':'','pathtxt':-1,'step':'test','output':'neg','localmodel':fever_ft_all, 'change_subset':change_subset} ]
                    
                else:
                    if not nb_to_train_on_feverous_all=='0':
                        tmp+=[{'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'feverous_neg','pathtxt':'','maxnb':nb_to_train_on_feverous_all,'step':'train','localmodel':fever_ft_all},
                            {'modeltouse':-1,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':'','maxnb':'','step':'test','output':'neg','localmodel':True, 'change_subset':change_subset}]
                    else:
                        tmp+=[{'modeltouse':1 if fever_ft_all else negative_model_real,'dsnamelocal':'dstopredictfromrandomnewneg','pathtxt':'','maxnb':'','step':'test','output':'neg','localmodel':fever_ft_all, 'change_subset':change_subset}]
               
                tmp2=copy.deepcopy(tmp)
                dico_possibilities+=[{'feverftall':fever_ft_all,'nbtotrainonfeverousall':nb_to_train_on_feverous_all,'positivemodel':positive_model,'negativemodel':negative_model,'listexecs':tmp2}]
    


In [41]:
#We get the original claims
f=open(base_path+'/feverous_train_challenges.jsonl')
obj_o=[]
for line in f:
    obj_o+=[json.loads(line)]

obj_o_prclaim_positive=dict()
obj_o_prclaim_negative=dict()
obj_o_prclaim_global=dict()
for elt in obj_o:
    obj_o_prclaim_global[elt['id']]=elt
    if elt['label']=='SUPPORTS':
        obj_o_prclaim_positive[elt['id']]=elt
    elif elt['label']=='REFUTES':
        obj_o_prclaim_negative[elt['id']]=elt


In [45]:


for nb_exec, elt in enumerate(dico_possibilities):
    print(f'UUUUUUUUUUUUUUUUUUUUUUUUUUUU{nb_exec}')
    print('$$$$$$$$$$$$$$$$$$$')
    print(elt)
    print('$$$$$$$$$$$$$$$$$$$')
    # We get the path to prediction and total_time
    outposname,outnegname, total_time=pipeline_training_claimgenerator(elt['listexecs'])
    # We convert the predictions to the runnable file
    
    ##############
        
    print(elt)
    

    nc=outposname
    ncn=outnegname
    new_claims_pos=[]
    new_claims_neg=[]
    f_newclaims=open(nc)

    f_newclaims_neg=open(ncn)

    pos_same_seed_neg=False

    #we create the input file in the feverous format 
    if not os.path.exists(base_path+'/'+testablefolder):
        os.mkdir(base_path+'/'+testablefolder)
        
    for line in f_newclaims:
        new_claims_pos+=[line.replace('\n','')]

    for line in f_newclaims_neg:
        new_claims_neg+=[line.replace('\n','')]
    for dstom,claimstouse in [[res2pos,new_claims_pos],[res2neg,new_claims_neg]]:
        for nb,elt_c in enumerate(claimstouse):
            if 'claim' in claimstouse:
                dstom[nb]['original_claim']=dstom[nb]['claim']
            dstom[nb]['claim']=elt_c
    resf=res2pos+res2neg

    if elt['nbtotrainonfeverousall']=='':
        elt['nbtotrainonfeverousall']='all'
    #We create a parameters file containing information on the claims generated
    params_to_write=copy.deepcopy(elt)
    

    params_to_write['pos_len']=len([x for x in resf if x['label']=='SUPPORTS'])
    params_to_write['total_len']=params_to_write['pos_len'] if pos_same_seed_neg else len(resf)
    params_to_write['neg_len']=len([x for x in resf if x['label']=='REFUTES'])
    params_to_write['input_fileseed']=input_fileseed
    params_to_write['file_seed']=file_seed
    params_to_write['randomize_from_fileseed']=randomize_from_fileseed
    params_to_write['randomize_strategy']=randomize_strategy
    params_to_write['randomize_nbevidence']=randomize_nbevidence
    params_to_write['original_params']=original_params
    
    params_to_write['outposname']=outposname
    params_to_write['outnegname']=outnegname
    
    base_fn='ftfever_'+('1' if elt['feverftall'] else '0') + '_feveroustrainnb_'+str(elt['nbtotrainonfeverousall'])+'_'+elt['positivemodel']+'_'+str(params_to_write['pos_len'])+'_pos_'+elt['negativemodel']+'_'+str(params_to_write['neg_len'])
    if os.path.exists(base_path+'/'+testablefolder+'/'+base_fn+'_params.json'):
        print('file already done')
    else:
        params_to_write['total_time']=total_time
        [resf,supportnli,refutesnli]=nli_from_obj(resf)
        params_to_write['supportnli']=supportnli
        params_to_write['refutesnli']=refutesnli




        f4=open(base_path+'/'+testablefolder+'/'+base_fn+'_params.json','w')
        json.dump(params_to_write,f4)

        f4.close()


        f4=open(base_path+'/'+testablefolder+'/'+base_fn+'.jsonl','w')
        for eltb in resf:
            f4.write(json.dumps(eltb)+'\n')

        f4.close()
    
   
    

UUUUUUUUUUUUUUUUUUUUUUUUUUUU0
$$$$$$$$$$$$$$$$$$$
{'feverftall': False, 'nbtotrainonfeverousall': '0', 'positivemodel': 'bartlarge', 'negativemodel': 'bartlarge', 'listexecs': [{'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewpos', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'pos', 'localmodel': False, 'change_subset': '210923goldevtbtxt'}, {'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewneg', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'neg', 'localmodel': False, 'change_subset': '210923goldevtbtxt'}]}
$$$$$$$$$$$$$$$$$$$
ds_modeltouse_facebookZbart-large_dsnamelocal_dstopredictfromrandomnewpos_step_test_change_subset_210923goldevtbtxt hash : f48da717620c003472ed00c96f34a6d6a89cd0c8
èèèèèè output type:pos
##########
['chmod a+wrx run_auto.sh', '/homes/bussotti/feverous_work/feverousdata/']
b''
None
##########
['./run_auto.sh', '/homes/bussotti/feverous_work/feverousdata/']
None
ds_modeltouse_facebookZbart-l

None
list_execs
[{'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewpos', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'pos', 'localmodel': False, 'change_subset': '210923goldevtbtxt', 'outputdir': 'f48da717620c003472ed00c96f34a6d6a89cd0c8'}, {'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewneg', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'neg', 'localmodel': False, 'change_subset': '210923goldevtbtxt', 'outputdir': '6f730c7a19aa52c87af07606078aa81dc384c5ad'}]
outputs
errors
[{'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewpos', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'pos', 'localmodel': False, 'change_subset': '210923goldevtbtxt', 'outputdir': 'f48da717620c003472ed00c96f34a6d6a89cd0c8'}, [...], {'modeltouse': 'facebook/bart-large', 'dsnamelocal': 'dstopredictfromrandomnewneg', 'pathtxt': '', 'maxnb': '', 'step': 'test', 'output': 'neg', 'localmodel': False, 'change_

2998it [00:59, 50.03it/s]
