Over the past few days I have repeatedly run into submission errors such as "Notebook Threw Exception", "Notebook Exceeded Allowed Compute" and "Notebook Timeout". I have now modified the inference code for fast and safe submission (one deberta-large model takes only 17 minutes), and it works fine. I hope it helps.

Thanks to the authors of the following great notebooks:

1. https://www.kaggle.com/hengck23/1-birdformer-1-longformer-one-fold
2. https://www.kaggle.com/abhishek/two-longformers-are-better-than-1

In [1]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path

# Patch the installed `transformers` package with the fast DeBERTa v2/v3
# tokenizer sources shipped in the attached input dataset.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# Step 1: swap in the replacement slow->fast conversion module.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Step 2: swap in the DeBERTa-v2 tokenizer modules. A source file whose name
# starts with "deberta" is installed with that prefix stripped, so
# "deberta__init__.py" lands as "__init__.py".
deberta_v2_path = transformers_path / "models" / "deberta_v2"
for filename in (
    "tokenization_deberta_v2.py",
    "tokenization_deberta_v2_fast.py",
    "deberta__init__.py",
):
    target_name = filename.replace("deberta", "") if filename.startswith("deberta") else filename
    filepath = deberta_v2_path / target_name
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)

In [2]:
import sys
sys.path.append("../input/tez-lib/")

import gc
import numpy as np
import glob
import pandas as pd
from timeit import default_timer as timer
from joblib import Parallel, delayed

import torch
from torch.nn.parallel.data_parallel import data_parallel

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *
from transformers import AutoConfig, AutoModel, AutoTokenizer
import torch.nn as nn
import psutil
import torch.cuda.amp as amp
import os
import tez



# Global run-time switches read by the inference code below.
is_amp   = True   # passed to amp.autocast(enabled=...) around the forward pass
is_cuda  = True   # when true, the network and the input tensors are moved to the GPU
is_debug = False # False; when true, essays come from the train folder and an F1 report is printed

# Per-essay token budget: used by FeedbackDataset for truncation and as the
# fixed length every probability array is padded/cropped to.
max_length = 1536 # 1536
submit_dir = ''   # not referenced anywhere in the visible code


#helper
def time_to_str(t, mode='min'):
    """Format an elapsed time given in seconds as a short fixed-width string.

    mode='min' renders "<hours> hr <minutes> min"; mode='sec' renders
    "<minutes> min <seconds> sec". The input is truncated to whole seconds
    first. Any other mode raises NotImplementedError.
    """
    if mode not in ('min', 'sec'):
        raise NotImplementedError

    if mode == 'min':
        # whole seconds -> (fractional) minutes, then split into hours/minutes
        hours, minutes = divmod(int(t) / 60, 60)
        return '%2d hr %02d min' % (hours, minutes)

    minutes, seconds = divmod(int(t), 60)
    return '%2d min %02d sec' % (minutes, seconds)

In [3]:
#config 

# Tag -> class-index table. Each discourse type has a "B-" (begin) tag at an
# even index immediately followed by its "I-" (inside) tag; the span decoder
# relies on that "B index + 1 == I index" layout.
discourse_marker_to_label = {
    "B-Lead": 0,
    "I-Lead": 1,
    "B-Position": 2,
    "I-Position": 3,
    "B-Evidence": 4,
    "I-Evidence": 5,
    "B-Claim": 6,
    "I-Claim": 7,
    "B-Concluding Statement": 8,
    "I-Concluding Statement": 9,
    "B-Counterclaim": 10,
    "I-Counterclaim": 11,
    "B-Rebuttal": 12,
    "I-Rebuttal": 13,
    "O": 14,
    "PAD": -100,
}
# Reverse lookup: class index -> tag string.
label_to_discourse_marker = {v: k for k, v in discourse_marker_to_label.items()}
# Width of the probability vectors (every entry above except "PAD").
num_discourse_marker = 15 #len(label_to_discourse_marker)-1 #15

# Minimum number of words a predicted span of each class must have;
# do_threshold() drops spans with fewer words.
length_threshold = {
    'Lead'                : 5, # 5
    'Position'            : 5,
    'Claim'               : 3,
    'Counterclaim'        : 6,
    'Rebuttal'            : 3, # 4
    'Evidence'            : 14,
    'Concluding Statement': 5, # 5
}
# Minimum mean per-word score a predicted span of each class must reach;
# do_threshold() drops spans scoring below it.
probability_threshold = {
    'Lead'                : 0.687,
    'Position'            : 0.537,
    'Claim'               : 0.537,
    'Counterclaim'        : 0.535,
    'Rebuttal'            : 0.537,
    'Evidence'            : 0.637,
    'Concluding Statement': 0.687,
}


# Choose which essays to run on: a slice of the labelled training data in
# debug mode, otherwise every .txt file in the competition test folder.
if is_debug:
    text_dir = '../input/feedback-prize-2021/train'
    df = pd.read_csv('../input/feedbackfolds/train_folds.csv')
    valid_df = df[:10000]  # first 10000 annotation rows; also used later as ground truth
    # valid_df = df[df['kfold'] == 0].reset_index(drop=True)
    valid_id = valid_df['id'].unique()

else:
    text_dir = '../input/feedback-prize-2021/test'
    # essay id = file name without the 4-character ".txt" suffix
    valid_id = [ f.split('/')[-1][:-4] for f in glob.glob(text_dir+'/*.txt') ]
    valid_id = sorted(valid_id)
num_valid = len(valid_id)
print('len(valid_id)',len(valid_id))

# Read every essay, normalise non-breaking spaces and strip outer whitespace.
# The same normalisation is repeated in _prepare_test_data_helper so that
# character offsets line up.
df_text=[]
for id in valid_id:
    text_file = text_dir +'/%s.txt'%id
    with open(text_file, 'r') as f:
        text = f.read()

    text = text.replace(u'\xa0', u' ')
    text = text.rstrip()
    text = text.lstrip()
    df_text.append((id,text))
df_text = pd.DataFrame(df_text, columns=['id','text'])
# Order essays from shortest to longest (by character count); the helper
# column is removed again once the sort is done.
df_text['text_len'] = df_text['text'].apply(lambda x: len(x))
df_text = df_text.sort_values('text_len').reset_index(drop=True)
del df_text['text_len']

print('df_text.shape',df_text.shape)
print(df_text)

len(valid_id) 5
df_text.shape (5, 2)
             id                                               text
0  D46BCB48440A  When people ask for advice,they sometimes talk...
1  D72CB1C11673  Making choices in life can be very difficult. ...
2  DF920E0A7337  Have you ever asked more than one person for h...
3  0FB0700DAF44  During a group project, have you ever asked a ...
4  18409261F5C2  80% of Americans believe seeking multiple opin...


In [4]:
def _prepare_test_data_helper(tokenizer, ids):
    """Read and tokenize the essays named by `ids`.

    Each essay is loaded from `text_dir`, has non-breaking spaces replaced by
    plain spaces and outer whitespace stripped, and is then encoded without
    special tokens. Returns one dict per essay with the keys "id",
    "input_ids", "text" and "offset_mapping", in the order of `ids`.
    """
    test_samples = []
    for idx in ids:
        with open(text_dir + '/%s.txt' % idx, 'r') as handle:
            text = handle.read()
        text = text.replace(u'\xa0', u' ').rstrip().lstrip()

        # offsets are requested so token predictions can be mapped back to characters
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        test_samples.append(
            {
                "id": idx,
                "input_ids": encoding["input_ids"],
                "text": text,
                "offset_mapping": encoding["offset_mapping"],
            }
        )
    return test_samples


def prepare_test_data(df, tokenizer):
    """Tokenize every essay id in `df["id"]` using four worker processes.

    The unique ids are split into four chunks, each chunk is handled by
    `_prepare_test_data_helper` in its own process, and the per-chunk sample
    lists are concatenated in chunk order.
    """
    id_chunks = np.array_split(df["id"].unique(), 4)
    tokenize_chunk = delayed(_prepare_test_data_helper)
    per_chunk = Parallel(n_jobs=4, backend="multiprocessing")(
        tokenize_chunk(tokenizer, chunk) for chunk in id_chunks
    )
    return [sample for chunk_samples in per_chunk for sample in chunk_samples]


class FeedbackDataset:
    """Index-addressable view over tokenized essays for inference.

    Every item is framed with the tokenizer's CLS and SEP ids and truncated
    so that the framed sequence holds at most `max_len` tokens.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples, self.max_len, self.tokenizer = samples, max_len, tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # CLS + tokens, cut to max_len - 1 so the closing SEP still fits
        framed = ([self.tokenizer.cls_token_id] + sample["input_ids"])[: self.max_len - 1]
        framed.append(self.tokenizer.sep_token_id)

        return {
            "id": sample["id"],
            'text': sample["text"],
            "token_id": framed,
            "token_mask": [1] * len(framed),
            # offsets travel as a string; the consumer parses them back
            "token_offset": str(sample["offset_mapping"]),
        }

class Collate:
    """Batch collator that pads token ids and masks to the longest sample.

    The padding side and the pad token id are taken from the tokenizer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # longest sequence in this batch decides the padded width
        width = max(len(sample["token_id"]) for sample in batch)
        pad_on_right = self.tokenizer.padding_side == "right"

        def padded(seq, fill):
            filler = (width - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        token_rows = [padded(sample["token_id"], self.tokenizer.pad_token_id) for sample in batch]
        mask_rows = [padded(sample["token_mask"], 0) for sample in batch]

        return {
            "id": [sample["id"] for sample in batch],
            "token_offset": [sample["token_offset"] for sample in batch],
            "text": [sample["text"] for sample in batch],
            "token_id": torch.tensor(token_rows, dtype=torch.long),
            "token_mask": torch.tensor(mask_rows, dtype=torch.long),
        }

In [5]:
class FeedbackModel(tez.Model):
    """Token classifier: a transformer backbone followed by one linear layer.

    The backbone is instantiated from the configuration found at
    `model_name` (via `AutoModel.from_config`); `forward` returns per-token
    class probabilities rather than raw logits.
    """

    def __init__(self, model_name, num_labels=num_discourse_marker):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        config = AutoConfig.from_pretrained(model_name)
        config.update(
            dict(
                output_hidden_states=True,
                hidden_dropout_prob=0.1,
                layer_norm_eps=1e-7,
                add_pooling_layer=False,
            )
        )
        self.transformer = AutoModel.from_config(config)
        self.output = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probabilities = torch.softmax(self.output(hidden_states), dim=-1)
        # second and third slots are fixed placeholders (0 and an empty dict);
        # presumably the loss/metrics positions tez expects — TODO confirm
        return probabilities, 0, {}


# Weight files to ensemble, grouped by architecture: group k is loaded into
# the model built from net_type[k].
checkpoint =[   
    [
     '../input/feedbackdeberta/model/model_0.bin',
     '../input/feedbackdeberta/model/model_1.bin',
     '../input/feedbackdeberta/model/model_2.bin',
     '../input/feedbackdeberta/model/model_3.bin',
     '../input/feedbackdeberta/model/model_4.bin',
    ],
    [
     '../input/mybirdsmodel/dfe1.bin',
     '../input/mybirdsmodel/dfe5.bin',
    ],
    [
    '../input/mybirdsmodel/dx0.bin',
    '../input/mybirdsmodel/dx1.bin',
    ]
]

# One [model class, config/tokenizer directory] pair per checkpoint group.
net_type = [
    
    [FeedbackModel,   '../input/debertalarge/' ],
    [FeedbackModel,'../input/debertalarge/'],
    [FeedbackModel,   '../input/deberta-xlarge/' ],
]

# Total number of checkpoints, and cumulative counts after the first and
# second group; run_submit() uses these to find each group's first result.
num_net = sum([len(i) for i in checkpoint])
num_net1 = sum([len(i) for i in checkpoint[:1]])
num_net2 = sum([len(i) for i in checkpoint[:2]])

In [6]:
#processing

def text_to_word(text):
    """Split `text` on whitespace and locate every word in the original string.

    Returns:
        (word, word_offset): `word` is `text.split()`; `word_offset` holds one
        `(start, end)` character range per word such that
        `text[start:end] == word[i]`.

    Raises:
        NotImplementedError: if a word cannot be found at or after the end of
        the previous one (not expected for words produced by `str.split`).
    """
    word = text.split()
    word_offset = []

    cursor = 0  # search resumes right after the previously located word
    for w in word:
        # search in place from `cursor` instead of slicing `text[cursor:]`,
        # which copied the remainder of the essay for every single word
        start = text.find(w, cursor)
        if start == -1:
            raise NotImplementedError
        end = start + len(w)
        word_offset.append((start, end))
        cursor = end

    return word, word_offset

def word_probability_to_predict_df(text_to_word_probability, id):
    """Decode per-word class probabilities into a table of discourse spans.

    Args:
        text_to_word_probability: 2-D array with one row per word and one
            column per class index of `discourse_marker_to_label`.
        id: essay id copied into every output row.

    Returns:
        DataFrame with columns ['id', 'class', 'predictionstring', 'score'].
        'predictionstring' is the space-separated word indices of the span and
        'score' is the string form of the per-word maximum probabilities of
        the span as originally decoded (before any clipping below).

    Essays with fewer than two words yield an empty frame; the scan below
    needs at least two words and previously raised IndexError on them.
    """
    columns = ['id', 'class', 'predictionstring', 'score']
    len_word = len(text_to_word_probability)
    if len_word < 2:
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score   = text_to_word_probability.max(-1)
    skip_labels = (
        discourse_marker_to_label['O'],
        discourse_marker_to_label['PAD'],
    )
    # The final word index only acts as a stop marker: spans end before it,
    # and a span whose second word would be the final word is not recorded.
    last = len_word - 1
    predict_df = []

    t = 0
    while True:
        if word_predict[t] in skip_labels:
            t += 1
            if t == last:
                break
            continue

        # a span opens at the first word that is neither 'O' nor 'PAD'
        start = t
        b_marker_label = word_predict[t]
        t += 1
        if t == last:
            break

        marker = label_to_discourse_marker[b_marker_label]
        if marker[0] == 'B':
            i_marker_label = b_marker_label + 1  # matching "I-" tag sits right after "B-"
        elif marker[0] == 'I':
            i_marker_label = b_marker_label
        else:
            raise NotImplementedError

        # extend the span while the following words carry the "I-" tag
        while word_predict[t] == i_marker_label and t != last:
            t += 1
        end = t
        predict_df.append(
            (id, start, end, marker[2:], str(word_score[start:end].tolist()))
        )
        if t == last:
            break

    # Position/length heuristics applied per decoded span.
    temp_df = []
    for _, word_start, word_end, label, str_scores in predict_df:
        if label == 'Lead':
            # keep only leads starting in the first 20% and clip them at 28%
            word_end = min(word_end, int(len_word * 0.28))
            if word_start > int(len_word * 0.20):
                continue
        elif label == 'Concluding Statement':
            # keep only conclusions reaching the last 10%; start no earlier than 67%
            word_start = max(word_start, int(len_word * 0.67))
            if word_end < int(len_word * 0.90):
                continue
        if word_end < word_start:
            continue
        if label == 'Rebuttal' and len(temp_df) < 1:
            # a rebuttal is dropped when no span has been kept before it
            continue
        # per-class caps on span length, in words
        if label == 'Rebuttal':
            word_end = min(word_end, word_start + 45)
        if label == 'Counterclaim':
            word_end = min(word_end, word_start + 45)
        if label == 'Position':
            word_end = min(word_end, word_start + 40)
        if label == 'Claim':
            word_end = min(word_end, word_start + 31)
        ps = " ".join([str(x) for x in range(word_start, word_end)])
        temp_df.append((id, label, ps, str_scores))
    return pd.DataFrame(temp_df, columns=columns)

def do_threshold(submit_df, use=('length', 'probability')):
    """Drop predicted spans that are too short or too low-scoring.

    Args:
        submit_df: frame with at least 'id', 'class', 'predictionstring' and
            (for the probability filter) 'score', where 'score' holds the
            string form of a list of per-word scores.
        use: which filters to apply; any container supporting `in` with the
            entries 'length' and/or 'probability'.

    Returns:
        A new frame with columns ['id', 'class', 'predictionstring'];
        `submit_df` itself is not modified.

    Per-class limits come from the module-level `length_threshold` and
    `probability_threshold` tables.
    """
    import ast  # local import: only needed to parse the score strings

    df = submit_df.copy()
    df = df.fillna('')

    if 'length' in use:
        df['l'] = df.predictionstring.apply(lambda x: len(x.split()))
        for key, value in length_threshold.items():
            too_short = df.index[(df['class'] == key) & (df['l'] < value)]
            df.drop(too_short, inplace=True)

    if 'probability' in use:
        # literal_eval parses the list literal without executing arbitrary code
        df['s'] = df.score.apply(lambda x: np.mean(ast.literal_eval(x)))
        for key, value in probability_threshold.items():
            too_weak = df.index[(df['class'] == key) & (df['s'] < value)]
            df.drop(too_weak, inplace=True)

    df = df[['id', 'class', 'predictionstring']]
    return df

#evaluation for debug ----
# https://www.kaggle.com/cpmpml/faster-metric-computation

def compute_overlap(predict, truth):
    """Return how much a predicted word set and a ground-truth word set overlap.

    Args:
        predict: set of predicted word indices (or NaN when missing).
        truth: set of ground-truth word indices (or NaN when missing).

    Returns:
        (overlap1, overlap2): the intersection size divided by the size of
        `truth` and of `predict` respectively. (0, 0) is returned when either
        input is not a set (e.g. NaN from an outer join) or is empty.
    """
    try:
        intersect = len(truth & predict)
        return (intersect / len(truth), intersect / len(predict))
    except (TypeError, ZeroDivisionError):
        # TypeError: at least one input is NaN rather than a set;
        # ZeroDivisionError: at least one input is an empty set
        return (0, 0)

def compute_f1_score_one(predict_df, truth_df, discourse_type):
    """
    Compute the competition F1 score for a single discourse type.

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    predict_df is filtered on its 'class' column and truth_df on its
    'discourse_type' column; both must carry 'id' and 'predictionstring'.
    Returns TP / (TP + 0.5 * (FP + FN)) as a float.
    """
    t_df = truth_df.loc[truth_df['discourse_type'] == discourse_type,   ['id', 'predictionstring']].reset_index(drop=True)
    p_df = predict_df.loc[predict_df['class'] == discourse_type,  ['id', 'predictionstring']].reset_index(drop=True)

    # Row numbers double as identifiers, and each predictionstring becomes a
    # set of word-index strings so overlaps reduce to set intersections.
    p_df.loc[:,'predict_id'] = p_df.index
    t_df.loc[:,'truth_id'] = t_df.index
    p_df.loc[:,'predictionstring'] = [set(p.split(' ')) for p in p_df['predictionstring']]
    t_df.loc[:,'predictionstring'] = [set(p.split(' ')) for p in t_df['predictionstring']]

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = p_df.merge(t_df,
                           left_on='id',
                           right_on='id',
                           how='outer',
                           suffixes=('_p','_t')
                          )
    overlap = [compute_overlap(*predictionstring) for predictionstring in zip(joined.predictionstring_p, joined.predictionstring_t)]

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['potential_TP'] = [(o[0] >= 0.5 and o[1] >= 0.5) for o in overlap]
    joined['max_overlap' ] = [max(*o) for o in overlap]
    joined_tp = joined.query('potential_TP').reset_index(drop=True)
    # best-overlapping prediction per (essay, ground-truth span)
    tp_pred_ids = joined_tp\
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','truth_id'])['predict_id'].first()

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # NOTE(review): with how='outer', essays present on only one side yield
    # NaN predict_id / truth_id values, which look like they end up counted in
    # the two set differences below — confirm this is intended.
    fp_pred_ids = set(joined['predict_id'].unique()) - set(tp_pred_ids)

    matched_gt_ids   = joined_tp['truth_id'].unique()
    unmatched_gt_ids = set(joined['truth_id'].unique()) -  set(matched_gt_ids)

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    f1 = TP / (TP + 0.5*(FP+FN))
    return f1

def compute_lb_f1_score(predict_df, truth_df):
    """Score every discourse type present in `truth_df`.

    Returns a dict mapping each value of `truth_df.discourse_type` to the F1
    computed by `compute_f1_score_one`; averaging is left to the caller.
    """
    return {
        discourse_type: compute_f1_score_one(predict_df, truth_df, discourse_type)
        for discourse_type in truth_df.discourse_type.unique()
    }

In [7]:
def jn(pst, start, end):
    """Join `pst[start:end]` into a space-separated string, skipping -1 separators."""
    return " ".join(str(value) for value in pst[start:end] if value != -1)


def link_evidence(oof, thresh=1, thresh2=26):
    """Merge neighbouring 'Evidence' spans of the same essay.

    Args:
        oof: frame with 'id', 'class' and 'predictionstring' columns; the
            Evidence rows of an essay are expected in reading order.
        thresh: two consecutive spans are merged only when the next span
            starts at most `thresh` words after the last word of the previous
            one.
        thresh2: a merge is refused when the next span starts more than
            `thresh2` words after the first word of the run built so far.

    Returns:
        A frame with the linked Evidence rows outer-merged with all
        non-Evidence rows of `oof`; the row order is whatever the merge
        produces.

    Works for any number of essays, including one or none (the previous
    implementation indexed the second unique id and raised IndexError when
    fewer than two were present). Evidence rows with an empty
    predictionstring are ignored.
    """
    is_evidence = oof['class'] == "Evidence"
    eoof = oof[is_evidence]
    neoof = oof[~is_evidence]

    retval = []
    # sort=False keeps essays in order of first appearance
    for idv, rows in eoof.groupby('id', sort=False):
        # flatten all spans of the essay into one list, each span preceded
        # by a -1 separator
        pst = []
        for prediction in rows['predictionstring']:
            indices = [int(x) for x in prediction.split()]
            if indices:
                pst += [-1] + indices
        if not pst:
            continue

        start = 1  # index in `pst` of the first word of the current run
        end = 1
        for i in range(2, len(pst)):
            end = i
            if pst[i] != -1:
                continue
            next_first = pst[i + 1]
            gap_too_big = next_first > pst[i - 1] + thresh
            run_too_long = next_first - pst[start] > thresh2
            if gap_too_big or run_too_long:
                # close the current run; separators swallowed by earlier
                # merges are filtered out while joining
                retval.append(
                    (idv, "Evidence", " ".join(str(x) for x in pst[start:i] if x != -1))
                )
                start = i + 1
        retval.append(
            (idv, "Evidence", " ".join(str(x) for x in pst[start:end + 1] if x != -1))
        )

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(neoof, how='outer')
    return roof
    

def link_others(oof, c):
    """Merge directly adjacent spans of class `c` within each essay.

    Args:
        oof: frame with 'id', 'class' and 'predictionstring' columns; the rows
            of class `c` of an essay are expected in reading order.
        c: discourse class whose spans should be linked.

    Returns:
        A frame with the linked `c` rows outer-merged with all other rows of
        `oof`; the row order is whatever the merge produces.

    Two consecutive spans are merged when the second one starts on the word
    right after the last word of the first. Works for any number of essays,
    including one or none (the previous implementation indexed the second
    unique id and raised IndexError when fewer than two were present). Rows
    with an empty predictionstring are ignored.
    """
    is_target = oof['class'] == c
    eoof = oof[is_target]
    neoof = oof[~is_target]

    retval = []
    # sort=False keeps essays in order of first appearance
    for idv, rows in eoof.groupby('id', sort=False):
        # flatten all spans of the essay into one list, each span preceded
        # by a -1 separator
        pst = []
        for prediction in rows['predictionstring']:
            indices = [int(x) for x in prediction.split()]
            if indices:
                pst += [-1] + indices
        if not pst:
            continue

        start = 1  # index in `pst` of the first word of the current run
        end = 1
        for i in range(2, len(pst)):
            end = i
            # split only at a separator followed by a non-adjacent word
            if pst[i] == -1 and pst[i + 1] > pst[i - 1] + 1:
                retval.append(
                    (idv, c, " ".join(str(x) for x in pst[start:i] if x != -1))
                )
                start = i + 1
        retval.append(
            (idv, c, " ".join(str(x) for x in pst[start:end + 1] if x != -1))
        )

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(neoof, how='outer')
    return roof


def is_in2(s, text):
    """Return True when the string `s` occurs anywhere inside `text`.

    The previous hand-rolled window scan stopped one position early and so
    never matched an occurrence ending exactly at the end of `text`
    (e.g. 'owever' in 'However'); the built-in substring test covers it.
    """
    return s in text


def change_rebuttal(df):
    """Relabel spans that directly follow a Counterclaim and open with "however".

    A row is relabelled to 'Rebuttal' when the preceding row is a
    'Counterclaim', the row itself is not already a 'Rebuttal', and the first
    word of its span contains 'owever'.

    Args:
        df: frame with 'id', 'class' and 'predictionstring' columns.

    Returns:
        A copy of `df` with a fresh 0..n-1 index and the relabelled rows.

    Essays are read from the module-level `text_dir` (the folder the essays
    were loaded from) rather than a hard-coded test path, so debug runs on
    training essays work too. Each essay is read at most once.
    NOTE(review): the preceding row is not checked to belong to the same
    essay — confirm whether that is intended.
    """
    temp = df.reset_index(drop=True)
    words_by_id = {}  # essay id -> whitespace-split words of the essay

    for i in range(1, len(temp)):
        if temp.loc[i - 1, "class"] != 'Counterclaim' or temp.loc[i, "class"] == 'Rebuttal':
            continue

        span_words = temp.loc[i, "predictionstring"].split()
        if not span_words:
            continue  # empty span: there is no first word to inspect
        word_start = int(span_words[0])

        text_id = temp.loc[i, "id"]
        if text_id not in words_by_id:
            with open(text_dir + '/' + str(text_id) + '.txt', 'r') as f:
                text = f.read()
            # same normalisation as when the essays were tokenized, so word
            # indices line up
            text = text.replace(u'\xa0', u' ')
            words_by_id[text_id] = text.strip().split()

        # joining the one-element slice yields '' when the index is out of range
        first_word = ' '.join(words_by_id[text_id][word_start: word_start + 1])
        if is_in2('owever', first_word):
            print('Change!', temp.loc[i, "class"])
            temp.loc[i, "class"] = 'Rebuttal'
    return temp

In [8]:
def memory_used_to_str():
    """Describe the current process's memory use in GiB, rounded to 2 decimals.

    Reads the first field of psutil's `memory_info()` for this process.
    """
    used_bytes = psutil.Process(os.getpid()).memory_info()[0]
    return 'ram memory gb :' + str(np.round(used_bytes / 2. ** 30, 2))


# baseline reading before any model is loaded
print('start', memory_used_to_str())
##############################################################

def run_submit():
    """Run the whole ensemble over the loaded essays and write submission.csv.

    Stages:
      1. For every architecture group, tokenize the essays, then run each
         checkpoint of the group and collect per-token class probabilities.
      2. Per essay, spread token probabilities onto characters, average over
         all checkpoints, pool characters into words and decode spans.
      3. Apply thresholds and span-linking post-processing, then save the CSV.
      4. In debug mode, print per-class and macro F1 against `valid_df`.

    Reads the module-level configuration (net_type, checkpoint, df_text,
    max_length, is_cuda, is_amp, is_debug, ...) and returns nothing.
    """
    if is_debug: print("THIS IS DEBUG ####################################")
    # one entry per checkpoint, appended in (group, checkpoint) order
    results = []
    
    # NOTE(review): the group count is hard-coded to 3 rather than taken
    # from len(net_type)
    for net_type_num in range(3):
        Net, arch = net_type[net_type_num]
        net = Net(arch)
        if 'v3' in arch:
            from transformers.models.deberta_v2 import DebertaV2TokenizerFast
            tokenizer = DebertaV2TokenizerFast.from_pretrained(arch)
        else:
            tokenizer = AutoTokenizer.from_pretrained(arch)

        # essays are tokenized once per group; df_text is already ordered by
        # text length, so neighbouring samples have similar lengths
        test_samples = prepare_test_data(df_text, tokenizer)
        collate = Collate(tokenizer=tokenizer)
        valid_dataset = FeedbackDataset(test_samples, max_length, tokenizer)
        valid_loader  = DataLoader(
            valid_dataset,
            sampler = SequentialSampler(valid_dataset),
            batch_size  = 2, # 8
            drop_last   = False,
            num_workers = 2, 
            pin_memory  = False,
            collate_fn = collate,
        )    
        ######### checkpoint
        # the same network object is reused; only its weights are swapped
        for n in range(len(checkpoint[net_type_num])):
            net.load(checkpoint[net_type_num][n], weights_only=True)

            if is_cuda:
                net.cuda()
            print('load ok : [%d] %s'%(n, arch))
            print('              %s'%(checkpoint[net_type_num][n]))
            print('after allocate net %d'%n, memory_used_to_str())
            results_n = {
                'id':[],
                'token_mask':[],
                'token_offset':[],
                'probability':[],
            }

            T = 0  # number of essays processed so far (progress display only)
            start_timer = timer()
            for t, batch in enumerate(valid_loader):
                batch_size = len(batch['id'])
                token_mask = batch['token_mask']
                token_id   = batch['token_id']
                if is_cuda:
                    token_mask = token_mask.cuda()
                    token_id = token_id.cuda()

                net.eval()
                with torch.no_grad():
                    with amp.autocast(enabled=is_amp):

                        probability = data_parallel(net,(token_id, token_mask))
                        # probability = net[n](token_id, token_mask)
                        # probabilities are stored as bytes (scaled by 255) to save memory
                        pp = (probability[0] * 255).byte().data.cpu().numpy()
                        # crop or zero-pad the token axis to exactly max_length so
                        # all batches can be concatenated later
                        if pp.shape[1] > max_length:
                            pp = pp[:, :max_length, :]
                        else:
                            pp = np.pad(pp,((0, 0), (0, max_length - pp.shape[1]), (0, 0)),'constant', constant_values=0) 
                        #probability = 1
                        #pp = np.random.randint(0,255,size=[len(batch['token_offset']), max_length, 15]).astype('int8')
                        results_n['probability'].append( pp )
                        # offsets depend only on the tokenizer, so they are kept for
                        # the first checkpoint of each group only; the dataset ships
                        # them as strings, hence the eval round trip
                        if n == 0:
                            results_n['token_offset' ] += [eval(x) for x in batch['token_offset']]
                        T += batch_size

                print('\r\t%d/%d  %s'%(T, len(valid_dataset), time_to_str(timer() - start_timer,'sec')),end='',flush=True)

            #----------------------------
            if is_cuda: torch.cuda.empty_cache()
            print('')
            if n == 0:
                results.append({
                    'probability' : np.concatenate(results_n['probability']),
                    'token_offset': np.array(results_n['token_offset'], object)
                })
            else:
                 results.append({
                    'probability' : np.concatenate(results_n['probability']),
                })           

            # NOTE(review): `probability` and `pp` only exist if the loader
            # produced at least one batch — confirm the test folder is never empty
            del probability, pp, results_n
            gc.collect()
            print('after gc.collect()', memory_used_to_str())
            print()
        #------------------------------------------------------------------------
        del net, test_samples, Net, tokenizer
        gc.collect()
        print('after gc.collect()', memory_used_to_str())
        print()   
        ##############################################################

    
    ##### concat
    # Row i of every probability array is assumed to belong to df_text.iloc[i]
    # (same id order, sequential sampler).
    submit_df = []
    for i in range(num_valid):
        d  = df_text.iloc[i]
        id = d.id
        text = d.text
        word, word_offset = text_to_word(text)
        # per-character accumulator of class probabilities
        token_to_text_probability = np.full((len(text),num_discourse_marker),0, np.float32)
        for j in range(num_net):
            # drop position 0 (the CLS token the dataset prepends) so that p[t]
            # lines up with the t-th offset; rescale bytes back to [0, 1]
            p = results[j]['probability'][i][1:]/255  
            # each branch uses the offsets stored with the first checkpoint of
            # the group that checkpoint j belongs to
            if j < num_net1:
                for t,(start,end) in enumerate(results[0]['token_offset'][i]):
                    if t==max_length-1: break #assume max_length, else use token_mask to get length
                    token_to_text_probability[start:end]+=p[t] #**0.5     
            elif j < num_net2:
                for t,(start,end) in enumerate(results[num_net1]['token_offset'][i]):
                    if t==max_length-1: break #assume max_length, else use token_mask to get length
                    token_to_text_probability[start:end]+=p[t] #**0.5  
            else:
                for t,(start,end) in enumerate(results[num_net2]['token_offset'][i]):
                    if t==max_length-1: break #assume max_length, else use token_mask to get length
                    token_to_text_probability[start:end]+=p[t] #**0.5  
            
        # simple average over all checkpoints
        token_to_text_probability = token_to_text_probability/num_net
        
        # pool character probabilities into one vector per word
        text_to_word_probability = np.full((len(word),num_discourse_marker),0, np.float32)
        for t,(start,end) in enumerate(word_offset):
            text_to_word_probability[t]=token_to_text_probability[start:end].mean(0)

        predict_df = word_probability_to_predict_df(text_to_word_probability, id)
        submit_df.append(predict_df)
    print('')

    #----------------------------------------
    # post-processing: thresholds, rebuttal relabelling, span linking
    submit_df = pd.concat(submit_df).reset_index(drop=True) 
    submit_df = do_threshold(submit_df, use=['length', 'probability'])
    
    submit_df = change_rebuttal(submit_df)
    submit_df = link_evidence(submit_df)
    for c in  ['Lead', 'Position', 'Concluding Statement', 'Counterclaim', 'Rebuttal']:
        submit_df = link_others(submit_df, c)
    
    submit_df.to_csv('submission.csv', index=False)
    print('----')
    print(submit_df.head())
    print('submission ok!----')
    if is_debug:
        # debug only: score against the labelled training rows
        f1_score = compute_lb_f1_score(submit_df, valid_df)
        print('f1 macro : %f\n' % np.mean([v for v in f1_score.values()]))
        for k,v in f1_score.items():
            print('%20s : %05f'%(k,v))
            
run_submit()

start ram memory gb :0.36


Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors


load ok : [0] ../input/debertalarge/
              ../input/feedbackdeberta/model/model_0.bin
after allocate net 0 ram memory gb :1.73
	5/5   0 min 01 sec
after gc.collect() ram memory gb :1.95

load ok : [1] ../input/debertalarge/
              ../input/feedbackdeberta/model/model_1.bin
after allocate net 1 ram memory gb :1.96
	5/5   0 min 01 sec
after gc.collect() ram memory gb :1.96

load ok : [2] ../input/debertalarge/
              ../input/feedbackdeberta/model/model_2.bin
after allocate net 2 ram memory gb :1.96
	5/5   0 min 01 sec
after gc.collect() ram memory gb :1.96

load ok : [3] ../input/debertalarge/
              ../input/feedbackdeberta/model/model_3.bin
after allocate net 3 ram memory gb :1.96
	5/5   0 min 01 sec
after gc.collect() ram memory gb :1.96

load ok : [4] ../input/debertalarge/
              ../input/feedbackdeberta/model/model_4.bin
after allocate net 4 ram memory gb :1.96
	5/5   0 min 01 sec
after gc.collect() ram memory gb :1.96

after gc.collect() ram me

Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors


load ok : [0] ../input/debertalarge/
              ../input/mybirdsmodel/dfe1.bin
after allocate net 0 ram memory gb :3.3
	5/5   0 min 01 sec
after gc.collect() ram memory gb :3.3

load ok : [1] ../input/debertalarge/
              ../input/mybirdsmodel/dfe5.bin
after allocate net 1 ram memory gb :3.3
	5/5   0 min 01 sec
after gc.collect() ram memory gb :3.3

after gc.collect() ram memory gb :3.3



Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors


load ok : [0] ../input/deberta-xlarge/
              ../input/mybirdsmodel/dx0.bin
after allocate net 0 ram memory gb :4.86
	5/5   0 min 02 sec
after gc.collect() ram memory gb :4.86

load ok : [1] ../input/deberta-xlarge/
              ../input/mybirdsmodel/dx1.bin
after allocate net 1 ram memory gb :4.65
	5/5   0 min 02 sec
after gc.collect() ram memory gb :4.65

after gc.collect() ram memory gb :4.65


----
             id                 class  \
0  D46BCB48440A  Concluding Statement   
1  D72CB1C11673  Concluding Statement   
2  DF920E0A7337  Concluding Statement   
3  18409261F5C2  Concluding Statement   
4  0FB0700DAF44  Concluding Statement   

                                    predictionstring  
0  306 307 308 309 310 311 312 313 314 315 316 31...  
1  364 365 366 367 368 369 370 371 372 373 374 37...  
2  620 621 622 623 624 625 626 627 628 629 630 63...  
3  989 990 991 992 993 994 995 996 997 998 999 10...  
4  560 561 562 563 564 565 566 567 568 569 570 57...  
submissio