## Installation

In [None]:
#!pip install -qq transformers genomic-benchmarks datasets

In [8]:
### PARAMETERS
# Notebook-wide settings; later cells read these constants.

# Length of each k-mer: sets the size of the added vocabulary (4**K tokens)
# and is the default `k` of the k-mer splitting helpers.
K = 7
# Name of the genomic-benchmarks dataset passed to download_dataset().
BENCHMARK = "human_nontata_promoters"
# Number of epochs for masked-language-model pre-training.
LM_EPOCHS = 1
# Number of epochs for classification fine-tuning.
CLS_EPOCHS = 4

In [None]:
from genomic_benchmarks.loc2seq import download_dataset

download_dataset(BENCHMARK, version=0)

  from tqdm.autonotebook import tqdm
Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /root/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:00<00:00, 315MB/s]


PosixPath('/root/.genomic_benchmarks/human_nontata_promoters')

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer of the `armheb/DNA_bert_6` checkpoint; the K-mer tokens
# are added to it in the next cell.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

In [9]:
from itertools import product

# Enumerate every length-K string over the DNA alphabet and register each one
# as a tokenizer token; the cell output is the value returned by add_tokens.
alphabet = ('A', 'C', 'T', 'G')
vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]

tokenizer.add_tokens(vocab)

16384

In [None]:
from datasets import load_dataset

# Fetch the "simecek/Human_DNA_v0" dataset; later cells use its 'train' and
# 'test' splits and its 'Seq' column.
splitted_datasets = load_dataset("simecek/Human_DNA_v0")

Using custom data configuration simecek--Human_DNA_v0-d7be3fc44fadbb72
Reusing dataset parquet (/root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-d7be3fc44fadbb72/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from datasets import DatasetDict

# Keep only the first 50,000 training and 5,000 test rows, then rebind
# `splitted_datasets` so the following cells work on the reduced data.
tiny_datasets = DatasetDict({
    split: splitted_datasets[split].select(range(n_rows))
    for split, n_rows in (('train', 50000), ('test', 5000))
})

splitted_datasets = tiny_datasets



## Training LM - Stride K

In [None]:
splitted_datasets

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 5000
    })
})

In [11]:
def kmers(s, k=K):
    """Split `s` into consecutive, non-overlapping substrings of length `k`.

    Trailing characters that do not fill a whole k-mer are dropped.
    """
    # The last admissible start index is len(s) - k, hence the exclusive stop.
    stop = len(s) - k + 1
    return [s[start:start + k] for start in range(0, stop, k)]

def kmers_stride1(s, k=K):
    """Return every length-`k` substring of `s`, sliding one position at a time."""
    n_windows = len(s) - k + 1
    return [s[start:start + k] for start in range(n_windows)]

kmers("ATGGAAAGAGGCACCATTCT")    

['ATGGAAA', 'GAGGCAC']

In [None]:
def tokenize_function(s, k=K):
  """Tokenize one example: s['Seq'] is cut into non-overlapping k-mers joined by spaces."""
  pieces = kmers(s['Seq'], k)
  return tokenizer(" ".join(pieces))

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 5563, 19575, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Tokenize the splits with 4 worker processes and drop the raw 'Seq' column.
# No truncation is applied here (hence the sequence-length warnings below);
# the next cell re-cuts the tokens into 512-item chunks with group_texts.
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets



      

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


 

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


      

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


  

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [None]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size chunks.

    Args:
        examples: mapping of column name -> list of per-example lists
            (e.g. 'input_ids', 'attention_mask').
        max_length: number of items per output chunk.

    Returns:
        dict with the same keys, each holding a list of chunks of length
        `max_length`. The leftover tail is dropped, except when the whole
        batch is shorter than `max_length`, in which case it is returned
        as a single shorter chunk.
    """
    # Flatten each column into one long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The usable length is measured on the first column only.
    first_column = list(examples.keys())[0]
    usable = len(flattened[first_column])
    if usable >= max_length:
        # Round down to a whole number of chunks.
        usable -= usable % max_length

    chunk_starts = range(0, usable, max_length)
    return {
        column: [tokens[start:start + max_length] for start in chunk_starts]
        for column, tokens in flattened.items()
    }

# Re-chunk the tokenized splits, batch by batch, into 512-item examples.
chunked_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    desc="Grouping texts in chunks of 512",
)
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 139600
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 13960
    })
})

In [None]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Build a 6-layer DeBERTa masked-LM from a fresh config (no pretrained weights are
# loaded here); vocab_size is taken from the tokenizer's current vocabulary.
model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
# MLM collator configured with mlm_probability=0.2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # directory where model checkpoints are saved
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # save a checkpoint at the end of every epoch
    overwrite_output_dir=True,      # allow reusing an existing output_dir
    num_train_epochs=LM_EPOCHS,            # number of pre-training epochs (see PARAMETERS)
    per_device_train_batch_size=32, # training batch size per device; raise it as far as GPU memory allows
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each weight update
    per_device_eval_batch_size=32,  # evaluation batch size per device
    fp16=True,                      # half-precision (mixed) training
    load_best_model_at_end=True,  # reload the best checkpoint (in terms of loss) when training finishes
    save_total_limit=1           # keep at most 1 checkpoint on disk to save space
)

In [None]:
# Pre-train the stride-K masked LM on the chunked training split, evaluating
# on the chunked test split as configured in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

# Run pre-training; the returned TrainOutput is shown as the cell result.
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 139600
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 2181


Epoch,Training Loss,Validation Loss
0,8.5229,8.526708


***** Running Evaluation *****
  Num examples = 13960
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-2181
Configuration saved in ./model/checkpoint-2181/config.json
Model weights saved in ./model/checkpoint-2181/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-2181 (score: 8.526707649230957).


TrainOutput(global_step=2181, training_loss=8.62954671335898, metrics={'train_runtime': 1492.4955, 'train_samples_per_second': 93.535, 'train_steps_per_second': 1.461, 'total_flos': 1.8497136606511104e+16, 'train_loss': 8.62954671335898, 'epoch': 1.0})

In [None]:
# Persist the stride-K language model; the fine-tuning section reloads it
# from this "model_strideK" directory.
model.save_pretrained("model_strideK")

# Report the loss on the evaluation (test) split.
trainer.evaluate()

Configuration saved in model_strideK/config.json
Model weights saved in model_strideK/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 13960
  Batch size = 32


{'epoch': 1.0,
 'eval_loss': 8.523029327392578,
 'eval_runtime': 40.671,
 'eval_samples_per_second': 343.242,
 'eval_steps_per_second': 10.745}

## Training LM - Stride 1

In [None]:
def tokenize_function(s, k=K):
  """Tokenize one example using overlapping (stride-1) k-mers of s['Seq'].

  Note: this rebinds the `tokenize_function` name defined in the stride-K section.
  """
  pieces = kmers_stride1(s['Seq'], k)
  return tokenizer(" ".join(pieces))

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 5563, 9952, 11124, 15809, 18166, 11212, 16161, 19575, 16846, 5929, 11415, 16973, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Re-tokenize the splits with the stride-1 tokenize_function (4 worker
# processes), dropping the raw 'Seq' column; this overwrites the stride-K
# `tokenized_datasets`.
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets

      

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

  

Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors


      

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

  

Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors


#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9996 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [18]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size chunks.

    This repeats the `group_texts` definition from the stride-K section.

    Args:
        examples: mapping of column name -> list of per-example lists.
        max_length: number of items per output chunk.

    Returns:
        dict with the same keys, each holding a list of `max_length`-sized
        chunks. The leftover tail is dropped unless the whole batch is
        shorter than `max_length`; then one shorter chunk is returned.
    """
    # Flatten each column into one long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The usable length is measured on the first column only.
    first_column = list(examples.keys())[0]
    usable = len(flattened[first_column])
    if usable >= max_length:
        # Round down to a whole number of chunks.
        usable -= usable % max_length

    chunk_starts = range(0, usable, max_length)
    return {
        column: [tokens[start:start + max_length] for start in chunk_starts]
        for column, tokens in flattened.items()
    }

# Re-chunk the stride-1 tokenized splits, batch by batch, into 512-item examples.
chunked_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    desc="Grouping texts in chunks of 512",
)
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 976150
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 97615
    })
})

In [19]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Build a new 6-layer DeBERTa masked-LM from a fresh config for the stride-1 run
# (no pretrained weights are loaded here); this rebinds `model`.
model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
# MLM collator configured with mlm_probability=0.2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # directory where model checkpoints are saved (same as the stride-K run)
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # save a checkpoint at the end of every epoch
    overwrite_output_dir=True,      # allow reusing an existing output_dir
    num_train_epochs=LM_EPOCHS,            # number of pre-training epochs (see PARAMETERS)
    per_device_train_batch_size=32, # training batch size per device; raise it as far as GPU memory allows
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each weight update
    per_device_eval_batch_size=32,  # evaluation batch size per device
    fp16=True,                      # half-precision (mixed) training
    load_best_model_at_end=True,  # reload the best checkpoint (in terms of loss) when training finishes
    save_total_limit=1           # keep at most 1 checkpoint on disk to save space
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
# Pre-train the stride-1 masked LM on the chunked training split, evaluating
# on the chunked test split as configured in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

# Run pre-training; the returned TrainOutput is shown as the cell result.
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 976150
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 15252


Epoch,Training Loss,Validation Loss
0,5.4934,4.964247


***** Running Evaluation *****
  Num examples = 97615
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-15252
Configuration saved in ./model/checkpoint-15252/config.json
Model weights saved in ./model/checkpoint-15252/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-15252 (score: 4.964247226715088).


TrainOutput(global_step=15252, training_loss=7.5020346892902365, metrics={'train_runtime': 10459.6471, 'train_samples_per_second': 93.325, 'train_steps_per_second': 1.458, 'total_flos': 1.2935274072558797e+17, 'train_loss': 7.5020346892902365, 'epoch': 1.0})

In [None]:
# Persist the stride-1 language model; the fine-tuning section reloads it
# from this "model_stride1" directory.
model.save_pretrained("model_stride1")

# Report the loss on the evaluation (test) split.
trainer.evaluate()

## Finetuning - stride K

In [1]:
from transformers import DebertaForSequenceClassification

# Initialise the classifier from the saved stride-K language model; the MLM head
# weights in that checkpoint are not used (see the warning printed below).
model_cls = DebertaForSequenceClassification.from_pretrained("model_strideK")

Some weights of the model checkpoint at model_strideK were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at model_strideK and a

In [2]:
from pathlib import Path

# Collect every sequence file of the benchmark into
# {file stem: (split, label, sequence text)}; label is 1 for 'positive', 0 for 'negative'.
# The directory is derived from BENCHMARK so it always matches the dataset fetched
# by download_dataset(), rather than repeating the dataset name as a literal.
# NOTE(review): entries are keyed by file stem only, so two files sharing a stem
# across splits/classes would overwrite each other -- confirm stems are unique.
tmp_dict = {}

for dset in ['train', 'test']:
  for c in ['negative', 'positive']:
    for f in Path(f'/root/.genomic_benchmarks/{BENCHMARK}/{dset}/{c}/').glob('*.txt'):
      txt = f.read_text()
      tmp_dict[f.stem] = (dset, int(c == "positive"), txt)

In [3]:
import pandas as pd

# Build one row per sequence file directly (orient='index') with named columns,
# instead of creating a 3 x N frame and transposing it; this also keeps the
# integer `cat` labels from being coerced to object dtype by the transpose.
df = pd.DataFrame.from_dict(tmp_dict, orient='index', columns=["dset", "cat", "seq"])
#df.to_pickle("human_nontata_promoters.pkl")
df

Unnamed: 0,dset,cat,seq
13234,train,0,TTTGCATTTTTAGTAGAGATGAGGTTTCGCCATGTTGGCCAGGCTG...
6699,train,0,CAGGGCCTCACTGTGAGCTCAGCCCCTGAACAGGCTCTGCTTCCCA...
7079,train,0,GTTAAGTTCATGTCATAGGAAGGGGATAAGTAACAGGGTACAGTTT...
10286,train,0,TCTCTTTTCTCTACCCTTTTCCTTCCTTTTTCCTCCCTCTCCCCAT...
6475,train,0,TATTCCAGGCAGGAAATAGGTAGACATAGGTCACCAAGTGGCAGCC...
...,...,...,...
FP001510,test,1,AATTTTGAGAAAAAAATTAATAAGAAAATAAACCTAGAAAACCACT...
FP014705,test,1,TAAATGTTAGGTAAAAGTTAATCATAACACTGTACACTGTTATGCC...
FP004438,test,1,TCTTATATGTTTTAATTCAGTTCATATTTATAATTTATATAAGCAG...
FP007574,test,1,TAAGGCTCGCGTCCGGGCCAGGCGGGTACCCCTAGTACTCTCTCCC...


In [4]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the frame to a Hugging Face Dataset; the tokenization cell later
# removes the '__index_level_0__' column along with the raw 'seq' text.
ds = Dataset.from_pandas(df)

In [12]:
def tok_func(x):
    """Tokenize one benchmark row as space-separated non-overlapping k-mers of x["seq"]."""
    pieces = kmers(x["seq"])
    return tokenizer(" ".join(pieces))

# Tokenize every row, drop the index and raw-text columns, and rename the
# class column to 'labels'.
tok_ds = (
    ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [13]:
# Split the tokenized rows back into the benchmark's own train/test partitions
# using the 'dset' value recorded when the files were read.
# NOTE(review): 'dset' is kept as a column in both splits; the Trainer log below
# reports it as an ignored column.
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [15]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters.
bs = 32  # per-device training batch size (evaluation uses bs*2)
epochs = CLS_EPOCHS  # number of fine-tuning epochs (see PARAMETERS)
lr = 8e-5  # learning rate passed to TrainingArguments

# Cosine schedule with warmup_ratio=0.1, fp16, evaluation after every epoch;
# outputs go to the 'outputs' directory and report_to is set to 'none'.
# These arguments are reused by both fine-tuning runs in this notebook.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [16]:
import numpy as np

def compute_metrics(eval_preds):
    """Score predictions with the GLUE/MRPC metric.

    `eval_preds` unpacks into (logits, labels); the predicted class is the
    argmax over the last axis of the logits. Returns whatever the metric's
    `compute` returns (accuracy and F1 in the training log of this notebook).
    """
    glue_metric = load_metric("glue", "mrpc")
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=references)

# Fine-tune the stride-K classifier: train on the benchmark train split and
# evaluate on its test split, scoring with compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [18]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4858,0.386139,0.826433,0.839869
2,0.2899,0.355133,0.845694,0.854336
3,0.1433,0.510869,0.842041,0.851648
4,0.0786,0.640833,0.839717,0.85167


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model

In [19]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'epoch': 4.0,
 'eval_accuracy': 0.8397166260792561,
 'eval_f1': 0.8516697398074166,
 'eval_loss': 0.6408330202102661,
 'eval_runtime': 3.8537,
 'eval_samples_per_second': 2344.239,
 'eval_steps_per_second': 36.848}

## Finetuning - stride 1

In [20]:
# Initialise a new classifier from the saved stride-1 language model; this
# rebinds `model_cls`, replacing the stride-K classifier.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 20485
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.pre

In [21]:
def tok_func_stride1(x):
    """Tokenize one benchmark row as space-separated overlapping (stride-1) k-mers of x["seq"]."""
    pieces = kmers_stride1(x["seq"])
    return tokenizer(" ".join(pieces))

# Tokenize every row, drop the index and raw-text columns, and rename the
# class column to 'labels'; this overwrites the stride-K `tok_ds`.
tok_ds = (
    ds.map(tok_func_stride1, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [22]:
# Split the stride-1 tokenized rows into the benchmark's train/test partitions
# using the 'dset' column; this rebinds `dds` from the stride-K section.
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [23]:
# Same fine-tuning setup as the stride-K run (shared `args` and compute_metrics),
# now with the stride-1 classifier and the stride-1 tokenized data.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

# Trailing ';' suppresses the TrainOutput repr in the cell output.
trainer.train();

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4558,0.265131,0.894731,0.903638
2,0.1699,0.19479,0.932809,0.937066
3,0.0623,0.281807,0.921076,0.929691
4,0.0302,0.259884,0.938344,0.943006


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model

In [24]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'epoch': 4.0,
 'eval_accuracy': 0.9383440336506531,
 'eval_f1': 0.9430062416862786,
 'eval_loss': 0.25988417863845825,
 'eval_runtime': 13.079,
 'eval_samples_per_second': 690.728,
 'eval_steps_per_second': 10.857}