## Installation

In [1]:
#!pip install -qq transformers genomic-benchmarks datasets

In [2]:
### PARAMETERS

# K in K-MERS
K = 9
# BENCHMARK DATASET
BENCHMARK = "human_nontata_promoters"
# NUMBER OF EPOCHS FOR LM TRAINING
LM_EPOCHS = 1
# FINE TUNING EPOCHS
CLS_EPOCHS = 4

In [3]:
from genomic_benchmarks.loc2seq import download_dataset

download_dataset(BENCHMARK, version=0)

  from tqdm.autonotebook import tqdm
Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /root/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:00<00:00, 285MB/s]


PosixPath('/root/.genomic_benchmarks/human_nontata_promoters')

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(f"armheb/DNA_bert_6")

In [4]:
from itertools import product

alphabet = ('A', 'C', 'T', 'G')
vocab = list(map(''.join, product(alphabet, repeat=K)))

tokenizer.add_tokens(vocab)

262144

In [6]:
from datasets import load_dataset

splitted_datasets = load_dataset("simecek/Human_DNA_v0")

Using custom data configuration simecek--Human_DNA_v0-d7be3fc44fadbb72
Reusing dataset parquet (/root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-d7be3fc44fadbb72/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from datasets import DatasetDict

tiny_datasets = DatasetDict({'train': splitted_datasets['train'].select(range(50000)),
                              'test': splitted_datasets['test'].select(range(5000))
                           })

splitted_datasets = tiny_datasets



## Training LM - Stride K

In [8]:
splitted_datasets

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 5000
    })
})

In [5]:
def kmers(s, k=K):
    return [s[i:i + k] for i in range(0, len(s), k) if i + k <= len(s)]

def kmers_stride1(s, k=K):
    return [s[i:i + k] for i in range(0, len(s)-k+1)]

kmers("ATGGAAAGAGGCACCATTCT")    

['ATGGAAAGA', 'GGCACCATT']

In [10]:
def tokenize_function(s, k=K):
  seq_split = " ".join(kmers(s['Seq'], k))
  return tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 27508, 33357, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [11]:
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets



       

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


       

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


 

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [12]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc=f"Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 108650
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10865
    })
})

In [13]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="epoch",    
    save_strategy="epoch",
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=4, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=16,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=4,  # evaluation batch size
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # whether you don't have much space so you let only 5 model weights saved in the disk
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 108650
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 1697


Epoch,Training Loss,Validation Loss
0,11.0198,10.878701


***** Running Evaluation *****
  Num examples = 10865
  Batch size = 4
Saving model checkpoint to ./model/checkpoint-1697
Configuration saved in ./model/checkpoint-1697/config.json
Model weights saved in ./model/checkpoint-1697/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-1697 (score: 10.878701210021973).


TrainOutput(global_step=1697, training_loss=11.23101817430576, metrics={'train_runtime': 3726.3355, 'train_samples_per_second': 29.157, 'train_steps_per_second': 0.455, 'total_flos': 1.4474312122564608e+16, 'train_loss': 11.23101817430576, 'epoch': 1.0})

In [15]:
model.save_pretrained("model_strideK")

trainer.evaluate()

Configuration saved in model_strideK/config.json
Model weights saved in model_strideK/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10865
  Batch size = 4


{'epoch': 1.0,
 'eval_loss': 10.878080368041992,
 'eval_runtime': 130.2846,
 'eval_samples_per_second': 83.394,
 'eval_steps_per_second': 20.854}

## Training LM - Stride 1

In [16]:
def tokenize_function(s, k=K):
  seq_split = " ".join(kmers_stride1(s['Seq'], k))
  return tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 27508, 97729, 116470, 191436, 229153, 117879, 197070, 251689, 208023, 33357, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets

        

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors


      

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9994 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [18]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc=f"Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 975950
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 97595
    })
})

In [19]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="epoch",    
    save_strategy="epoch",
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=4, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=16,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=4,  # evaluation batch size
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # whether you don't have much space so you let only 5 model weights saved in the disk
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 975950
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 15249


Epoch,Training Loss,Validation Loss
0,6.2635,6.060956


***** Running Evaluation *****
  Num examples = 97595
  Batch size = 4
Saving model checkpoint to ./model/checkpoint-15249
Configuration saved in ./model/checkpoint-15249/config.json
Model weights saved in ./model/checkpoint-15249/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-15249 (score: 6.06095552444458).


TrainOutput(global_step=15249, training_loss=8.652806342769916, metrics={'train_runtime': 33157.6888, 'train_samples_per_second': 29.434, 'train_steps_per_second': 0.46, 'total_flos': 1.3006410462992794e+17, 'train_loss': 8.652806342769916, 'epoch': 1.0})

In [21]:
model.save_pretrained("model_stride1")

trainer.evaluate()

Configuration saved in model_stride1/config.json
Model weights saved in model_stride1/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 97595
  Batch size = 4


{'epoch': 1.0,
 'eval_loss': 6.060589790344238,
 'eval_runtime': 1165.9281,
 'eval_samples_per_second': 83.706,
 'eval_steps_per_second': 20.927}

## Finetuning - stride K

In [20]:
from transformers import DebertaForSequenceClassification

model_cls = DebertaForSequenceClassification.from_pretrained("model_strideK")

loading configuration file model_strideK/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 266245
}

loading weights file model_strideK/pytorch_model.bin
Some weights of the model checkpoint at model_strideK were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.weight

In [21]:
from pathlib import Path

tmp_dict = {}

for dset in ['train', 'test']:
  for c in ['negative', 'positive']:
    for f in Path(f'/root/.genomic_benchmarks/human_nontata_promoters/{dset}/{c}/').glob('*.txt'):
      txt = f.read_text()
      tmp_dict[f.stem] = (dset, int(c == "positive"), txt)

In [22]:
import pandas as pd

df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})
#df.to_pickle("human_nontata_promoters.pkl")
df

Unnamed: 0,dset,cat,seq
13234,train,0,TTTGCATTTTTAGTAGAGATGAGGTTTCGCCATGTTGGCCAGGCTG...
6699,train,0,CAGGGCCTCACTGTGAGCTCAGCCCCTGAACAGGCTCTGCTTCCCA...
7079,train,0,GTTAAGTTCATGTCATAGGAAGGGGATAAGTAACAGGGTACAGTTT...
10286,train,0,TCTCTTTTCTCTACCCTTTTCCTTCCTTTTTCCTCCCTCTCCCCAT...
6475,train,0,TATTCCAGGCAGGAAATAGGTAGACATAGGTCACCAAGTGGCAGCC...
...,...,...,...
FP001510,test,1,AATTTTGAGAAAAAAATTAATAAGAAAATAAACCTAGAAAACCACT...
FP014705,test,1,TAAATGTTAGGTAAAAGTTAATCATAACACTGTACACTGTTATGCC...
FP004438,test,1,TCTTATATGTTTTAATTCAGTTCATATTTATAATTTATATAAGCAG...
FP007574,test,1,TAAGGCTCGCGTCCGGGCCAGGCGGGTACCCCTAGTACTCTCTCCC...


In [23]:
from datasets import Dataset, DatasetDict, load_metric

ds = Dataset.from_pandas(df)

In [24]:
def tok_func(x): return tokenizer(" ".join(kmers(x["seq"])))

tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [25]:
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [26]:
from transformers import Trainer, TrainingArguments

bs = 4
epochs = CLS_EPOCHS
lr = 8e-5

args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none', save_steps=27000)

PyTorch: setting up devices


In [27]:
import numpy as np

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [28]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 27100


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5161,0.531542,0.804295,0.802811
2,0.6701,0.693113,0.557118,0.709841
3,0.688,0.692121,0.544056,0.70471
4,0.6854,0.688698,0.544056,0.70471


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 

In [29]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


{'epoch': 4.0,
 'eval_accuracy': 0.5440557892406465,
 'eval_f1': 0.7047100150548427,
 'eval_loss': 0.6886978149414062,
 'eval_runtime': 17.324,
 'eval_samples_per_second': 521.473,
 'eval_steps_per_second': 65.227}

## Finetuning - stride 1

In [8]:
from transformers import DebertaForSequenceClassification

model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at model_stride1 and a

In [16]:
def tok_func_stride1(x): return tokenizer(" ".join(kmers_stride1(x["seq"])))

tok_ds = ds.map(tok_func_stride1, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})



  0%|          | 0/36131 [00:00<?, ?ex/s]

In [17]:
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [18]:
import numpy as np

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

trainer.train();

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 27100


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4243,0.400327,0.905911,0.915908
2,0.4381,2.30821,0.552579,0.708622
3,0.2886,0.573013,0.896281,0.899647
4,0.275,0.61288,0.895395,0.898091


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 

In [19]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


{'epoch': 4.0,
 'eval_accuracy': 0.8953951737879123,
 'eval_f1': 0.8980912326108056,
 'eval_loss': 0.6128804683685303,
 'eval_runtime': 22.0653,
 'eval_samples_per_second': 409.421,
 'eval_steps_per_second': 51.212}