## Installation

In [2]:
# Uncomment to install the required packages on a fresh environment.
# The outputs recorded in this notebook report transformers_version 4.20.0 in the saved model configs.
#!pip install -qq transformers genomic-benchmarks datasets

In [3]:
### PARAMETERS

# K in K-MERS: default k-mer length for the helpers below; also selects the tokenizer armheb/DNA_bert_{K}
K = 4
# BENCHMARK DATASET: name handed to genomic_benchmarks' download_dataset
BENCHMARK = "human_nontata_promoters"
# NUMBER OF EPOCHS FOR LM TRAINING (used by both masked-LM runs: stride K and stride 1)
LM_EPOCHS = 1
# FINE TUNING EPOCHS (used by both sequence-classification runs)
CLS_EPOCHS = 4

In [4]:
from genomic_benchmarks.loc2seq import download_dataset

# Download the benchmark files; the call returns the local directory they were stored in.
download_dataset(BENCHMARK, version=0)

  from tqdm.autonotebook import tqdm
Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /root/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:00<00:00, 133MB/s]


PosixPath('/root/.genomic_benchmarks/human_nontata_promoters')

In [5]:
from transformers import AutoTokenizer

# Only the tokenizer is taken from the armheb/DNA_bert_<K> checkpoint;
# the DeBERTa models further down are built from a fresh config.
tokenizer = AutoTokenizer.from_pretrained(f"armheb/DNA_bert_{K}")

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset

# Corpus used for language-model pretraining; later cells read its 'train' and 'test' splits
# and its single 'Seq' column.
splitted_datasets = load_dataset("simecek/Human_DNA_v0")

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Using custom data configuration simecek--Human_DNA_v0-d7be3fc44fadbb72


Downloading and preparing dataset None/None (download: 1.25 GiB, generated: 2.73 GiB, post-processed: Unknown size, total: 3.98 GiB) to /root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-d7be3fc44fadbb72/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-d7be3fc44fadbb72/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from datasets import DatasetDict

# Keep only the first 50,000 training and 5,000 test sequences
# (presumably to keep the run time manageable -- TODO confirm the intended sizes).
tiny_datasets = DatasetDict({'train': splitted_datasets['train'].select(range(50000)),
                              'test': splitted_datasets['test'].select(range(5000))
                           })

# NOTE: rebinding the name means every later cell works on the subset, not the full corpus.
splitted_datasets = tiny_datasets



## Training LM - Stride K

In [8]:
splitted_datasets

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 5000
    })
})

In [9]:
def kmers(s, k=K):
    """Cut ``s`` into consecutive, non-overlapping pieces of length ``k``.

    Windows start at 0, k, 2k, ...; a trailing fragment shorter than ``k``
    is left out of the result.
    """
    pieces = []
    for start in range(0, len(s), k):
        end = start + k
        if end <= len(s):
            pieces.append(s[start:end])
    return pieces

def kmers_stride1(s, k=K):
    """Return every overlapping window of length ``k`` in ``s``, moving one character at a time."""
    last_start = len(s) - k
    windows = []
    for start in range(last_start + 1):
        windows.append(s[start:start + k])
    return windows

kmers("ATGGAAAGAGGCACCATTCT")    

['ATGG', 'AAAG', 'AGGC', 'ACCA', 'TTCT']

In [10]:
def tokenize_function(s, k=K):
  """Tokenize one record: split its 'Seq' string into non-overlapping k-mers and pass them, space-separated, to the tokenizer."""
  pieces = kmers(s['Seq'], k)
  return tokenizer(" ".join(pieces))

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 46, 236, 208, 29, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [11]:
# Tokenize every sequence using 4 worker processes and drop the raw 'Seq' column.
# The "longer than the specified maximum sequence length" warnings emitted here are expected:
# whole sequences are tokenized first and only cut into 512-token chunks by group_texts afterwards.
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets



        

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


      

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2502 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [12]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    """Flatten every column of a batch and re-cut it into rows of ``max_length`` items.

    ``examples`` maps column names to lists of token lists. Each column is
    concatenated into one long list and sliced into consecutive chunks. When
    the concatenated length reaches ``max_length`` the incomplete tail is
    dropped; a batch shorter than ``max_length`` is returned as one short chunk.
    The length is measured on the first column and applied to all of them.
    """
    # Flatten each column into a single list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    # Round down to a whole number of chunks (drops the small remainder).
    if usable_length >= max_length:
        usable_length = (usable_length // max_length) * max_length

    # Slice every column at the same offsets.
    offsets = range(0, usable_length, max_length)
    chunked = {}
    for column, tokens in flattened.items():
        chunked[column] = [tokens[start : start + max_length] for start in offsets]
    return chunked

# Apply group_texts batch by batch (default max_length=512); each output row is one chunk.
# Any remainder is dropped per batch, not once for the whole dataset.
# The description is a plain string: it has no placeholders, so no f-prefix is needed.
chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc="Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 244300
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24430
    })
})

In [13]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# DeBERTa configuration for a model built from scratch: vocabulary size taken from the
# k-mer tokenizer, 512 positions (the chunk length), 6 hidden layers.
model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
# Masked-language-modelling collator with a masking probability of 0.2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # directory where model checkpoints are saved
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # checkpoint at the end of every epoch
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each weight update
    per_device_eval_batch_size=32,  # evaluation batch size
    fp16=True,                      # half-precision training
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # keep at most 1 checkpoint on disk to save space
)

In [14]:
# Pretrain the masked LM on the stride-K chunks; the corpus 'test' split is the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 244300
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 3817


Epoch,Training Loss,Validation Loss
0,4.9067,4.905518


***** Running Evaluation *****
  Num examples = 24430
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-3817
Configuration saved in ./model/checkpoint-3817/config.json
Model weights saved in ./model/checkpoint-3817/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-3817 (score: 4.905517578125).


TrainOutput(global_step=3817, training_loss=4.92086175564293, metrics={'train_runtime': 2263.5574, 'train_samples_per_second': 107.927, 'train_steps_per_second': 1.686, 'total_flos': 3.2356932163928064e+16, 'train_loss': 4.92086175564293, 'epoch': 1.0})

In [15]:
# Save weights and config under "model_strideK"; the fine-tuning section loads the model from this directory.
model.save_pretrained("model_strideK")

trainer.evaluate()

Configuration saved in model_strideK/config.json
Model weights saved in model_strideK/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24430
  Batch size = 32


{'epoch': 1.0,
 'eval_loss': 4.907104969024658,
 'eval_runtime': 55.915,
 'eval_samples_per_second': 436.913,
 'eval_steps_per_second': 13.664}

## Training LM - Stride 1

In [16]:
def tokenize_function(s, k=K):
  """Tokenize one record using the overlapping (stride-1) k-mers of its 'Seq' string, joined by spaces.

  NOTE: shares its name with the stride-K variant defined earlier in the notebook,
  which it shadows from this cell onward.
  """
  windows = kmers_stride1(s['Seq'], k)
  return tokenizer(" ".join(windows))

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 46, 172, 163, 126, 236, 164, 129, 247, 208, 49, 182, 203, 29, 102, 137, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# Re-tokenize the corpus with the stride-1 tokenize_function (4 worker processes), dropping 'Seq'.
# This rebinds tokenized_datasets, replacing the stride-K version from the previous section.
# The "longer than the specified maximum sequence length" warnings are expected: chunking happens next.
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets

      

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


     

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


 

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9999 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [18]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    """Concatenate each column of a batch and split it into rows of ``max_length`` items.

    Redefinition of the helper from the stride-K section with the same behaviour:
    every column (a list of token lists) is flattened, the flattened length of the
    first column is rounded down to a multiple of ``max_length`` when it is at least
    that long, and all columns are sliced at the same offsets. A batch shorter than
    ``max_length`` comes back as a single short chunk.
    """
    column_names = list(examples.keys())

    # One long list per column.
    joined = {name: list(chain.from_iterable(examples[name])) for name in column_names}

    length = len(joined[column_names[0]])
    whole_chunks, _ = divmod(length, max_length)
    # Drop the incomplete tail once there is at least one full chunk.
    if whole_chunks >= 1:
        length = whole_chunks * max_length

    grouped = {}
    for name, tokens in joined.items():
        rows = []
        for begin in range(0, length, max_length):
            rows.append(tokens[begin : begin + max_length])
        grouped[name] = rows
    return grouped

# Chunk the stride-1 tokens with group_texts (default max_length=512), batch by batch.
# This rebinds chunked_datasets, replacing the stride-K chunks.
# The description is a plain string: it has no placeholders, so no f-prefix is needed.
chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc="Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 976450
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 97645
    })
})

In [19]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Fresh model for the stride-1 run, built from the same configuration as the stride-K model:
# vocabulary size from the tokenizer, 512 positions, 6 hidden layers.
model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
# Masked-language-modelling collator with a masking probability of 0.2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # directory where model checkpoints are saved (same path as the stride-K run)
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # checkpoint at the end of every epoch
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each weight update
    per_device_eval_batch_size=32,  # evaluation batch size
    fp16=True,                      # half-precision training
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # keep at most 1 checkpoint on disk to save space
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
# Pretrain the masked LM on the stride-1 chunks; the corpus 'test' split is the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 976450
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 15257


Epoch,Training Loss,Validation Loss
0,0.5964,0.394799


***** Running Evaluation *****
  Num examples = 97645
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-15257
Configuration saved in ./model/checkpoint-15257/config.json
Model weights saved in ./model/checkpoint-15257/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-15257 (score: 0.39479947090148926).


TrainOutput(global_step=15257, training_loss=3.127184149087112, metrics={'train_runtime': 9081.2051, 'train_samples_per_second': 107.524, 'train_steps_per_second': 1.68, 'total_flos': 1.2933448101258854e+17, 'train_loss': 3.127184149087112, 'epoch': 1.0})

In [21]:
# Save weights and config under "model_stride1"; the stride-1 fine-tuning section loads from this directory.
model.save_pretrained("model_stride1")

trainer.evaluate()

Configuration saved in model_stride1/config.json
Model weights saved in model_stride1/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 97645
  Batch size = 32


{'epoch': 1.0,
 'eval_loss': 0.3943907916545868,
 'eval_runtime': 224.5712,
 'eval_samples_per_second': 434.806,
 'eval_steps_per_second': 13.59}

## Finetuning - stride K

In [22]:
from transformers import DebertaForSequenceClassification

# Start fine-tuning from the stride-K weights saved earlier. The directory was written from a
# DebertaForMaskedLM, so the loader reports its LM-head weights (cls.predictions.*) as unused.
model_cls = DebertaForSequenceClassification.from_pretrained("model_strideK")

loading configuration file model_strideK/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 0,
  "vocab_size": 261
}

loading weights file model_strideK/pytorch_model.bin
Some weights of the model checkpoint at model_strideK were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.bias', 'c

In [23]:
from pathlib import Path

# Directory holding the downloaded benchmark. It is derived from BENCHMARK and the user's home
# directory instead of a hard-coded '/root/.genomic_benchmarks/human_nontata_promoters', so
# changing BENCHMARK (or running as a non-root user) reads the matching files.
# NOTE(review): assumes download_dataset used its default location, ~/.genomic_benchmarks/<name>.
benchmark_root = Path.home() / '.genomic_benchmarks' / BENCHMARK

# file stem -> (split name, label, sequence text); label is 1 for 'positive', 0 for 'negative'.
# NOTE: keys are file stems only, so a stem that occurs in two folders would be overwritten.
tmp_dict = {}

for dset in ['train', 'test']:
  for c in ['negative', 'positive']:
    for f in (benchmark_root / dset / c).glob('*.txt'):
      txt = f.read_text()
      tmp_dict[f.stem] = (dset, int(c == "positive"), txt)

In [24]:
import pandas as pd

# One row per sequence file, indexed by file stem, with columns dset / cat / seq.
# orient='index' builds the rows directly instead of creating a 3 x N frame and transposing it,
# which is faster and keeps 'cat' as an integer column rather than object.
df = pd.DataFrame.from_dict(tmp_dict, orient='index', columns=["dset", "cat", "seq"])
#df.to_pickle("human_nontata_promoters.pkl")
df

Unnamed: 0,dset,cat,seq
2038,train,0,CAAAGGGATCGATAAGCAGAGACCCCATGCTTCAGATCAAGAGCCT...
15715,train,0,CTCCCTCCACACCAGTCTCTACACTGCTGCCACAGTGATCTTTCTA...
12370,train,0,CCCAGGCAGGGAGAGGCCAGGGAGCCAAGAGTTTGAACCCAGTGCC...
3384,train,0,TGGACTAAACAAACAACAATCTTTTTAGAGGCAATCCCCACTTTCA...
9182,train,0,TGGTAGGTTTTCAGAGATTTTTAATGAAAAATTAAAAAAATTCCAG...
...,...,...,...
FP006168,test,1,CTACCATTAGAGGGAGATCTCCGAGCGCACACGGGAGCTCTTTCCC...
FP000845,test,1,ACAAGTATGCTTTCGCTTTAGGTAGGGCATTTGAGAGCAAAATGTA...
FP006398,test,1,GGGACTGCCCAGGGGGTTCCGAGATTCCTTCTCCCCTCCTATCACC...
FP001982,test,1,AAAATGGGCAAAGTACAAGAATAAGCAAAGAGTGAATAAATACAAA...


In [25]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the DataFrame to a Dataset; its index (the file stems) shows up as the
# '__index_level_0__' column that the tokenization cells remove.
ds = Dataset.from_pandas(df)

In [26]:
def tok_func(x):
    """Tokenize one benchmark row: split its "seq" string into non-overlapping k-mers (default k) and join them with spaces."""
    spaced = " ".join(kmers(x["seq"]))
    return tokenizer(spaced)

# Tokenize row by row, dropping the raw sequence and the index column,
# then rename 'cat' to 'labels' (the column name the Trainer output below reports metrics against).
tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [27]:
# Split the tokenized rows into train/test using the 'dset' value recorded when the files were read.
# 'dset' stays in both splits; the Trainer logs that it ignores this column.
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [28]:
# Fine-tuning hyper-parameters; `args` is reused by both the stride-K and the stride-1 classification runs.
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

# Cosine learning-rate schedule with a 0.1 warm-up ratio, half-precision training, evaluation after
# every epoch; the evaluation batch is twice the training batch. Checkpoints go to 'outputs'.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

PyTorch: setting up devices


In [29]:
import numpy as np

def compute_metrics(eval_preds):
    """Score predictions with the GLUE/MRPC metric.

    Parameters
    ----------
    eval_preds : a ``(logits, labels)`` pair as passed by the Trainer; the
        predicted class is the argmax over the last axis of ``logits``.

    Returns
    -------
    Whatever ``metric.compute`` returns for those predictions and labels
    (the training logs show ``accuracy`` and ``f1``).
    """
    # Load the metric on first use and keep it on the function object, so the
    # evaluation after every epoch does not reload it each time.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = load_metric("glue", "mrpc")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Trainer for the stride-K classifier: trains on the benchmark 'train' split, evaluates on 'test',
# and reports the values returned by compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [30]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4616,0.391146,0.828205,0.845909
2,0.343,0.344519,0.855767,0.870593
3,0.2929,0.310443,0.872371,0.881122
4,0.2606,0.325326,0.871264,0.880632


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-1500
Configuration saved in outputs/checkpoint-1500/config.json
Model weights saved in outputs/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving 

In [31]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'epoch': 4.0,
 'eval_accuracy': 0.8712641133495683,
 'eval_f1': 0.880632248794006,
 'eval_loss': 0.3253256678581238,
 'eval_runtime': 4.5376,
 'eval_samples_per_second': 1990.923,
 'eval_steps_per_second': 31.294}

## Finetuning - stride 1

In [32]:
# Rebind model_cls to a classifier initialised from the stride-1 pretrained weights saved earlier;
# as with the stride-K model, the saved LM-head weights are reported as unused on load.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 0,
  "vocab_size": 261
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.bias', 'c

In [33]:
def tok_func_stride1(x):
    """Tokenize one benchmark row from the overlapping (stride-1) k-mers of its "seq" string, joined with spaces."""
    spaced = " ".join(kmers_stride1(x["seq"]))
    return tokenizer(spaced)

# Re-tokenize the benchmark with stride-1 k-mers (rebinding tok_ds), dropping the raw sequence
# and the index column, then rename 'cat' to 'labels'.
tok_ds = ds.map(tok_func_stride1, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [34]:
# Rebuild the train/test splits from the stride-1 tokenization using the recorded 'dset' value.
# 'dset' stays in both splits; the Trainer logs that it ignores this column.
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [35]:
# Fine-tune the stride-1 classifier with the same TrainingArguments object as the stride-K run,
# so checkpoints are written to the same 'outputs' directory.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

trainer.train();

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4774,0.327502,0.871264,0.879619
2,0.3158,0.301979,0.875692,0.886874
3,0.2688,0.282856,0.886761,0.893338
4,0.2467,0.286603,0.886761,0.893648


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model

In [36]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'epoch': 4.0,
 'eval_accuracy': 0.8867611246402479,
 'eval_f1': 0.893647988356378,
 'eval_loss': 0.28660261631011963,
 'eval_runtime': 12.7456,
 'eval_samples_per_second': 708.795,
 'eval_steps_per_second': 11.141}