## Installation

In [6]:
#!pip install -qq genomic-benchmarks

In [7]:
### PARAMETERS

# K in K-MERS
K = 6
# BENCHMARK DATASET
BENCHMARK = "human_nontata_promoters"
# NUMBER OF EPOCHS FOR LM TRAINING
LM_EPOCHS = 1
# FINE TUNING EPOCHS
CLS_EPOCHS = 4

In [8]:
from genomic_benchmarks.loc2seq import download_dataset

download_dataset(BENCHMARK, version=0)

Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /home/jovyan/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:00<00:00, 80.5MB/s]


PosixPath('/home/jovyan/.genomic_benchmarks/human_nontata_promoters')

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(f"armheb/DNA_bert_{K}")

In [10]:
from datasets import load_dataset

splitted_datasets = load_dataset("simecek/Human_DNA_v0")

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Using custom data configuration simecek--Human_DNA_v0-3127ba11a87ac1a1


Downloading and preparing dataset None/None (download: 1.25 GiB, generated: 2.73 GiB, post-processed: Unknown size, total: 3.98 GiB) to /home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-3127ba11a87ac1a1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-3127ba11a87ac1a1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from datasets import DatasetDict

tiny_datasets = DatasetDict({'train': splitted_datasets['train'].select(range(50000)),
                              'test': splitted_datasets['test'].select(range(5000))
                           })

splitted_datasets = tiny_datasets



## Training LM - Stride K

In [12]:
splitted_datasets

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 5000
    })
})

In [13]:
def kmers(s, k=K):
    return [s[i:i + k] for i in range(0, len(s), k) if i + k <= len(s)]

def kmers_stride1(s, k=K):
    return [s[i:i + k] for i in range(0, len(s)-k+1)]

kmers("ATGGAAAGAGGCACCATTCT")    

['ATGGAA', 'AGAGGC', 'ACCATT']

In [14]:
def tokenize_function(s, k=K):
  seq_split = " ".join(kmers(s['Seq'], k))
  return tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 675, 2000, 393, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [15]:
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets

        

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


        

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [16]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc=f"Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 162850
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16285
    })
})

In [17]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="epoch",    
    save_strategy="epoch",
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=32,  # evaluation batch size
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # whether you don't have much space so you let only 5 model weights saved in the disk
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)



Using amp half precision backend


In [19]:
model.save_pretrained("model_strideK")

trainer.evaluate()

Configuration saved in model_strideK/config.json
Model weights saved in model_strideK/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 16285
  Batch size = 32


COMET INFO: Couldn't find a Git repository in '/home/jovyan' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/simecek/huggingface/aba6d9e21add46208391ebc78cda0560

Automatic Comet.ml online logging enabled


{'eval_loss': 8.464581489562988,
 'eval_runtime': 47.0879,
 'eval_samples_per_second': 345.843,
 'eval_steps_per_second': 10.81}

## Training LM - Stride 1

In [20]:
def tokenize_function(s, k=K):
  seq_split = " ".join(kmers_stride1(s['Seq'], k))
  return tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})

{'input_ids': [2, 675, 2686, 2540, 1956, 3713, 2551, 2000, 3889, 3254, 715, 2845, 3174, 393, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=4)
tokenized_datasets

        

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors


        

#1:   0%|          | 0/1250 [00:00<?, ?ex/s]

#2:   0%|          | 0/1250 [00:00<?, ?ex/s]

#0:   0%|          | 0/1250 [00:00<?, ?ex/s]

#3:   0%|          | 0/1250 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9997 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [22]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc=f"Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/50 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 976250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 97625
    })
})

In [23]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="epoch",    
    save_strategy="epoch",
    overwrite_output_dir=True,      
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=32,  # evaluation batch size
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # whether you don't have much space so you let only 5 model weights saved in the disk
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_datasets['train'],
    eval_dataset=chunked_datasets['test'],
)

trainer.train()

Using amp half precision backend
***** Running training *****
  Num examples = 976250
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 15254
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/simecek/huggingface/aba6d9e21add46208391ebc78cda0560
COMET INFO:   Metrics:
COMET INFO:     eval_loss               : 8.464581489562988
COMET INFO:     eval_runtime            : 47.0879
COMET INFO:     eval_samples_per_second : 345.843
COMET INFO:     eval_steps_per_second   : 10.81
COMET INFO:   Others:
COMET INFO:     Created from : transformers
COMET INFO:   Parameters:
COMET INFO:     args/_n_gpu                             : 1
COMET INFO:     args/_no_sync_in_g

Epoch,Training Loss,Validation Loss
1,0.8976,0.595562


***** Running Evaluation *****
  Num examples = 97625
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-15254
Configuration saved in ./model/checkpoint-15254/config.json
Model weights saved in ./model/checkpoint-15254/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-380000] due to args.save_total_limit
Deleting older checkpoint [model/checkpoint-390000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-15254 (score: 0.595561683177948).
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/simecek/huggingface/ce23c086086946bfb726124a6ef80b18
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     epoch [32]               : (0.03, 1.0)
COMET INFO:     eval_loss       

TrainOutput(global_step=15254, training_loss=4.995824088365078, metrics={'train_runtime': 10964.3805, 'train_samples_per_second': 89.038, 'train_steps_per_second': 1.391, 'total_flos': 1.2931977142656e+17, 'train_loss': 4.995824088365078, 'epoch': 1.0})

In [25]:
model.save_pretrained("model_stride1")

trainer.evaluate()

Configuration saved in model_stride1/config.json
Model weights saved in model_stride1/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 97625
  Batch size = 32


{'eval_loss': 0.5963960886001587,
 'eval_runtime': 277.867,
 'eval_samples_per_second': 351.337,
 'eval_steps_per_second': 10.98,
 'epoch': 1.0}

## Finetuning - stride K

In [26]:
from transformers import DebertaForSequenceClassification

model_cls = DebertaForSequenceClassification.from_pretrained("model_strideK")

loading configuration file model_strideK/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_strideK/pytorch_model.bin
Some weights of the model checkpoint at model_strideK were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.pred

In [29]:
from pathlib import Path

tmp_dict = {}

for dset in ['train', 'test']:
  for c in ['negative', 'positive']:
    for f in Path(f'/home/jovyan/.genomic_benchmarks/human_nontata_promoters/{dset}/{c}/').glob('*.txt'):
      txt = f.read_text()
      tmp_dict[f.stem] = (dset, int(c == "positive"), txt)

In [30]:
import pandas as pd

df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})
#df.to_pickle("human_nontata_promoters.pkl")
df

Unnamed: 0,dset,cat,seq
13497,train,0,CCATTGTAAGCAAAATGGATTATGAAAATTAATTTTACACAGGAAA...
4964,train,0,TTCACATCAATTGCTGCTTCAGGGATCACAGATTTTAGGGGCTCAT...
14233,train,0,GGAGTCAGAATATCCTGTTCCTCAACAGACTCTTTTACCTAGTGGT...
3266,train,0,GCTACTTGGTGAACTCTGGCATTGTTCCCATCTCGAGAAGTCTCAT...
8629,train,0,GAGCCTTCATTCTTGGTCAAGCTTTAGGCACATCTGAGTGAGTAGT...
...,...,...,...
FP003064,test,1,GAGGGTGATTAAGGTGAAGCGTTTCTTCCATTACTGGTAGAACGAA...
FP014906,test,1,CTCGATATTTTTTTTCTGAGATCTTCCCCTTGTTGTTAATACTAAT...
FP015371,test,1,TCTGTCCCCCTCCCCCTCCCTGTCCTTATCTTTCCCTTCTGTCTCC...
FP011060,test,1,GACGCCCCTTTGCTCCAGGCCCCGCCCCGGGGTCCCGCCTCCACCA...


In [31]:
from datasets import Dataset, DatasetDict, load_metric

ds = Dataset.from_pandas(df)

In [32]:
def tok_func(x): return tokenizer(" ".join(kmers(x["seq"])))

tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [33]:
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [34]:
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

PyTorch: setting up devices


In [35]:
import numpy as np

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using amp half precision backend


In [39]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2389,0.388517,0.850232,0.857143
2,0.1764,0.44178,0.844809,0.855553
3,0.1638,0.461455,0.842595,0.854809
4,0.1645,0.525967,0.826987,0.846328


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model

In [40]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'eval_loss': 0.5259673595428467,
 'eval_accuracy': 0.8269869382333407,
 'eval_f1': 0.8463277947104512,
 'eval_runtime': 2.9488,
 'eval_samples_per_second': 3063.606,
 'eval_steps_per_second': 48.155,
 'epoch': 4.0}

## Finetuning - stride 1

In [41]:
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.pred

In [42]:
def tok_func_stride1(x): return tokenizer(" ".join(kmers_stride1(x["seq"])))

tok_ds = ds.map(tok_func_stride1, batched=False, remove_columns=['__index_level_0__', 'seq'])
tok_ds = tok_ds.rename_columns({'cat':'labels'})

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [43]:
dds = DatasetDict({
    'train': tok_ds.filter(lambda x: x["dset"] == "train"),
    'test':  tok_ds.filter(lambda x: x["dset"] == "test")
})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [44]:
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

trainer.train();

Using amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4518,0.332948,0.845583,0.866392
2,0.2551,0.27318,0.887425,0.895789
3,0.1544,0.286161,0.89462,0.903311
4,0.106,0.343791,0.898162,0.90593


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model

In [45]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64


{'eval_loss': 0.34379133582115173,
 'eval_accuracy': 0.8981624972326766,
 'eval_f1': 0.9059304703476483,
 'eval_runtime': 12.3272,
 'eval_samples_per_second': 732.851,
 'eval_steps_per_second': 11.519,
 'epoch': 4.0}