## Installation

In [None]:
# One-time environment setup, left commented out. On a fresh runtime, uncomment both lines:
# the first installs the packages imported below, the second fetches custom_collator.py,
# from which a later cell imports `SubsequentCollator`.
#!pip install -qq transformers genomic-benchmarks datasets
#!wget http://raw.githubusercontent.com/ML-Bioinfo-CEITEC/cDNA-pretraining/main/experiments/custom_masking/custom_collator.py

In [None]:
### PARAMETERS

# K in K-MERS (length of every k-mer token; also selects the tokenizer checkpoint name)
K = 6
# BENCHMARK DATASETS to download from genomic_benchmarks (each has its own fine-tuning section)
BENCHMARKS = ["demo_human_or_worm", "demo_coding_vs_intergenomic_seqs", "human_nontata_promoters", "human_enhancers_cohn"]
# NUMBER OF EPOCHS FOR LM (masked language model) TRAINING
LM_EPOCHS = 1
# FINE TUNING EPOCHS (used for every classification benchmark)
CLS_EPOCHS = 4

In [None]:
from genomic_benchmarks.data_check.info import is_downloaded
from genomic_benchmarks.loc2seq import download_dataset

# Fetch every configured benchmark, skipping the ones that are already present locally.
for benchmark in BENCHMARKS:
    if is_downloaded(benchmark):
        continue
    download_dataset(benchmark, version=0)

  from tqdm.autonotebook import tqdm


In [None]:
from transformers import AutoTokenizer

# Load the k-mer tokenizer whose checkpoint name is derived from K
# (with K = 6 this resolves to "armheb/DNA_bert_6").
tokenizer = AutoTokenizer.from_pretrained(f"armheb/DNA_bert_{K}")

In [None]:
from datasets import load_dataset

# Pre-tokenized pretraining corpus with 'train' and 'test' splits.
# NOTE(review): the dataset name is hard-wired to the 6-mer / stride-1 tokenization, so
# changing K above presumably requires a different dataset — confirm before editing K.
full_datasets = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized_stride1")
full_datasets

Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized_stride1-904c556a2b3bf833
Reusing dataset parquet (/root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized_stride1-904c556a2b3bf833/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 572004
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5147942
    })
})

In [None]:
from datasets import DatasetDict

# Keep only the leading rows of each split so pretraining stays within the time budget:
# the first 2,000,000 training rows and the first 200,000 test rows.
train_subset = full_datasets['train'].select(range(2_000_000))
test_subset = full_datasets['test'].select(range(200_000))

tokenized_datasets = DatasetDict({'train': train_subset, 'test': test_subset})


In [None]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer
from custom_collator import SubsequentCollator

# DeBERTa configuration with 6 hidden layers whose vocabulary size matches the k-mer tokenizer.
model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=6)
# Masked-LM collator from the downloaded custom_collator.py (mlm_probability=0.15).
# NOTE(review): `mask_fully=True` presumably masks whole runs of overlapping k-mers — confirm in custom_collator.py.
data_collator = SubsequentCollator(tokenizer=tokenizer, mlm=True, mlm_probability=0.15, mask_fully=True)
# The model is built from the config only (no pretrained checkpoint is loaded here).
model = DebertaForMaskedLM(config=model_config)

training_args = TrainingArguments(
    output_dir='./model',          # directory where model checkpoints are written
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # checkpoint at the end of every epoch (matches the evaluation strategy)
    overwrite_output_dir=True,      # reuse ./model even if it already has content
    num_train_epochs=LM_EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each weight update
    per_device_eval_batch_size=32,  # evaluation batch size
    fp16=True,                      # mixed-precision training
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1           # keep at most 1 checkpoint on disk to save space
)

In [None]:
# Masked-LM pretraining: train on the subsampled 'train' split and evaluate on the
# subsampled 'test' split, batching with the custom masking collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 2000000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 31250


Epoch,Training Loss,Validation Loss
1,7.518,7.512715


***** Running Evaluation *****
  Num examples = 200000
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-31250
Configuration saved in ./model/checkpoint-31250/config.json
Model weights saved in ./model/checkpoint-31250/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-31] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-31250 (score: 7.512714862823486).


TrainOutput(global_step=31250, training_loss=7.6691539453125, metrics={'train_runtime': 20752.5273, 'train_samples_per_second': 96.374, 'train_steps_per_second': 1.506, 'total_flos': 2.64931670016e+17, 'train_loss': 7.6691539453125, 'epoch': 1.0})

In [None]:
# Persist the pretrained masked-LM model; every fine-tuning section reloads it from "model_stride1".
model.save_pretrained("model_stride1")

Configuration saved in model_stride1/config.json
Model weights saved in model_stride1/pytorch_model.bin


## Finetuning - stride 1 - human or worm?

In [None]:
from transformers import DebertaForSequenceClassification

# Start fine-tuning from the pretrained checkpoint saved in "model_stride1"; the masked-LM
# head weights stored in that checkpoint are not used by the classification model.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias

In [None]:
from pathlib import Path

# genomic_benchmarks caches its downloads under ~/.genomic_benchmarks. Resolve that folder
# from the current user's home instead of hard-coding /root, so the cell also works when
# the notebook is not run as root.
benchmark_dir = Path.home() / '.genomic_benchmarks' / 'demo_human_or_worm'

# Maps "<class> <file stem>" -> (split, label, sequence text); label is 1 for worm, 0 for human.
tmp_dict = {}

for dset in ['train', 'test']:
    for c in ['human', 'worm']:
        for f in (benchmark_dir / dset / c).glob('*.txt'):
            tmp_dict[c + " " + f.stem] = (dset, int(c == "worm"), f.read_text())

In [None]:
import pandas as pd

# One row per sequence file: the dict keys become the index, the tuple fields the columns.
column_names = {0: "dset", 1: "cat", 2: "seq"}
df = pd.DataFrame.from_dict(tmp_dict).transpose().rename(columns=column_names)
df

Unnamed: 0,dset,cat,seq
human 20927,train,0,TGGCAGGTATCATTTCCTCCAACTCATAATTTCCCACAGCAAGTCA...
human 28093,train,0,CCCATGTAACGCTGGTCACAGAAGGGATAAGTTGGGGCTTGCTGCA...
human 35804,train,0,CGGACGCTTCTCTGCATGGTCTTACGTGTGTCAGAGGAGCCCTGGC...
human 22502,train,0,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
human 21606,train,0,TATTTTTTAGTAGAGACAGGGTTTCACCGTGTTAGCCAGGATGGTC...
...,...,...,...
worm 17041,test,1,TGTTCACTTATGTGATTGCTCTGACTTATGGACTGAAGCATGTCAA...
worm 7673,test,1,ACTTGCAGAAATAAGAACTTCTGGAGCAATCCAGCCATCGGTTCCT...
worm 29447,test,1,GTATCCTGCCTTCTTCTTTTTTCGAATGTTCTTTTTCTCCTTGTTT...
worm 12945,test,1,ATCTCGATGAATAACTTGGTGGTGGGCTGTGTCGTTTTGCATAGTT...


In [None]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the pandas frame into a Hugging Face Dataset; the frame's index is carried over
# as the '__index_level_0__' column, which the tokenization step removes.
ds = Dataset.from_pandas(df)

In [None]:
def kmers_stride1(s, k=K):
    """Return every overlapping length-`k` substring of `s`, sliding one character at a time.

    The result is empty when `s` is shorter than `k`.
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]


def tok_func(x):
    """Tokenize one row: its 'seq' string is split into space-separated k-mers first."""
    sentence = " ".join(kmers_stride1(x["seq"]))
    return tokenizer(sentence)


# Tokenize row by row, drop the raw sequence and the pandas index column, and rename
# 'cat' to 'labels'.
tok_ds = (
    ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)



  0%|          | 0/100000 [00:00<?, ?ex/s]

In [None]:
def _is_train_row(x):
    """True for rows whose 'dset' marker is 'train'."""
    return x["dset"] == "train"


def _is_test_row(x):
    """True for rows whose 'dset' marker is 'test'."""
    return x["dset"] == "test"


# Split the tokenized rows back into the benchmark's own train/test partitions.
train_rows = tok_ds.filter(_is_train_row)
test_rows = tok_ds.filter(_is_test_row)
dds = DatasetDict({'train': train_rows, 'test': test_rows})

dds

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 75000
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [None]:
# Fine-tuning hyper-parameters.
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

# Cosine schedule with 10% warm-up, evaluation after every epoch, evaluation batches
# twice the training batch size.
args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    save_steps=100000,
    report_to='none',
)

PyTorch: setting up devices


In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are computed directly with NumPy instead of reloading the GLUE/MRPC
    metric through the deprecated `load_metric` on every evaluation. The returned
    keys ("accuracy", "f1") are unchanged and F1 treats label 1 as the positive class.

    Args:
        eval_preds: pair `(logits, labels)`; the predicted class is the argmax over the
            last axis of `logits`.

    Returns:
        dict with float values under "accuracy" and "f1". "f1" is 0.0 when there are
        no true positives.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    accuracy = float(np.mean(predictions == labels))
    # F1 = 2TP / (2TP + FP + FN); guard the degenerate case with no positives at all.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

# Fine-tuning trainer: trains on the 'train' split and evaluates on the 'test' split,
# reporting the values returned by compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [None]:
# Run fine-tuning; the trailing ';' keeps the notebook from echoing the returned value.
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 75000
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9376


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1466,0.128152,0.95672,0.956982
2,0.1121,0.138691,0.95868,0.95801
3,0.0729,0.124256,0.95768,0.957006
4,0.0422,0.148993,0.95876,0.958676


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch si

## Finetuning - stride 1 - demo_coding_vs_intergenomic_seqs

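> Fine-tune the stride-1 pretrained model on the `demo_coding_vs_intergenomic_seqs` benchmark (coding vs. intergenomic sequences).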



In [None]:
from transformers import DebertaForSequenceClassification

# Reload a fresh copy of the pretrained checkpoint so this benchmark does not start from
# the weights fine-tuned in the previous section.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias

In [None]:
from pathlib import Path

# genomic_benchmarks caches its downloads under ~/.genomic_benchmarks. Resolve that folder
# from the current user's home instead of hard-coding /root, so the cell also works when
# the notebook is not run as root.
benchmark_dir = Path.home() / '.genomic_benchmarks' / 'demo_coding_vs_intergenomic_seqs'

# Maps "<class> <file stem>" -> (split, label, sequence text);
# label is 1 for coding_seqs, 0 for intergenomic_seqs.
tmp_dict = {}

for dset in ['train', 'test']:
    for c in ['coding_seqs', 'intergenomic_seqs']:
        for f in (benchmark_dir / dset / c).glob('*.txt'):
            tmp_dict[c + " " + f.stem] = (dset, int(c == "coding_seqs"), f.read_text())

In [None]:
import pandas as pd

# One row per sequence file: the dict keys become the index, the tuple fields the columns.
column_names = {0: "dset", 1: "cat", 2: "seq"}
df = pd.DataFrame.from_dict(tmp_dict).transpose().rename(columns=column_names)
df

Unnamed: 0,dset,cat,seq
coding_seqs 20927,train,1,AATCGGGAGAAGGAGGAGACTACAAGGATAGGCCCAGGAGTAATGG...
coding_seqs 28093,train,1,TGCACCCTGGACATGGTCATGGCCGGGACGGAGACGACCTCGGCCA...
coding_seqs 35804,train,1,GACAAGGATGGAATACGCCATGAAGTCCCTTAGCCTTCTCTACCCC...
coding_seqs 22502,train,1,GAGGGAAGCCCTGCAGAGCACTGCCTACCCTGAAGTGCCAGACCTT...
coding_seqs 21606,train,1,GAGCCCGGTGCGTTCCAACCGCCGCCGAAACCGGTCATCGTGGACA...
...,...,...,...
intergenomic_seqs 17041,test,0,TCCCAACCCCCATTCTTTCTGTAACTTCAAGATGGTATAAAAATGT...
intergenomic_seqs 7673,test,0,GCCCACCTCTGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCAC...
intergenomic_seqs 29447,test,0,TTTCATCCTTTAAGATAGAAGAGCTTCCTTCTTCCCATACTCTGAA...
intergenomic_seqs 12945,test,0,CCAGCATCATCCTGATACCAAAGCCTGGCAGAGACACAACCAAAAA...


In [None]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the pandas frame into a Hugging Face Dataset; the frame's index is carried over
# as the '__index_level_0__' column, which the tokenization step removes.
ds = Dataset.from_pandas(df)

In [None]:
def kmers_stride1(s, k=K):
    """Return every overlapping length-`k` substring of `s`, sliding one character at a time.

    The result is empty when `s` is shorter than `k`.
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]


def tok_func(x):
    """Tokenize one row: its 'seq' string is split into space-separated k-mers first."""
    sentence = " ".join(kmers_stride1(x["seq"]))
    return tokenizer(sentence)


# Tokenize row by row, drop the raw sequence and the pandas index column, and rename
# 'cat' to 'labels'.
tok_ds = (
    ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)

  0%|          | 0/100000 [00:00<?, ?ex/s]

In [None]:
def _is_train_row(x):
    """True for rows whose 'dset' marker is 'train'."""
    return x["dset"] == "train"


def _is_test_row(x):
    """True for rows whose 'dset' marker is 'test'."""
    return x["dset"] == "test"


# Split the tokenized rows back into the benchmark's own train/test partitions.
train_rows = tok_ds.filter(_is_train_row)
test_rows = tok_ds.filter(_is_test_row)
dds = DatasetDict({'train': train_rows, 'test': test_rows})

dds

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 75000
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [None]:
# Fine-tuning hyper-parameters.
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

# Cosine schedule with 10% warm-up, evaluation after every epoch, evaluation batches
# twice the training batch size.
args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    save_steps=100000,
    report_to='none',
)

PyTorch: setting up devices


In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are computed directly with NumPy instead of reloading the GLUE/MRPC
    metric through the deprecated `load_metric` on every evaluation. The returned
    keys ("accuracy", "f1") are unchanged and F1 treats label 1 as the positive class.

    Args:
        eval_preds: pair `(logits, labels)`; the predicted class is the argmax over the
            last axis of `logits`.

    Returns:
        dict with float values under "accuracy" and "f1". "f1" is 0.0 when there are
        no true positives.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    accuracy = float(np.mean(predictions == labels))
    # F1 = 2TP / (2TP + FP + FN); guard the degenerate case with no positives at all.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

# Fine-tuning trainer: trains on the 'train' split and evaluates on the 'test' split,
# reporting the values returned by compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [None]:
# Run fine-tuning; the trailing ';' keeps the notebook from echoing the returned value.
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 75000
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9376


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2657,0.251251,0.90068,0.90273
2,0.2218,0.254842,0.90624,0.90633
3,0.1751,0.251632,0.90188,0.904579
4,0.1428,0.273268,0.905,0.90511


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch si

## Finetuning - stride 1 - human_enhancers_cohn

In [None]:
from transformers import DebertaForSequenceClassification

# Reload a fresh copy of the pretrained checkpoint so this benchmark does not start from
# the weights fine-tuned in the previous section.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias

In [None]:
from pathlib import Path

# genomic_benchmarks caches its downloads under ~/.genomic_benchmarks. Resolve that folder
# from the current user's home instead of hard-coding /root, so the cell also works when
# the notebook is not run as root.
benchmark_dir = Path.home() / '.genomic_benchmarks' / 'human_enhancers_cohn'

# Maps "<class> <file stem>" -> (split, label, sequence text);
# label is 1 for positive, 0 for negative.
tmp_dict = {}

for dset in ['train', 'test']:
    for c in ['positive', 'negative']:
        for f in (benchmark_dir / dset / c).glob('*.txt'):
            tmp_dict[c + " " + f.stem] = (dset, int(c == "positive"), f.read_text())

In [None]:
import pandas as pd

# One row per sequence file: the dict keys become the index, the tuple fields the columns.
column_names = {0: "dset", 1: "cat", 2: "seq"}
df = pd.DataFrame.from_dict(tmp_dict).transpose().rename(columns=column_names)
df

Unnamed: 0,dset,cat,seq
positive train_positive_7109,train,1,CTCGTCTGGTGGATTTGTACCTGAGAAGGGATGTTTCGGGAGGGAT...
positive train_positive_4915,train,1,CAGAGGGAAACACGAAGGCCAAAACAGCAAAGGCCAGGGAGGCAAG...
positive train_positive_2023,train,1,CTCCTTCTGTGGAAGTTCTGGGTCTCTTACGCATCCTCCAGTTCCC...
positive train_positive_7775,train,1,AAGTTAAAATAAATAAAAAATTAATGGATAAATTATTTTCCTGCGA...
positive train_positive_8539,train,1,TTCTTGCATAACTTTTAAATATTTACATGTACGGCGTGTGGGTCTC...
...,...,...,...
negative test_negative_3056,test,0,GTAACATTTTAATGTTATTTTCTTTATCGTACATTCACCAGTGAGT...
negative test_negative_1766,test,0,GTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTAATCCACCCA...
negative test_negative_1612,test,0,GCTGAGTCAGGATAATTGCTTGAACTCGGGAGGCAGAGGTTGCAAT...
negative test_negative_633,test,0,AAGACCCTCTAGTGACTATCCACCATGTTTAGAACAAAATCCAAAC...


In [None]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the pandas frame into a Hugging Face Dataset; the frame's index is carried over
# as the '__index_level_0__' column, which the tokenization step removes.
ds = Dataset.from_pandas(df)

In [None]:
def kmers_stride1(s, k=K):
    """Return every overlapping length-`k` substring of `s`, sliding one character at a time.

    The result is empty when `s` is shorter than `k`.
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]


def tok_func(x):
    """Tokenize one row: its 'seq' string is split into space-separated k-mers first."""
    sentence = " ".join(kmers_stride1(x["seq"]))
    return tokenizer(sentence)


# Tokenize row by row, drop the raw sequence and the pandas index column, and rename
# 'cat' to 'labels'.
tok_ds = (
    ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)

  0%|          | 0/27791 [00:00<?, ?ex/s]

In [None]:
def _is_train_row(x):
    """True for rows whose 'dset' marker is 'train'."""
    return x["dset"] == "train"


def _is_test_row(x):
    """True for rows whose 'dset' marker is 'test'."""
    return x["dset"] == "test"


# Split the tokenized rows back into the benchmark's own train/test partitions.
train_rows = tok_ds.filter(_is_train_row)
test_rows = tok_ds.filter(_is_test_row)
dds = DatasetDict({'train': train_rows, 'test': test_rows})

dds

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20843
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6948
    })
})

In [None]:
# Fine-tuning hyper-parameters.
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

# Cosine schedule with 10% warm-up, evaluation after every epoch, evaluation batches
# twice the training batch size.
args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    save_steps=100000,
    report_to='none',
)

PyTorch: setting up devices


In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are computed directly with NumPy instead of reloading the GLUE/MRPC
    metric through the deprecated `load_metric` on every evaluation. The returned
    keys ("accuracy", "f1") are unchanged and F1 treats label 1 as the positive class.

    Args:
        eval_preds: pair `(logits, labels)`; the predicted class is the argmax over the
            last axis of `logits`.

    Returns:
        dict with float values under "accuracy" and "f1". "f1" is 0.0 when there are
        no true positives.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    accuracy = float(np.mean(predictions == labels))
    # F1 = 2TP / (2TP + FP + FN); guard the degenerate case with no positives at all.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

# Fine-tuning trainer: trains on the 'train' split and evaluates on the 'test' split,
# reporting the values returned by compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [None]:
# Run fine-tuning; the trailing ';' keeps the notebook from echoing the returned value.
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20843
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2608


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.598,0.57334,0.679188,0.594801
2,0.5263,0.538844,0.728699,0.739461
3,0.4617,0.627567,0.716177,0.717073
4,0.3038,0.684559,0.717617,0.722411


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6948
  Batch size 

## Finetuning - stride 1 - human_nontata_promoters

In [None]:
from transformers import DebertaForSequenceClassification

# Reload a fresh copy of the pretrained checkpoint so this benchmark does not start from
# the weights fine-tuned in the previous section.
model_cls = DebertaForSequenceClassification.from_pretrained("model_stride1")

loading configuration file model_stride1/config.json
Model config DebertaConfig {
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file model_stride1/pytorch_model.bin
Some weights of the model checkpoint at model_stride1 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias

In [None]:
from pathlib import Path

# genomic_benchmarks caches its downloads under ~/.genomic_benchmarks. Resolve that folder
# from the current user's home instead of hard-coding /root, so the cell also works when
# the notebook is not run as root.
benchmark_dir = Path.home() / '.genomic_benchmarks' / 'human_nontata_promoters'

# Maps "<class> <file stem>" -> (split, label, sequence text);
# label is 1 for positive, 0 for negative.
tmp_dict = {}

for dset in ['train', 'test']:
    for c in ['positive', 'negative']:
        for f in (benchmark_dir / dset / c).glob('*.txt'):
            tmp_dict[c + " " + f.stem] = (dset, int(c == "positive"), f.read_text())

In [None]:
import pandas as pd

# One row per sequence file: the dict keys become the index, the tuple fields the columns.
column_names = {0: "dset", 1: "cat", 2: "seq"}
df = pd.DataFrame.from_dict(tmp_dict).transpose().rename(columns=column_names)
df

Unnamed: 0,dset,cat,seq
positive FP003308,train,1,CCCACGACACGACCCCATGCCGCCCGCAGGGCGCCCCGGGGCTCGC...
positive FP005480,train,1,CTTTCTTTAATCCAGTAGACAATGAGAAACGCTGATTTGGGTCTAT...
positive FP018903,train,1,AGCTTGCAGTGAGCCGAGATAGCGCCATTGCACTCCAGCCTGGGCG...
positive FP001193,train,1,TTGCTCAGGGAAACTTTTCGGTTCTGCGACGCATGCGTTCAGCCTC...
positive FP017880,train,1,CTCAGAGACCAAGGAAGGAAGGAAGACCTGAAGACTATCTGTTGGA...
...,...,...,...
negative 6621,test,0,CCTCTGTAGAACATTCATATTAGGTTGGTGCAAAAGTACTTTCAAT...
negative 10604,test,0,GGGCCCCCTATGCGCTATGGAGAGAGTTCCTCTTCTCCGTTCCCGC...
negative 15597,test,0,AAGCATGCAGAGAATTCCGGGGAAGGTCAAGAGCAACCAGAGAGTA...
negative 13493,test,0,TTCTCCTTCTCTCAATTTCTGAATCTTTGAATAGTATGTTGTTTGT...


In [None]:
from datasets import Dataset, DatasetDict, load_metric

# Convert the pandas frame into a Hugging Face Dataset; the frame's index is carried over
# as the '__index_level_0__' column, which the tokenization step removes.
ds = Dataset.from_pandas(df)

In [None]:
def kmers_stride1(s, k=K):
    """Return every overlapping length-`k` substring of `s`, sliding one character at a time.

    The result is empty when `s` is shorter than `k`.
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]


def tok_func(x):
    """Tokenize one row: its 'seq' string is split into space-separated k-mers first."""
    sentence = " ".join(kmers_stride1(x["seq"]))
    return tokenizer(sentence)


# Tokenize row by row, drop the raw sequence and the pandas index column, and rename
# 'cat' to 'labels'.
tok_ds = (
    ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
      .rename_columns({'cat': 'labels'})
)

  0%|          | 0/36131 [00:00<?, ?ex/s]

In [None]:
def _is_train_row(x):
    """True for rows whose 'dset' marker is 'train'."""
    return x["dset"] == "train"


def _is_test_row(x):
    """True for rows whose 'dset' marker is 'test'."""
    return x["dset"] == "test"


# Split the tokenized rows back into the benchmark's own train/test partitions.
train_rows = tok_ds.filter(_is_train_row)
test_rows = tok_ds.filter(_is_test_row)
dds = DatasetDict({'train': train_rows, 'test': test_rows})

dds

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27097
    })
    test: Dataset({
        features: ['dset', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9034
    })
})

In [None]:
# Fine-tuning hyper-parameters.
bs = 32
epochs = CLS_EPOCHS
lr = 8e-5

# Cosine schedule with 10% warm-up, evaluation after every epoch, evaluation batches
# twice the training batch size.
args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    save_steps=100000,
    report_to='none',
)

PyTorch: setting up devices


In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are computed directly with NumPy instead of reloading the GLUE/MRPC
    metric through the deprecated `load_metric` on every evaluation. The returned
    keys ("accuracy", "f1") are unchanged and F1 treats label 1 as the positive class.

    Args:
        eval_preds: pair `(logits, labels)`; the predicted class is the argmax over the
            last axis of `logits`.

    Returns:
        dict with float values under "accuracy" and "f1". "f1" is 0.0 when there are
        no true positives.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    accuracy = float(np.mean(predictions == labels))
    # F1 = 2TP / (2TP + FP + FN); guard the degenerate case with no positives at all.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

# Fine-tuning trainer: trains on the 'train' split and evaluates on the 'test' split,
# reporting the values returned by compute_metrics.
trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

Using cuda_amp half precision backend


In [45]:
# Run fine-tuning; the trailing ';' keeps the notebook from echoing the returned value.
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27097
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3388


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4229,0.30417,0.868718,0.875551
2,0.2505,0.271022,0.887979,0.899303
3,0.1437,0.287203,0.904251,0.90793


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset. If dset are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9034
  Batch size 

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4229,0.30417,0.868718,0.875551
2,0.2505,0.271022,0.887979,0.899303
3,0.1437,0.287203,0.904251,0.90793
4,0.0851,0.384776,0.899269,0.90659




Training completed. Do not forget to share your model on huggingface.co/models =)


