<a href="https://colab.research.google.com/github/ML-Bioinfo-CEITEC/cDNA-pretraining/blob/main/experiments/genomic_benchmarks/Metrics_on_genomic_benchmarks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install -qq transformers genomic-benchmarks datasets

In [13]:
### Parameters
MODEL_NAME = "simecek/DNADeberta2"
TOKENIZER_NAME = "armheb/DNA_bert_6"
K = 6
STRIDE = K

# if less than 1, only this fraction of each dataset is used
DATASET_THINING = 0.01

BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/' (for INFRA HUB)
DATASETS = [("human_nontata_promoters", 0), ('demo_human_or_worm', 0)]
#from genomic_benchmarks.data_check import list_datasets
#list_datasets()

BATCH_SIZE = 32
LEARNING_RATE = 8e-5
EPOCHS = 4
RUNS = 3

OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/experiment1.csv'

## Download benchmark datasets and tokenizer

In [3]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path

for dataset_name, dataset_version in DATASETS:
    if not is_downloaded(dataset_name):
        download_dataset(dataset_name, version=dataset_version)

benchmark_root = Path(BENCHMARKS_FOLDER)

  from tqdm.autonotebook import tqdm
Downloading...
From: https://drive.google.com/uc?id=1JW0-eTB-rJXvFcglqBo3pFZi1kyIWC3X
To: /root/.genomic_benchmarks/demo_human_or_worm.zip
100%|██████████| 28.9M/28.9M [00:00<00:00, 267MB/s]


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [5]:
def kmers_strideK(s, k=K):
    return [s[i:i + k] for i in range(0, len(s), k) if i + k <= len(s)]

def kmers_stride1(s, k=K):
    return [s[i:i + k] for i in range(0, len(s)-k+1)]

if (STRIDE == 1):
  kmers = kmers_stride1
else:
  kmers = kmers_strideK

# function used for the actual tokenization
def tok_func(x): return tokenizer(" ".join(kmers(x["seq"])))

# example
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})    

{'input_ids': [2, 501, 835, 650, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [7]:
import pandas as pd
import numpy as np
from random import random
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
from tqdm.autonotebook import tqdm

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):
    

    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])
    model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))

    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                txt = f.read_text()
                if not DATASET_THINING or DATASET_THINING==1:
                    tmp_dict[f"{label} {f.stem}"] = (split, nlabel, txt)
                elif random() < DATASET_THINING:
                    tmp_dict[f"{label} {f.stem}"] = (split, nlabel, txt)

    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    for _ in range(RUNS):

        args = TrainingArguments('outputs', learning_rate=LEARNING_RATE, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE*2,
            num_train_epochs=EPOCHS, weight_decay=0.01, report_to='none')
        
        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()
        
        max_accuracy = max([x['eval_accuracy'] for x in trainer.state.log_history if 'eval_accuracy' in x])
        max_f1 = max([x['eval_f1'] for x in trainer.state.log_history if 'eval_f1' in x])
        train_runtime = max([x['train_runtime'] for x in trainer.state.log_history if 'train_runtime' in x])
        
        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))




  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/simecek/DNADeberta2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/eaa84554db67bd0bd1237534157006a6e5a26745104c342ca1a8117c146aa517.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADeberta2",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1

  0%|          | 0/374 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 277
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.529649,0.773196,0.796296
2,No log,0.606618,0.731959,0.775862
3,No log,0.536046,0.752577,0.785714
4,No log,0.535658,0.752577,0.785714


***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 277
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.162058,0.597938,0.70229
2,No log,1.409281,0.680412,0.747967
3,No log,1.100343,0.752577,0.76
4,No log,1.10559,0.762887,0.780952


***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 277
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.29088,0.752577,0.75
2,No log,1.626929,0.731959,0.734694
3,No log,1.923033,0.742268,0.782609
4,No log,1.72897,0.731959,0.763636


***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64
***** Running Evaluation *****
  Num examples = 97
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADeberta2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/eaa84554db67bd0bd1237534157006a6e5a26745104c342ca1a8117c146aa517.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADeberta2",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "ma

  0%|          | 0/1005 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 751
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 96


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.306993,0.877953,0.878431
2,No log,0.289849,0.905512,0.907692
3,No log,0.297633,0.929134,0.932836
4,No log,0.328741,0.92126,0.924242


***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 751
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 96


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.491348,0.905512,0.911111
2,No log,0.559515,0.901575,0.911661
3,No log,0.521503,0.905512,0.904762
4,No log,0.527354,0.913386,0.913386


***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 751
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 96


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.931633,0.901575,0.90566
2,No log,0.687983,0.897638,0.905109
3,No log,0.659086,0.913386,0.913386
4,No log,0.675422,0.913386,0.913386


***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64
***** Running Evaluation *****
  Num examples = 254
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [8]:
outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,human_nontata_promoters,0.773196,0.796296,4.2198
1,human_nontata_promoters,0.762887,0.780952,4.2155
2,human_nontata_promoters,0.752577,0.782609,4.3126
3,demo_human_or_worm,0.929134,0.932836,12.1721
4,demo_human_or_worm,0.913386,0.913386,8.8587
5,demo_human_or_worm,0.913386,0.913386,9.8484


In [9]:
outputs_df.groupby('dataset').agg({'accuracy' : ['mean', 'sem'], 'f1' : ['mean','sem'], 'train_runtime': ['mean', 'sem']})

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_human_or_worm,0.918635,0.005249,0.919869,0.006483,10.293067,0.981996
human_nontata_promoters,0.762887,0.005952,0.786619,0.004862,4.2493,0.031674


In [14]:
outputs_df.to_csv(OUTPUT_PATH)