In [1]:
# !pip install transformers datasets comet-ml --yes

In [2]:
import os

commet_key = "EpKIINrla6U4B4LJhd9Sv4i0b"

os.environ['COMET_API_KEY'] = commet_key

import comet_ml

# Commet Init
comet_ml.init(project_name="DNA_finetuning", api_key= commet_key)

import torch 
import datasets 
import numpy as np
from datasets import Dataset, DatasetDict, load_metric
from huggingface_hub import notebook_login
from pathlib import Path
import pandas as pd
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters
from genomic_benchmarks.loc2seq import download_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback


COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [3]:
from genomic_benchmarks.data_check import list_datasets

# config
epochs = 10
model_name = "DNADeberta_fine"
dataset_name = "human_nontata_promoters"


In [4]:
print(torch.cuda.get_device_name(0))

NVIDIA A40


In [5]:
# notebook_login()

In [6]:
download_dataset(dataset_name, version=0)

Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /home/jovyan/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:00<00:00, 55.6MB/s]


PosixPath('/home/jovyan/.genomic_benchmarks/human_nontata_promoters')

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

In [8]:
def kmers(s, k=6):
    return [s[i:i + k] for i in range(0, len(s), k) if i + k <= len(s)]

In [9]:
tmp_dict = {}

for dset in ['train', 'test']:
    for c in ['negative', 'positive']:
        for f in Path(f'../.genomic_benchmarks/{dataset_name}/{dset}/{c}/').glob('*.txt'):
            txt = f.read_text()
            tmp_dict[f.stem] = (dset, int(c == "positive"), txt)

In [10]:

df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})


In [11]:
train_valid_split = df.query("dset == 'train'").shape[0] // 100 * 80
print(df.query("dset == 'train'").shape[0], train_valid_split)

27097 21600


In [12]:
train_df = df[df['dset']=='train'].iloc[:train_valid_split,:]
valid_df = df[df['dset']=='train'].iloc[train_valid_split:,:]
test_df = df[df['dset']=='test']

datasets = [train_df, valid_df, test_df]

In [13]:
print(datasets)

[           dset cat                                                seq
13497     train   0  CCATTGTAAGCAAAATGGATTATGAAAATTAATTTTACACAGGAAA...
4964      train   0  TTCACATCAATTGCTGCTTCAGGGATCACAGATTTTAGGGGCTCAT...
14233     train   0  GGAGTCAGAATATCCTGTTCCTCAACAGACTCTTTTACCTAGTGGT...
3266      train   0  GCTACTTGGTGAACTCTGGCATTGTTCCCATCTCGAGAAGTCTCAT...
8629      train   0  GAGCCTTCATTCTTGGTCAAGCTTTAGGCACATCTGAGTGAGTAGT...
...         ...  ..                                                ...
FP001814  train   1  TATTTTATTGTTTCTGTATTCTTGTATGGTTGACTTTGAGTGATTC...
FP013152  train   1  CAGTCGACTGCAGAGACGACCGCGGTAGGTTTTTCAACCCGGACTC...
FP008951  train   1  TCTTATTGACATGCGTCAACGCGAGCTTGCGCTCAATAGCTATTTG...
FP000238  train   1  CTCCAAGACGCGAGTCACCGGTACGAAGCCACAGCCATTTCGCTGC...
FP014700  train   1  TGATGGGGCCGGCGGGGCGGGGTGGGCGCTCCCGGAGGCGGTCCGG...

[21600 rows x 3 columns],            dset cat                                                seq
FP018433  train   1  GCCCGGCCCCAGCCGCCTGCGACTCGCT

In [14]:
datasets = [Dataset.from_pandas(x) for x in datasets]

In [15]:
print(datasets)

[Dataset({
    features: ['dset', 'cat', 'seq', '__index_level_0__'],
    num_rows: 21600
}), Dataset({
    features: ['dset', 'cat', 'seq', '__index_level_0__'],
    num_rows: 5497
}), Dataset({
    features: ['dset', 'cat', 'seq', '__index_level_0__'],
    num_rows: 9034
})]


In [16]:
def tok_func(x): return tokenizer(" ".join(kmers(x["seq"])))

datasets = [x.map(tok_func, batched=False).rename_columns({'cat':'labels'}) for x in datasets]



  0%|          | 0/21600 [00:00<?, ?ex/s]

  0%|          | 0/5497 [00:00<?, ?ex/s]

  0%|          | 0/9034 [00:00<?, ?ex/s]

In [17]:
dds = DatasetDict({
    'train': datasets[0],
    'validation': datasets[1],
    'test':  datasets[2],
})

## 1) Fine-tuning

In [18]:

model = AutoModelForSequenceClassification.from_pretrained("simecek/DNADeberta", num_labels=2)

training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=epochs,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=2,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=32,  # evaluation batch size
    logging_steps=20,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=200,
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=3,           # whether you don't have much space so you let only 5 model weights saved in the disk
# There was an error with some recursion call for push_to_hub == True
    push_to_hub=False,
    # hub_model_id="DNADeberta_fine",
    # hub_strategy="every_save"
)

def compute_metrics(eval_preds):
    # metric = load_metric("accuracy", "f1")
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dds['train'],
    eval_dataset=dds['validation'],
    tokenizer=tokenizer, 
    compute_metrics=compute_metrics,
#     early_stopping_patience - considers evaluation calls (for us, steps at the moment)
    callbacks=[EarlyStoppingCallback(early_stopping_patience = 5, early_stopping_threshold = 0.02)],
)

trainer.train()
model.push_to_hub(model_name + dataset_name)


Some weights of the model checkpoint at simecek/DNADeberta were not used when initializing DebertaForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at simecek/DNADeb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Accuracy,F1
20,0.4655,0.539456,0.679462,0.809142
40,0.4518,0.574409,0.697471,0.821777
60,0.4191,0.579892,0.689831,0.81645
80,0.4017,0.416626,0.73913,0.85
100,0.4227,0.51711,0.724213,0.840051
120,0.3982,0.55766,0.668546,0.801352
140,0.3904,0.68553,0.695652,0.820513
160,0.4129,0.475581,0.748408,0.856102
180,0.395,0.427466,0.758959,0.862964
200,0.3784,0.426198,0.771148,0.870789


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset, __index_level_0__, seq. If dset, __index_level_0__, seq are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5497
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset, __index_level_0__, seq. If dset, __index_level_0__, seq are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5497
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset, __index_level_0__, seq. If dset, __index_level_0__, seq are not expected by `De

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/davidcechak/DNADeberta_finehuman_nontata_promoters into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Configuration saved in DNADeberta_finehuman_nontata_promoters/config.json
Model weights saved in DNADeberta_finehuman_nontata_promoters/pytorch_model.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/340M [00:00<?, ?B/s]

To https://huggingface.co/davidcechak/DNADeberta_finehuman_nontata_promoters
   0d92a13..89d7670  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/davidcechak/DNADeberta_finehuman_nontata_promoters/commit/89d76703081c8e0fecb6a1999b96370c445e57e3'

In [19]:
eval_metrics = trainer.evaluate()
print(eval_metrics)

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset, __index_level_0__, seq. If dset, __index_level_0__, seq are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5497
  Batch size = 32


{'eval_loss': 0.4261977970600128, 'eval_accuracy': 0.7711478988539203, 'eval_f1': 0.8707888249794578, 'eval_runtime': 2.9975, 'eval_samples_per_second': 1833.892, 'eval_steps_per_second': 57.382, 'epoch': 0.89}


In [20]:
predictions = trainer.predict(dds['test'])
print(predictions.metrics)

The following columns in the test set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: dset, __index_level_0__, seq. If dset, __index_level_0__, seq are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 9034
  Batch size = 32


{'test_loss': 0.41631343960762024, 'test_accuracy': 0.8245516936019482, 'test_f1': 0.8277361156396044, 'test_runtime': 4.8137, 'test_samples_per_second': 1876.71, 'test_steps_per_second': 58.79}


In [21]:
metric = load_metric("f1", "accuracy")
test_f1 = metric.compute(predictions = np.argmax(predictions.predictions, axis=-1), references = dds['test']['labels'])
print(test_f1)

metric = load_metric("accuracy", "f1")
test_acc = metric.compute(predictions = np.argmax(predictions.predictions, axis=-1), references = dds['test']['labels'])
print(test_acc)

return test_f1, test_acc

{'f1': 0.8277361156396044}
{'accuracy': 0.8245516936019482}


SyntaxError: 'return' outside function (1854054083.py, line 9)