# GENA-LM Sequence classification example

## Install requirements

In [1]:
! pip install torch --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
! pip install transformers[torch] datasets

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from tra

In [None]:
import torch
torch.cuda.is_available()

True

## Get pre-trained GENA-LM model
The classification head will be randomly initialized.

Table with available models:
https://drive.google.com/uc?export=view&id=1R2LF4POMcbMgla0J31ttrVHzqT624dlh

### Pre-trained GENA-LM for Masked Language Modeling

In [3]:
from transformers import AutoTokenizer, AutoModel

# A single Hub checkpoint id is used for both the tokenizer and the weights.
checkpoint = 'AIRI-Institute/gena-lm-bert-base-t2t'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# trust_remote_code=True allows transformers to download and execute the
# checkpoint's own modeling_bert.py (see the download log below).
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/46.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading (…)ain/modeling_bert.py:   0%|          | 0.00/97.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/AIRI-Institute/gena-lm-bert-base-t2t:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/541M [00:00<?, ?B/s]

### Pre-trained GENA-LM to finetune on sequence classification task

#### with HuggingFace

In [4]:
gena_module_name = model.__class__.__module__
print(gena_module_name)

transformers_modules.AIRI-Institute.gena-lm-bert-base-t2t.21343b983208dd7bd3430f5a0d812ab6131faa7d.modeling_bert


In [5]:
import importlib
# Class names that can be requested here:
# - BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
# - BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
# - BertForQuestionAnswering
# see https://huggingface.co/docs/transformers/model_doc/bert
# Import the dynamically downloaded module by the name recorded above, then
# pick the sequence-classification head from it.
gena_module = importlib.import_module(gena_module_name)
cls = gena_module.BertForSequenceClassification
cls

transformers_modules.AIRI-Institute.gena-lm-bert-base-t2t.21343b983208dd7bd3430f5a0d812ab6131faa7d.modeling_bert.BertForSequenceClassification

In [6]:
model = cls.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t', num_labels=2)
model.classifier

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at AIRI-Institute/gena-lm-bert-base-t2t and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=2, bias=True)

#### cloning the GENA-LM repo




In [None]:
! git clone https://github.com/AIRI-Institute/GENA_LM.git
! cd GENA_LM/src/gena_lm

Cloning into 'GENA_LM'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 58 (delta 14), reused 42 (delta 6), pack-reused 0[K
Unpacking objects: 100% (58/58), 21.52 MiB | 8.07 MiB/s, done.


Alternatively, skip cloning and download only `modeling_bert.py` from https://github.com/AIRI-Institute/GENA_LM/tree/main/src/gena_lm

In [None]:
! wget https://raw.githubusercontent.com/AIRI-Institute/GENA_LM/main/src/gena_lm/modeling_bert.py

--2023-07-06 07:35:15--  https://raw.githubusercontent.com/AIRI-Institute/GENA_LM/main/src/gena_lm/modeling_bert.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94428 (92K) [text/plain]
Saving to: ‘modeling_bert.py’


2023-07-06 07:35:16 (6.61 MB/s) - ‘modeling_bert.py’ saved [94428/94428]



In [None]:
from modeling_bert import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t', num_labels=2)
model = model.cuda()
model.classifier

Some weights of the model checkpoint at AIRI-Institute/gena-lm-bert-base-t2t were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at AIRI-Ins

Linear(in_features=768, out_features=2, bias=True)

## Get sequence classification dataset

In [19]:
import pandas as pd
import os
from datasets import Dataset

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [17]:
main_path = 'gdrive/MyDrive/papers/bio/gena-lm/'
fph = 'uterus_1_4000.csv'

In [20]:
df = pd.read_csv(os.path.join(main_path, fph))

In [25]:
df = df[['dna_seq', 'label']]
df['label'].value_counts()

0    6346
1    6333
Name: label, dtype: int64

In [26]:
df = df.rename(columns={"dna_seq": "sequence"})

In [31]:
# Fix the seed so the train/test membership (and every number reported
# below) is reproducible across kernel restarts.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

In [32]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sequence', 'label'],
        num_rows: 10143
    })
    test: Dataset({
        features: ['sequence', 'label'],
        num_rows: 2536
    })
})

In [36]:
dataset['train'][0]

{'sequence': 'AGGAGTGTATTTAAAGTTGATGTAGAAAGCGTGGGTAGCGTGTGAGTGTTTAGGAGAAGCGCATTGGCGTTGGGAGACCACTAGCTCCAAGTCAGTTCTTGCTCTCTCAGTATTACCGTGTGGCCTTGGGCATCTCACTCTTTTCAACAGTCCCTTTAGGCTTGATTCTCTACAAGCTTGAATTCAATTTTTCAAAAATTTAAAAATTTTGCCCAGGTTGATCTTAAACCCCTGGCCTCAAACCTCCCACCTTGGCCTCCCAAAGTGCTGGGATTATAGGTCTGAGCCACCATGTCTCACCTGAATTCAATTTTTAAAATTGGGCATAATAAAAGAGATCACTTGAGACGATTCTATATTGTTGTAGGATCATCACAGACATAGGTATTTGTTTAATGGCTGCCTCCTCAACCAGCCTGTAAGCTCTGTGAAGACAGGAGCTGTGCCTGTCTCTAAGAGCAGGTACTTGCTGGATACCTAGTTGTTTTTTGATGATCACAGACATAACCGTGTTAGTAGGGTTCACACTTAGGGGTTTCATGTTGAGTGGATGGACCCTGGAAAAGGACTGCTTGGCTCTCTCATTTGTGTTACCTACCTAGGGCCAAGTTCCTTAACCTTTTGTTGCCTTTGTTTCCTGATGGAAATAAAAGAGGAATAATGACAGTCATAACAGTAAGTCTATCTCATAGGATTGCTTTGAGGATTATGTAAAATGTTATATAAAGTCCTTGGAACAGAGCTATCCTTTGTGTATGATTATTGGTGGTGGTGCTGGTGCTGCAGTTATAATAGATGCTCTGACATCCTCTTCCTAGTGGCTTTTTACTTATTTTTTTTATTTTTTAAAGGGGCTGATGCTGTGATACATGAAATGCTTGTCTTTCCTTCATGGCAGAACAAACTGAATCCTTGGCTCAACTTCATTCCTTGTGTCATAACAAGGCTCCTTCACACTCAGTCTCTTGGGAACCAACTTTCAGCCA

In [35]:
print('# base pairs: ', len(dataset['train'][0]['sequence']))

# base pairs:  3999


In [33]:
print('tokens: ', ' '.join(tokenizer.tokenize(dataset['train'][0]['sequence'])))

tokens:  AGGAGTG TATTTAA AGTTG ATGTAG AAAGCG TGGG TAGCG TGTGAGTG TTTAGG AGAAGCGC ATTGGCG TTGGG AGACCAC TAGC TCCAAG TCAG TTCTTGC TCTC TCAGTATT ACCGTG TGGCC TTGGGC ATCTCAC TCTTTTC AAC AGTCCC TTTAGGC TTGATTC TCTAC AAGCTTG AATTC AATTTT TCAAAA ATTTAAAA ATTTTGCCC AGGTTG ATCTTAA ACCCC TGGCC TCAA ACCTCCC ACC TTGGCCTCCCAAAGTGC TGGGATTATAGG TCTGAGCC ACCATG TCTCACC TGAATTC AATTTTTAAAA TTGGGC ATAATAAAAG AGATCACTTG AGACG ATTC TATATTG TTGTAGG ATCATCAC AGAC ATAGGTATT TGTTTAA TGGCTGCC TCCTC AACCAGCC TGTAAGC TCTGTG AAGACAGG AGCTGTGCC TGTCTC TAAG AGCAGG TAC TTGCTGG ATACC TAGTTG TTTTTTG ATGATCAC AGAC ATAACC GTG TTAG TAGGG TTCACAC TTAGGGG TTTC ATGTTG AGTGG ATGG ACCCTGG AAAAGG ACTGC TTGGCTCTC TCATTTGTG TTACC TACC TAGGGCC AAGTTCC TTAACC TTTTGTTGCC TTTGTTTCC TGATGG AAATAAAAG AGGAATAA TGACAGTC ATAAC AGTAAG TCTATC TCATAGG ATTGCTTTG AGGATT ATGTAA AATG TTATATAA AGTCC TTGGAAC AGAGC TATCC TTTGTG TATG ATTATTGG TGGTGG TGCTGG TGCTGC AGTT ATAATAG ATGCTCTG ACATCC TCTTCC TAGTGGC TTTTTAC TTATTTTTT TTATTTTTT AAAGGGGC TGAT

In [34]:
print('# tokens: ', len(tokenizer.tokenize(dataset['train'][0]['sequence'])))

# tokens:  576


### Dataset preprocessing
This follows the Hugging Face text classification guide: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [44]:
def preprocess_function(examples, max_length=512):
  """Tokenize a batch of examples for sequence classification.

  Args:
    examples: batch passed by `Dataset.map(..., batched=True)`; its
      "sequence" entry is handed to the tokenizer.
    max_length: maximum number of tokens kept per sequence. Defaults to 512,
      the value this notebook trains with.

  Returns:
    The tokenizer output for the batch (unpadded; padding is left to the
    data collator).
  """
  # just truncate right, but for some tasks symmetric truncation from left and right is more reasonable
  return tokenizer(examples["sequence"], truncation=True, max_length=max_length)

In [45]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10143 [00:00<?, ? examples/s]

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

Now create batches of examples using `DataCollatorWithPadding`. It’s more efficient to dynamically pad the sequences to the longest length in each batch during collation than to pad the whole dataset to the maximum length.

In [46]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sequence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10143
    })
    test: Dataset({
        features: ['sequence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2536
    })
})

## Training

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    `eval_pred` unpacks into (per-class scores, true labels); the predicted
    class is the arg-max over axis 1 of the scores.
    Returns a dict with the single key 'accuracy'.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    n_correct = (predicted_classes == labels).sum()
    return {'accuracy': n_correct / len(labels)}

# Change the training hyperparameters below to achieve better quality.
# NOTE(review): in the run recorded under this cell, accuracy stays at ~0.5 on a
# roughly balanced dataset, i.e. chance level - treat these values as a
# starting point, not as tuned settings.
training_args = TrainingArguments(
    # checkpoints and logs go to <main_path>/test_run
    output_dir=os.path.join(main_path, "test_run"),
    learning_rate=1e-4,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim='adamw_torch',
    weight_decay=0.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # evaluate, checkpoint and log once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # No metric_for_best_model is set, so "best" follows the library default
    # (presumably evaluation loss - TODO confirm).
    load_best_model_at_end=True
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7127,0.69355,0.491325
2,0.7081,0.704086,0.508675
3,0.7088,0.693467,0.508675




In [None]:
# (Stray scratch input removed: it referenced an undefined name, raised
# NameError and stopped "Restart & Run All" before the prediction section.)

## Get predictions from model on single example

In [None]:
# Index the row first: dataset['test']['sequence'] would load the whole
# column just to read a single element.
sample = dataset['test'][0]
x, y = sample['sequence'], sample['label']

In [None]:
# Tokenize the same way as preprocess_function did for training: sequences in
# this dataset can tokenize to more than 512 tokens (576 in the example above),
# so truncate to the length the model was fine-tuned on.
x_feat = tokenizer(x, truncation=True, max_length=512, return_tensors='pt')
x_feat.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Move the inputs to whatever device the model is on, instead of assuming
# CUDA: this also works on CPU-only machines and keeps inputs and weights
# on the same device.
x_feat = x_feat.to(model.device)

# eval() switches the model to inference mode; no_grad() skips gradient tracking.
model = model.eval()
with torch.no_grad():
  out = model(**x_feat)
out



SequenceClassifierOutput(loss=None, logits=tensor([[-1.4048, -0.0459]], device='cuda:0'), hidden_states=None, attentions=None)

In [None]:
# get class probabilities
prob = torch.softmax(out['logits'], dim=-1)
prob

tensor([[0.2044, 0.7956]], device='cuda:0')

In [None]:
# get label
# .item() converts the one-element tensor to a plain int; formatting the tensor
# itself would print e.g. "tensor(1, device='cuda:0')" instead of "1".
print(f'prediction: {torch.argmax(prob, dim=-1).item()}, label: {y}')

prediction: 1, label: 1
