In [1]:
!pip install -qq transformers datasets Bio

[K     |████████████████████████████████| 4.2 MB 4.0 MB/s 
[K     |████████████████████████████████| 346 kB 57.0 MB/s 
[K     |████████████████████████████████| 270 kB 65.7 MB/s 
[K     |████████████████████████████████| 596 kB 58.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 23.5 MB/s 
[K     |████████████████████████████████| 86 kB 4.2 MB/s 
[K     |████████████████████████████████| 86 kB 6.9 MB/s 
[K     |████████████████████████████████| 212 kB 58.2 MB/s 
[K     |████████████████████████████████| 140 kB 25.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.4 MB/s 
[K     |████████████████████████████████| 127 kB 62.8 MB/s 
[K     |████████████████████████████████| 2.3 MB 57.5 MB/s 
[K     |████████████████████████████████| 271 kB 54.7 MB/s 
[K     |████████████████████████████████| 94 kB 1.8 MB/s 
[K     |████████████████████████████████| 144 kB 54.6 MB/s 
[K     |████████████████████████████████| 112 kB 48.2 MB/s 
[31mERROR: pip's dependency re

In [2]:
def kmers(s, k=6):
    """Split `s` into consecutive, non-overlapping chunks of length `k`.

    Chunks start at offsets 0, k, 2k, ...; a trailing remainder shorter than
    `k` is dropped, so an input shorter than `k` yields an empty list.
    """
    last_start = len(s) - k  # highest offset at which a full k-mer still fits
    chunks = []
    for start in range(0, len(s), k):
        if start > last_start:
            break
        chunks.append(s[start:start + k])
    return chunks

# Sanity check: 8 nt with k=3 gives two 3-mers; the 2-nt remainder is dropped.
kmers("ACTGACTA", 3)


"ACACACACACACAC"
"ACTGACAGATTAGA"

'ACTGACAGATTAGA'

In [4]:
import pandas as pd

# The Drive file id is the second-to-last path segment of the sharing link;
# it is spliced into the uc?export=download form so pandas can read the CSV.
url = 'https://drive.google.com/file/d/1rTNHSVJUM5tV4TFW9vqV4rlF08tjCC2u/view?usp=sharing'
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"
df = pd.read_csv(path)
df

Unnamed: 0,id,chr,random_start,random_end,seq
0,ENST00000552870,12,116,315,AAAACAAGGACTGCAGCCTAAATTCCAAATACCAGAGACTGAAATT...
1,ENST00000496961,1,2645,2844,GGAGTCATTTATATTCTGCAGGAGGAAGGGGCCCCAGCTGTCGCCT...
2,ENST00000355654,2,428,627,GTAGAGGGTGTTTTCACCTTCCAAGACATGGGGCAAAGTTTGGAGA...
3,ENST00000616793,5,3835,4034,GGGTTATCACTTAGGTAGAGAGCAAATGTGTTCTCCACTAGTAATA...
4,ENST00000541924,12,508,707,CCCCACGGTGAGCGCCCTGTGCCCCACACAGCAGGAGATGATGATA...
...,...,...,...,...,...
49995,ENST00000314289,4,1400,1599,CAGTTTACCCTTCTGAAGGAGCAGGGACTCAGCACAGAATTCACTT...
49996,ENST00000294964,2,557,756,CAGGGGTCAGGGGCAGAGGTGCACACCTCAGCATGAGCCAAGACTG...
49997,ENST00000464456,3,1103,1302,CCATATTTTATGCTGGTTGTCTGCAAGCTTGTGCGATGTTATGTTC...
49998,ENST00000503332,5,24,223,CACCACCTTCTAAGTCACGTCACCATGAGATGCCAAGGGAGTACAA...


In [6]:
from datasets import Dataset

# Wrap the pandas frame in a `datasets.Dataset` so it can be tokenized with .map() below.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'chr', 'random_start', 'random_end', 'seq'],
    num_rows: 50000
})

## 3) Tokenization for 3' UTR

In [7]:
from transformers import AutoTokenizer

# Pretrained tokenizer loaded by repo id; tokenize_function feeds it
# space-separated k-mers (k=6 by default).
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def tokenize_function(s, k=6):
    """Tokenize the ``'seq'`` field of one record as space-separated k-mers.

    The sequence is split with ``kmers(s['seq'], k)``, the pieces are joined
    with single spaces, and the resulting string is passed to the
    notebook-level ``tokenizer``; its output is returned unchanged.
    """
    pieces = kmers(s['seq'], k)
    spaced = " ".join(pieces)
    return tokenizer(spaced)

# Smoke test on an 18-nt sequence (three non-overlapping 6-mers with the default k).
tokenize_function({'seq':'ACCTGCTGGACGATCATA'})  

{'input_ids': [2, 675, 2000, 393, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [14]:
# Tokenize every row and drop all original columns so only the tokenizer
# outputs remain. The column list is taken from the dataset being mapped
# (not from the separate global `df`), so the call stays correct even if `df`
# has been rebound or the dataset carries columns `df.columns` does not list.
tokenized_datasets = ds.map(tokenize_function, remove_columns=ds.column_names)
tokenized_datasets

  0%|          | 0/50000 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50000
})

In [15]:
# Number of input ids in the first tokenized example.
len(tokenized_datasets[0]['input_ids'])

35

In [16]:
from transformers import AutoModelForMaskedLM

# Masked-LM model whose loss is evaluated below.
# NOTE(review): the tokenizer is loaded from "armheb/DNA_bert_6" while the model
# comes from "simecek/DNADebertaSmall" — confirm both use the same vocabulary.
model = AutoModelForMaskedLM.from_pretrained("simecek/DNADebertaSmall")

Downloading:   0%|          | 0.00/704 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/178M [00:00<?, ?B/s]

## 4) Evaluation for 3' UTR

In [17]:
from transformers import TrainingArguments, Trainer

# "Fake" arguments: in the visible cells the Trainer built from them is only
# ever used for .evaluate(), never for .train().
fake_training_args = TrainingArguments(
    output_dir='./model',            # directory where checkpoints would be written
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=64,   # evaluation batch size per device
    max_steps=10,                    # step cap; overrides num_train_epochs (see Trainer log)
)

In [18]:
from transformers import DataCollatorForLanguageModeling

# Collator that prepares masked-language-modelling batches; 0.2 is the
# masking probability handed to it.
data_collator = DataCollatorForLanguageModeling(
    mlm=True,
    mlm_probability=0.2,
    tokenizer=tokenizer,
)

In [19]:
# Trainer used for evaluation in the next cell; the same tokenized dataset is
# supplied as both the train and the eval split.
trainer = Trainer(
    args=fake_training_args,
    model=model,
    eval_dataset=tokenized_datasets,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [20]:
# Evaluate on eval_dataset; the returned dict includes 'eval_loss' (see the output below).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


{'eval_loss': 7.04788064956665,
 'eval_runtime': 16.6458,
 'eval_samples_per_second': 3003.763,
 'eval_steps_per_second': 46.979}

## 5) Let us automate it

In [23]:
# (label, Google Drive sharing link) pairs, one per sequence category to evaluate.
# NOTE(review): 'itergenomic' may be a typo (intergenic?); it is left unchanged
# because the label is printed and stored in the results table.
DATA = [
    ('itergenomic', 'https://drive.google.com/file/d/1IvsEckVher9lAcZh47Q7M7at86phqxPG/view?usp=sharing'),
    ('transcripts', 'https://drive.google.com/file/d/1i1MrkXOUk_1Cii5T-_HzisxwDInEvFis/view?usp=sharing'),
    ('exons', 'https://drive.google.com/file/d/1XvRv7vHa1dGHESHwyuD1k_XSt8ftF1G5/view?usp=sharing'),
    # Currently excluded from the run:
    # ('introns', 'https://drive.google.com/file/d/1GWrprEspv18uZiJNlNpgiaNL0r8SCtAf/view?usp=sharing'),
    ('random', 'https://drive.google.com/file/d/1dArpvpN2C6Xq_S3owMy7qYx7K6cPKdSm/view?usp=sharing'),
    ('3utr', 'https://drive.google.com/file/d/1rTNHSVJUM5tV4TFW9vqV4rlF08tjCC2u/view?usp=sharing'),
]

In [25]:
# Evaluate the model on every dataset listed in DATA and collect
# (seqtype, eval_loss) tuples in `results`.
results = []

for seqtype, url in DATA:
    print(seqtype)

    # The Drive file id is the second-to-last path segment of the sharing link.
    path = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
    df = pd.read_csv(path)
    ds = Dataset.from_pandas(df)
    # Drop every original column so only the tokenizer outputs remain; the
    # column list comes from the dataset that is actually being mapped.
    tokenized_datasets = ds.map(tokenize_function, remove_columns=ds.column_names)

    # A fresh Trainer per dataset; only .evaluate() is called on it here.
    trainer = Trainer(
        model=model,
        args=fake_training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets,
        eval_dataset=tokenized_datasets,
    )

    # Named `eval_metrics` rather than `eval` so the builtin eval() is not
    # shadowed in the kernel namespace for the rest of the session.
    eval_metrics = trainer.evaluate()

    results.append((seqtype, eval_metrics['eval_loss']))


itergenomic


  0%|          | 0/50000 [00:00<?, ?ex/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


transcripts


  0%|          | 0/50000 [00:00<?, ?ex/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


exons


  0%|          | 0/50000 [00:00<?, ?ex/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


random


  0%|          | 0/50000 [00:00<?, ?ex/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


3utr


  0%|          | 0/50000 [00:00<?, ?ex/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 64


In [27]:
import pandas as pd

# One row per evaluated sequence type, built from the (seqtype, eval_loss)
# tuples collected in `results`.
results_long = pd.DataFrame(results, columns=["seqtype", "loss"])
results_long

Unnamed: 0,seqtype,loss
0,itergenomic,6.069104
1,transcripts,7.381013
2,exons,7.093152
3,random,6.08887
4,3utr,7.047881
