In [None]:
!pip install ipython-autotime
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.2
time: 328 µs (started: 2023-05-10 16:31:43 +00:00)


In [1]:
!pip install Bio
!pip install pyfastx
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from Bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.5.9 biopython-1.81 bio

In [2]:
import os
import pyfastx
import pandas as pd
import numpy as np
from collections import Counter

In [3]:
import csv
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch

Для обучения модели был взят датасет MultipleCodonAlignments: https://drive.google.com/drive/folders/12cvmiDj1vX_Hqw9zvG5rRw1ACZoJQDOU?usp=sharing  
Для предобработки данных напишем вспомогательную функцию, преобразующую последовательность ДНК в массив 9-меров:

In [None]:
def str_to_kmer_lst(string, k):
  lst = []
  kmer = ''
  nucleotides = ['A', 'C', 'T', 'G']
  for i in range(len(string) - k + 1):
    if string[i] not in nucleotides:
      kmer = ''
      continue
    kmer += string[i]
    if len(kmer) == k:
      lst.append(kmer)
      kmer = kmer[1:]
  return lst

Далее пройдёмся по fasta-файлам из папки MCA. Посчитаем количество вхождений каждого участка из 9 нуклеотидов в файле. Для каждого fasta-файла получился csv-файл с подсчитанным количеством вхождений 9-меров.

In [None]:
root = './drive/MyDrive/MCA'
dirs = os.listdir(root)
k = 9
classificator_dict = []
generator_dict = []
class_one = []
for dir in dirs:
  count_dict = []
  fasta = pyfastx.Fasta(root + '/' + dir)
  kmers = []
  for sequence in fasta:
    kmers += str_to_kmer_lst(sequence, k)
  kmers_count = dict(Counter(kmers))
  keys, values = list(kmers_count.keys()), list(kmers_count.values())
  for i in range(len(keys)):
    count_dict.append({'kmer': keys[i], 'count': values[i]})
  df = pd.DataFrame.from_dict(count_dict)
  df.to_csv ('stat/' + '_'.join(dir.split('.')[:-2]) + '.csv', index=False, sep=';')

Пройдёмся по полученным csv-файлам. Если количество вхождений участка больше четверти максимального количества вхождений 9-меров в данном файле, то присваиваем ему класс 1 (консервативный участок), иначе 0. Участки класса 1 записываем в датасет генератора: первым 8 нуклеотидам соответствует 9-й. Результаты сохраняем в classificator_dataset и generator_dataset. 

In [None]:
root = './drive/MyDrive/stat/'
files = os.listdir(root)
classificator_dict = []
generator_dict = []
for i in range(50):
  print(i)
  kmers_count = pd.read_csv(root + files[i], sep=';')
  for i in range(len(kmers_count)):
    if kmers_count['count'][i] > max(kmers_count['count']) // 4:
      classificator_dict.append({'kmer': kmers_count['kmer'][i], 'label': 1})
      generator_dict.append({'kmer': kmers_count['kmer'][i][:-1], 'label': kmers_count['kmer'][i][-1]})
    else:
      classificator_dict.append({'kmer': kmers_count['kmer'][i], 'label': 0})
df = pd.DataFrame.from_dict(generator_dict)
df.to_csv('stat/generator_dataset.csv', index=False, sep=';')
df = pd.DataFrame.from_dict(classificator_dict)
df.to_csv('stat/classificator_dataset.csv', index=False, sep=';')

Загрузим данные из предобработанных и размеченных датасетов.

In [None]:
classificatorPath = 'classificator_dataset.csv'
generatorPath = 'generator_dataset.csv'
n = 1000
classification_texts, classification_labels = [], []
generation_texts, generation_labels = [], []
ones, zeros = 0, 0
with open(classificatorPath) as csvFile:
  csvReader = csv.DictReader(csvFile, delimiter=';')
  for row in csvReader:
    if zeros == n and ones == n:
      break
    if int(row['label']) == 0 and zeros < n:
      zeros += 1
    elif int(row['label']) == 1 and ones < n:
      ones += 1
    else:
      continue
    classification_texts.append(row['kmer'])
    classification_labels.append(int(row['label']))
ones = 0
with open(generatorPath) as csvFile:
  csvReader = csv.DictReader(csvFile, delimiter=';')
  for row in csvReader:
    if ones == 2*n:
      break
    generation_texts.append(row['kmer'])
    generation_labels.append(row['label'])
    ones += 1

time: 101 ms (started: 2023-04-18 19:01:42 +00:00)


Нам нужно обучить модель распознавать участки, похожие на консвервативные. Поэтому в качестве тестового датасета будем использовать участки, на которых обучалась модель, с 1-2 мутациями.

In [None]:
import random
def add_mismatches(texts):
  res = []
  nucleotides = ['A', 'T', 'C', 'G']
  for text in texts:
    mod_text = text
    i = random.randint(0, 8)
    mod_text = mod_text[:i] + nucleotides[random.randint(0, 3)] + mod_text[i+1:]
    i = random.randint(0, 8)
    mod_text = mod_text[:i] + nucleotides[random.randint(0, 3)] + mod_text[i+1:]
    res.append(mod_text)
  return res
test_texts, test_labels = add_mismatches(classification_texts), classification_labels

time: 13.5 ms (started: 2023-04-18 19:01:44 +00:00)


Разобьём выборку на тренировочную и валидационную (в качестве валидационной возьмём 20% выборки)

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(classification_texts, classification_labels, test_size=.2)

time: 7.77 ms (started: 2023-04-18 19:01:44 +00:00)


В качестве модели используем трансформер DistilBert

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

time: 1.52 s (started: 2023-04-18 17:55:24 +00:00)


Токенизируем текстовые данные

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

time: 244 ms (started: 2023-04-18 17:55:28 +00:00)


In [7]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

time: 490 µs (started: 2023-04-18 17:55:33 +00:00)


Залогинимся в huggingface

In [None]:
from transformers import logging
logging.set_verbosity_error()

time: 955 µs (started: 2023-04-18 19:02:16 +00:00)


Обучим модель

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]



{'loss': 0.7004, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7023, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6981, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6898, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6855, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6682, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6516, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.615, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6392, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.5614, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.5827, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}
{'loss': 0.5525, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.5702, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}
{'loss': 0.5696, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}
{'loss': 0.552, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.4584, 'learning_rate': 1.6000

TrainOutput(global_step=300, training_loss=0.5673265345891316, metrics={'train_runtime': 274.3611, 'train_samples_per_second': 17.495, 'train_steps_per_second': 1.093, 'train_loss': 0.5673265345891316, 'epoch': 3.0})

time: 4min 39s (started: 2023-04-18 17:55:48 +00:00)


Протестируем модель

In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]

time: 1min 12s (started: 2023-04-18 18:00:28 +00:00)


In [None]:
from sklearn.metrics import classification_report

target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.64      0.86      0.73      1000
    conservative       0.79      0.51      0.62      1000

        accuracy                           0.69      2000
       macro avg       0.71      0.69      0.68      2000
    weighted avg       0.71      0.69      0.68      2000

time: 12.6 ms (started: 2023-04-18 18:01:41 +00:00)


Как видно, показатели качества неидеальны. Попробуем поиграться с параметрами

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.05,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6895, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6929, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6891, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6862, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6732, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6507, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6212, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.5987, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6426, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.5539, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.5573, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}
{'loss': 0.5531, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.555, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}
{'loss': 0.5503, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}
{'loss': 0.5207, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.4371, 'learning_rate': 1.600

TrainOutput(global_step=300, training_loss=0.5637805143992106, metrics={'train_runtime': 292.9455, 'train_samples_per_second': 16.385, 'train_steps_per_second': 1.024, 'train_loss': 0.5637805143992106, 'epoch': 3.0})

time: 4min 53s (started: 2023-04-18 18:01:41 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]

time: 1min 20s (started: 2023-04-18 18:06:34 +00:00)


In [None]:
from sklearn.metrics import classification_report

target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.62      0.89      0.73      1000
    conservative       0.80      0.45      0.58      1000

        accuracy                           0.67      2000
       macro avg       0.71      0.67      0.65      2000
    weighted avg       0.71      0.67      0.65      2000

time: 11.5 ms (started: 2023-04-18 18:07:55 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6895, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6929, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6891, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6862, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6732, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6507, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6212, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.5987, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6425, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.5541, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.5573, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}
{'loss': 0.553, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.5547, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}
{'loss': 0.5504, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}
{'loss': 0.5207, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.4372, 'learning_rate': 1.600

TrainOutput(global_step=300, training_loss=0.5637061134974162, metrics={'train_runtime': 259.4139, 'train_samples_per_second': 18.503, 'train_steps_per_second': 1.156, 'train_loss': 0.5637061134974162, 'epoch': 3.0})

time: 4min 20s (started: 2023-04-18 18:07:55 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]

time: 1min 10s (started: 2023-04-18 18:12:16 +00:00)


In [None]:
from sklearn.metrics import classification_report

target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.62      0.89      0.73      1000
    conservative       0.80      0.45      0.58      1000

        accuracy                           0.67      2000
       macro avg       0.71      0.67      0.65      2000
    weighted avg       0.71      0.67      0.65      2000

time: 7.65 ms (started: 2023-04-18 18:13:27 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6895, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6929, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6891, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6862, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6732, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6507, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6212, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.5986, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6426, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.5539, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.5569, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}
{'loss': 0.5534, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.5544, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}
{'loss': 0.5506, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}
{'loss': 0.5215, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.4386, 'learning_rate': 1.60

TrainOutput(global_step=300, training_loss=0.5640222454071044, metrics={'train_runtime': 254.7882, 'train_samples_per_second': 18.839, 'train_steps_per_second': 1.177, 'train_loss': 0.5640222454071044, 'epoch': 3.0})

time: 4min 15s (started: 2023-04-18 18:13:27 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.62      0.89      0.73      1000
    conservative       0.81      0.45      0.58      1000

        accuracy                           0.67      2000
       macro avg       0.71      0.67      0.65      2000
    weighted avg       0.71      0.67      0.65      2000

time: 1min 11s (started: 2023-04-18 18:17:42 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6891, 'learning_rate': 5e-06, 'epoch': 0.1}
{'loss': 0.6886, 'learning_rate': 1e-05, 'epoch': 0.2}
{'loss': 0.6579, 'learning_rate': 1.5e-05, 'epoch': 0.3}
{'loss': 0.6369, 'learning_rate': 2e-05, 'epoch': 0.4}
{'loss': 0.5711, 'learning_rate': 2.5e-05, 'epoch': 0.5}
{'loss': 0.5072, 'learning_rate': 3e-05, 'epoch': 0.6}
{'loss': 0.5284, 'learning_rate': 3.5e-05, 'epoch': 0.7}
{'loss': 0.5448, 'learning_rate': 4e-05, 'epoch': 0.8}
{'loss': 0.6148, 'learning_rate': 4.5e-05, 'epoch': 0.9}
{'loss': 0.5472, 'learning_rate': 5e-05, 'epoch': 1.0}
{'loss': 0.5132, 'learning_rate': 4.75e-05, 'epoch': 1.1}
{'loss': 0.5204, 'learning_rate': 4.5e-05, 'epoch': 1.2}
{'loss': 0.5904, 'learning_rate': 4.25e-05, 'epoch': 1.3}
{'loss': 0.5728, 'learning_rate': 4e-05, 'epoch': 1.4}
{'loss': 0.4997, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.5}
{'loss': 0.4023, 'learning_rate': 3.5e-05, 'epoch': 1.6}
{'loss': 0.5583, 'learning_rate': 3.2500000000000004e-05, 'epoch': 1.7}
{'loss': 0.55

TrainOutput(global_step=300, training_loss=0.5190047597885132, metrics={'train_runtime': 240.1011, 'train_samples_per_second': 19.992, 'train_steps_per_second': 1.249, 'train_loss': 0.5190047597885132, 'epoch': 3.0})

time: 4min (started: 2023-04-18 18:18:54 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.73      0.69      0.71      1000
    conservative       0.70      0.74      0.72      1000

        accuracy                           0.72      2000
       macro avg       0.72      0.72      0.72      2000
    weighted avg       0.72      0.72      0.72      2000

time: 1min 12s (started: 2023-04-18 18:22:55 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6846, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6285, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.5502, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.591, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5014, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.508, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5528, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5511, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6388, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5513, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.4849, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5414, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.5018, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.4902, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.4533, 'l

TrainOutput(global_step=300, training_loss=0.4954576206207275, metrics={'train_runtime': 256.2142, 'train_samples_per_second': 18.734, 'train_steps_per_second': 1.171, 'train_loss': 0.4954576206207275, 'epoch': 3.0})

time: 4min 16s (started: 2023-04-18 18:24:07 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.72      0.71      1000
    conservative       0.71      0.70      0.71      1000

        accuracy                           0.71      2000
       macro avg       0.71      0.71      0.71      2000
    weighted avg       0.71      0.71      0.71      2000

time: 1min 11s (started: 2023-04-18 18:28:24 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6843, 'learning_rate': 5e-05, 'epoch': 0.2}
{'loss': 0.5954, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.4}
{'loss': 0.5015, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.6}
{'loss': 0.5663, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.8}
{'loss': 0.5658, 'learning_rate': 3.571428571428572e-05, 'epoch': 1.0}
{'loss': 0.5469, 'learning_rate': 3.2142857142857144e-05, 'epoch': 1.2}
{'loss': 0.5087, 'learning_rate': 2.857142857142857e-05, 'epoch': 1.4}
{'loss': 0.4089, 'learning_rate': 2.5e-05, 'epoch': 1.6}
{'loss': 0.5494, 'learning_rate': 2.1428571428571428e-05, 'epoch': 1.8}
{'loss': 0.4936, 'learning_rate': 1.785714285714286e-05, 'epoch': 2.0}
{'loss': 0.4766, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.2}
{'loss': 0.4098, 'learning_rate': 1.0714285714285714e-05, 'epoch': 2.4}
{'loss': 0.4168, 'learning_rate': 7.142857142857143e-06, 'epoch': 2.6}
{'loss': 0.4477, 'learning_rate': 3.5714285714285714e-06, 'epoch': 2.8}
{'loss': 0.4511, 'learning_rat

TrainOutput(global_step=150, training_loss=0.5081844520568848, metrics={'train_runtime': 185.7907, 'train_samples_per_second': 25.836, 'train_steps_per_second': 0.807, 'train_loss': 0.5081844520568848, 'epoch': 3.0})

time: 3min 6s (started: 2023-04-18 18:29:35 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.70      0.74      0.72      1000
    conservative       0.72      0.69      0.71      1000

        accuracy                           0.71      2000
       macro avg       0.71      0.71      0.71      2000
    weighted avg       0.71      0.71      0.71      2000

time: 1min 11s (started: 2023-04-18 18:32:41 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6895, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6465, 'learning_rate': 4.89795918367347e-05, 'epoch': 0.2}
{'loss': 0.5652, 'learning_rate': 4.795918367346939e-05, 'epoch': 0.3}
{'loss': 0.5972, 'learning_rate': 4.6938775510204086e-05, 'epoch': 0.4}
{'loss': 0.558, 'learning_rate': 4.591836734693878e-05, 'epoch': 0.5}
{'loss': 0.4867, 'learning_rate': 4.4897959183673474e-05, 'epoch': 0.6}
{'loss': 0.57, 'learning_rate': 4.387755102040816e-05, 'epoch': 0.7}
{'loss': 0.6171, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.8}
{'loss': 0.6352, 'learning_rate': 4.183673469387756e-05, 'epoch': 0.9}
{'loss': 0.5284, 'learning_rate': 4.0816326530612245e-05, 'epoch': 1.0}
{'loss': 0.5405, 'learning_rate': 3.979591836734694e-05, 'epoch': 1.1}
{'loss': 0.5432, 'learning_rate': 3.8775510204081634e-05, 'epoch': 1.2}
{'loss': 0.5384, 'learning_rate': 3.775510204081633e-05, 'epoch': 1.3}
{'loss': 0.5529, 'learning_rate': 3.673469387755102e-05, 'epoch': 1.4}
{'loss': 0.4774, 'lea

TrainOutput(global_step=500, training_loss=0.42608152151107787, metrics={'train_runtime': 424.2447, 'train_samples_per_second': 18.857, 'train_steps_per_second': 1.179, 'train_loss': 0.42608152151107787, 'epoch': 5.0})

time: 7min 5s (started: 2023-04-18 18:33:53 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.74      0.73      1000
    conservative       0.73      0.70      0.72      1000

        accuracy                           0.72      2000
       macro avg       0.72      0.72      0.72      2000
    weighted avg       0.72      0.72      0.72      2000

time: 1min 13s (started: 2023-04-18 18:40:58 +00:00)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6917, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.624, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.5604, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.6063, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5146, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.4886, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5358, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5449, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6241, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5385, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.4995, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5613, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.5209, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.5237, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.4598, '

TrainOutput(global_step=300, training_loss=0.5034126869837443, metrics={'train_runtime': 237.6521, 'train_samples_per_second': 20.198, 'train_steps_per_second': 1.262, 'train_loss': 0.5034126869837443, 'epoch': 3.0})

time: 3min 58s (started: 2023-04-18 18:42:12 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.72      0.70      0.71      1000
    conservative       0.71      0.72      0.72      1000

        accuracy                           0.71      2000
       macro avg       0.71      0.71      0.71      2000
    weighted avg       0.71      0.71      0.71      2000

time: 1min 9s (started: 2023-04-18 18:46:10 +00:00)


Параметры не сильно повлияли на качество модели. Попробуем другие модели

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

time: 1.21 s (started: 2023-04-18 18:47:53 +00:00)


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6516, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6181, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.5916, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.6622, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5254, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.553, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5537, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5554, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.5994, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5704, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.4963, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5456, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.5666, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.541, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.481, 'le

TrainOutput(global_step=300, training_loss=0.5123630452156067, metrics={'train_runtime': 508.3799, 'train_samples_per_second': 9.442, 'train_steps_per_second': 0.59, 'train_loss': 0.5123630452156067, 'epoch': 3.0})

time: 8min 30s (started: 2023-04-18 18:47:57 +00:00)


In [None]:
from sklearn.metrics import classification_report

predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.68      0.70      1000
    conservative       0.69      0.72      0.71      1000

        accuracy                           0.70      2000
       macro avg       0.70      0.70      0.70      2000
    weighted avg       0.70      0.70      0.70      2000

time: 2min 38s (started: 2023-04-18 18:56:27 +00:00)


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

time: 778 ms (started: 2023-04-18 19:02:27 +00:00)


In [None]:
from transformers import BloomForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 23.2242, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 5.212, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 2.7188, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 2.7897, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 1.2017, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 1.4802, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 2.3233, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.8841, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.9665, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.9853, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.7398, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.8139, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.9034, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.8334, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.7221, 

TrainOutput(global_step=300, training_loss=1.9339380041758218, metrics={'train_runtime': 1969.879, 'train_samples_per_second': 2.437, 'train_steps_per_second': 0.152, 'train_loss': 1.9339380041758218, 'epoch': 3.0})

time: 33min 9s (started: 2023-04-18 19:02:31 +00:00)


In [None]:
from sklearn.metrics import classification_report
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.61      0.82      0.70      1000
    conservative       0.73      0.47      0.57      1000

        accuracy                           0.65      2000
       macro avg       0.67      0.65      0.64      2000
    weighted avg       0.67      0.65      0.64      2000

time: 7min 22s (started: 2023-04-18 19:46:55 +00:00)


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-sst2")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

time: 1.84 s (started: 2023-04-18 19:54:17 +00:00)


In [None]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = BartForSequenceClassification.from_pretrained("valhalla/bart-large-sst2")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]



{'loss': 0.8159, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.639, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.6435, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.5861, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5578, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.529, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5921, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5252, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6323, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5983, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.4616, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5031, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.4335, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.3727, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.5288, 'l

TrainOutput(global_step=300, training_loss=0.504190403620402, metrics={'train_runtime': 1709.9256, 'train_samples_per_second': 2.807, 'train_steps_per_second': 0.175, 'train_loss': 0.504190403620402, 'epoch': 3.0})

time: 28min 49s (started: 2023-04-18 19:54:19 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.72      0.69      0.70      1000
    conservative       0.70      0.73      0.71      1000

        accuracy                           0.71      2000
       macro avg       0.71      0.71      0.71      2000
    weighted avg       0.71      0.71      0.71      2000

time: 9min 46s (started: 2023-04-18 20:23:08 +00:00)


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

Downloading (…)okenizer_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


time: 1.17 s (started: 2023-04-18 20:32:55 +00:00)


In [None]:
from transformers import CanineForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = CanineForSequenceClassification.from_pretrained("google/canine-s")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/529M [00:00<?, ?B/s]



{'loss': 0.7, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.7082, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.6918, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.7039, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.6945, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.7048, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.7003, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.6942, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6978, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.694, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.6928, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.6916, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.6838, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.6759, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.7348, 'lea

TrainOutput(global_step=300, training_loss=0.6753371127446492, metrics={'train_runtime': 323.4668, 'train_samples_per_second': 14.839, 'train_steps_per_second': 0.927, 'train_loss': 0.6753371127446492, 'epoch': 3.0})

time: 5min 27s (started: 2023-04-18 20:32:56 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.61      0.67      0.64      1000
    conservative       0.64      0.57      0.60      1000

        accuracy                           0.62      2000
       macro avg       0.63      0.62      0.62      2000
    weighted avg       0.63      0.62      0.62      2000

time: 2min 31s (started: 2023-04-18 20:38:23 +00:00)


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

Downloading (…)okenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

time: 1.18 s (started: 2023-04-18 20:40:55 +00:00)


In [None]:
from transformers import EsmForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/31.4M [00:00<?, ?B/s]



{'loss': 0.6889, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6743, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.6155, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.5739, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5612, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.5225, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5632, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5048, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6193, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5502, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.4917, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5791, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.486, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.492, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.5604, 'l

TrainOutput(global_step=300, training_loss=0.5280061499277751, metrics={'train_runtime': 53.3956, 'train_samples_per_second': 89.895, 'train_steps_per_second': 5.618, 'train_loss': 0.5280061499277751, 'epoch': 3.0})

time: 54.4 s (started: 2023-04-18 20:40:56 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.69      0.70      1000
    conservative       0.70      0.72      0.71      1000

        accuracy                           0.70      2000
       macro avg       0.71      0.70      0.70      2000
    weighted avg       0.71      0.70      0.70      2000

time: 28.4 s (started: 2023-04-18 20:41:50 +00:00)


In [None]:
from transformers import EsmForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6879, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6552, 'learning_rate': 4.89795918367347e-05, 'epoch': 0.2}
{'loss': 0.5907, 'learning_rate': 4.795918367346939e-05, 'epoch': 0.3}
{'loss': 0.5678, 'learning_rate': 4.6938775510204086e-05, 'epoch': 0.4}
{'loss': 0.5416, 'learning_rate': 4.591836734693878e-05, 'epoch': 0.5}
{'loss': 0.5037, 'learning_rate': 4.4897959183673474e-05, 'epoch': 0.6}
{'loss': 0.5411, 'learning_rate': 4.387755102040816e-05, 'epoch': 0.7}
{'loss': 0.5048, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.8}
{'loss': 0.6298, 'learning_rate': 4.183673469387756e-05, 'epoch': 0.9}
{'loss': 0.5411, 'learning_rate': 4.0816326530612245e-05, 'epoch': 1.0}
{'loss': 0.4885, 'learning_rate': 3.979591836734694e-05, 'epoch': 1.1}
{'loss': 0.5655, 'learning_rate': 3.8775510204081634e-05, 'epoch': 1.2}
{'loss': 0.4787, 'learning_rate': 3.775510204081633e-05, 'epoch': 1.3}
{'loss': 0.497, 'learning_rate': 3.673469387755102e-05, 'epoch': 1.4}
{'loss': 0.5357, 'l

TrainOutput(global_step=500, training_loss=0.47617480182647703, metrics={'train_runtime': 87.7082, 'train_samples_per_second': 91.212, 'train_steps_per_second': 5.701, 'train_loss': 0.47617480182647703, 'epoch': 5.0})

time: 1min 27s (started: 2023-04-18 20:42:45 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.74      0.72      1000
    conservative       0.72      0.69      0.71      1000

        accuracy                           0.71      2000
       macro avg       0.71      0.71      0.71      2000
    weighted avg       0.71      0.71      0.71      2000

time: 28.2 s (started: 2023-04-18 20:44:13 +00:00)


In [None]:
from transformers import EsmForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=7,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6847, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6635, 'learning_rate': 4.9275362318840584e-05, 'epoch': 0.2}
{'loss': 0.5995, 'learning_rate': 4.855072463768116e-05, 'epoch': 0.3}
{'loss': 0.5647, 'learning_rate': 4.782608695652174e-05, 'epoch': 0.4}
{'loss': 0.569, 'learning_rate': 4.710144927536232e-05, 'epoch': 0.5}
{'loss': 0.5029, 'learning_rate': 4.63768115942029e-05, 'epoch': 0.6}
{'loss': 0.5808, 'learning_rate': 4.565217391304348e-05, 'epoch': 0.7}
{'loss': 0.5135, 'learning_rate': 4.492753623188406e-05, 'epoch': 0.8}
{'loss': 0.6136, 'learning_rate': 4.4202898550724645e-05, 'epoch': 0.9}
{'loss': 0.5284, 'learning_rate': 4.347826086956522e-05, 'epoch': 1.0}
{'loss': 0.4989, 'learning_rate': 4.27536231884058e-05, 'epoch': 1.1}
{'loss': 0.6092, 'learning_rate': 4.202898550724638e-05, 'epoch': 1.2}
{'loss': 0.5242, 'learning_rate': 4.130434782608696e-05, 'epoch': 1.3}
{'loss': 0.5023, 'learning_rate': 4.057971014492754e-05, 'epoch': 1.4}
{'loss': 0.5565, 'learn

TrainOutput(global_step=700, training_loss=0.4002031067439488, metrics={'train_runtime': 122.8147, 'train_samples_per_second': 91.194, 'train_steps_per_second': 5.7, 'train_loss': 0.4002031067439488, 'epoch': 7.0})

time: 2min 3s (started: 2023-04-18 20:44:41 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.72      0.74      0.73      1000
    conservative       0.73      0.70      0.72      1000

        accuracy                           0.73      2000
       macro avg       0.73      0.72      0.72      2000
    weighted avg       0.73      0.72      0.72      2000

time: 28.2 s (started: 2023-04-18 20:46:44 +00:00)


Данные параметры выдают наилучший результат, так что будем использовать их

In [None]:
from transformers import EsmForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



{'loss': 0.6846, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 0.6569, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.2}
{'loss': 0.59, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3}
{'loss': 0.5794, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.4}
{'loss': 0.5568, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.5}
{'loss': 0.5319, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.6}
{'loss': 0.5407, 'learning_rate': 3.965517241379311e-05, 'epoch': 0.7}
{'loss': 0.5083, 'learning_rate': 3.793103448275862e-05, 'epoch': 0.8}
{'loss': 0.6158, 'learning_rate': 3.620689655172414e-05, 'epoch': 0.9}
{'loss': 0.5418, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.0}
{'loss': 0.5017, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.1}
{'loss': 0.5769, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.2}
{'loss': 0.4835, 'learning_rate': 2.9310344827586206e-05, 'epoch': 1.3}
{'loss': 0.4856, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.4}
{'loss': 0.5309, 'l

TrainOutput(global_step=300, training_loss=0.5249180555343628, metrics={'train_runtime': 52.2834, 'train_samples_per_second': 91.807, 'train_steps_per_second': 5.738, 'train_loss': 0.5249180555343628, 'epoch': 3.0})

time: 52.5 s (started: 2023-04-18 20:47:12 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['non-conservative', 'conservative']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

                  precision    recall  f1-score   support

non-conservative       0.71      0.69      0.70      1000
    conservative       0.70      0.72      0.71      1000

        accuracy                           0.70      2000
       macro avg       0.70      0.70      0.70      2000
    weighted avg       0.70      0.70      0.70      2000

time: 28.5 s (started: 2023-04-18 20:48:05 +00:00)


В нашем случае генерацию последовательности можно свести к задаче классификации на 4 класса, где каждый класс - это следующий нуклеотид

In [None]:
id2label = {0: "A", 1: "C", 2: "G", 3: "T"}
label2id = {"A": 0, "C": 1, "G": 2, "T": 3}

time: 663 µs (started: 2023-04-18 20:49:16 +00:00)


In [None]:
for i in range(len(generation_labels)):
  generation_labels[i] = label2id[generation_labels[i]]

time: 865 µs (started: 2023-04-18 20:54:36 +00:00)


In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(generation_texts, generation_labels, test_size=.2)

time: 6.36 ms (started: 2023-04-18 20:54:39 +00:00)


In [None]:
test_texts, test_labels = add_mismatches(generation_texts), generation_labels

time: 14.6 ms (started: 2023-04-18 20:54:45 +00:00)


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

time: 315 ms (started: 2023-04-18 20:54:47 +00:00)


In [None]:
from transformers import EsmForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='results',          # output directory
    num_train_epochs=7,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='logs',            # directory for storing logs
    logging_steps=10,
)

model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=4, id2label=id2label, label2id=label2id)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

{'loss': 1.3835, 'learning_rate': 5e-05, 'epoch': 0.1}
{'loss': 1.377, 'learning_rate': 4.9275362318840584e-05, 'epoch': 0.2}
{'loss': 1.3847, 'learning_rate': 4.855072463768116e-05, 'epoch': 0.3}
{'loss': 1.3794, 'learning_rate': 4.782608695652174e-05, 'epoch': 0.4}
{'loss': 1.3651, 'learning_rate': 4.710144927536232e-05, 'epoch': 0.5}
{'loss': 1.3446, 'learning_rate': 4.63768115942029e-05, 'epoch': 0.6}
{'loss': 1.3644, 'learning_rate': 4.565217391304348e-05, 'epoch': 0.7}
{'loss': 1.3558, 'learning_rate': 4.492753623188406e-05, 'epoch': 0.8}
{'loss': 1.349, 'learning_rate': 4.4202898550724645e-05, 'epoch': 0.9}
{'loss': 1.3776, 'learning_rate': 4.347826086956522e-05, 'epoch': 1.0}
{'loss': 1.36, 'learning_rate': 4.27536231884058e-05, 'epoch': 1.1}
{'loss': 1.357, 'learning_rate': 4.202898550724638e-05, 'epoch': 1.2}
{'loss': 1.3628, 'learning_rate': 4.130434782608696e-05, 'epoch': 1.3}
{'loss': 1.3915, 'learning_rate': 4.057971014492754e-05, 'epoch': 1.4}
{'loss': 1.382, 'learning_r

TrainOutput(global_step=700, training_loss=1.3305972685132708, metrics={'train_runtime': 118.9889, 'train_samples_per_second': 94.126, 'train_steps_per_second': 5.883, 'train_loss': 1.3305972685132708, 'epoch': 7.0})

time: 1min 59s (started: 2023-04-18 20:55:16 +00:00)


In [None]:
predicted_labels = [model(**tokenizer(kmer, return_tensors="pt")).logits.argmax().item() for kmer in test_texts]
target_names = ['A', 'C', 'G', 'T']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

              precision    recall  f1-score   support

           A       0.38      0.67      0.48       658
           C       0.27      0.10      0.15       360
           G       0.32      0.41      0.36       442
           T       0.25      0.06      0.10       540

    accuracy                           0.35      2000
   macro avg       0.30      0.31      0.27      2000
weighted avg       0.31      0.35      0.29      2000

time: 41 s (started: 2023-04-18 20:58:56 +00:00)
