In [1]:
import pandas as pd
import random

from classes.RebuildSeqs_GPT import RebuildSeqsGPT

In [14]:
df_train = pd.read_csv("datasets/RebuildSeqs_20k_small.csv", keep_default_na=False)
df_test = pd.read_csv("datasets/RebuildSeqs_2k_small.csv", keep_default_na=False)

train_sequence = df_train.iloc[:, 0].tolist()
train_builded = df_train.iloc[:, 1].tolist()
train_organism = df_train.iloc[:, 2].tolist()

test_sequence = df_test.iloc[:, 0].tolist()
test_builded = df_test.iloc[:, 1].tolist()
test_organism = df_test.iloc[:, 2].tolist()

In [3]:
model = RebuildSeqsGPT(checkpoint="models/ExInSeqs-GPT2-001", device="cuda", seed=1234, notification=True, logs_dir="logs", alias="gpt2", log_level="info")

Started models/ExInSeqs-GPT2-001 model


In [15]:
model.add_train_data({
  "sequence": train_sequence,
  "builded": train_builded,
  "organism": train_organism,
},  batch_size=8, sequence_len=256)

In [5]:
model.add_test_data({
  "sequence": test_sequence,
  "builded": test_builded,
  "organism": test_organism,
},  batch_size=8, sequence_len=256)

In [16]:
model.train(lr=5e-5, epochs=5, save_at_end=False, save_freq=5)

Training Epoch 1/1:   0%|          | 0/250 [00:00<?, ?it/s]

In [17]:
def process_sequence( sequence):
	return f"".join(f"[{nucl.upper()}]" for nucl in sequence)

In [18]:
idx = 0
prompt = f"Sequence:{process_sequence(train_sequence[idx])}\nOrganism:{train_organism[idx]}\nMarked Sequence:"
print(prompt)

Sequence:[G][T][T][G][T][T][G][T][T][C][A][A][T][A][G][C][T][T][C][C][A][T][C][C][A][T][T][C][A][C][T][T][T][T][T][G][T][T][C][C][T][G][T][A][T][G][A][T][G][A][A][C][T][G][G][T][G][T][T][A][A][A][A][C][A][T][G][A][C][T][T][T][C][A][C][A][T][G][T][T][G][A][C][G][A][T][G][C][T][G][A][T][G][A][T][G][T][A][T][T][C][G][A][T][A][A][A][G][C][C][A][T][G][G][C][T][G][C][T][G][C][G][A][C][G][G][C][T][G][C][C][T][A][A][A][T][C][A][T][A][G][A][A][T][A][G][A][A][A][A][T][T][A][A][C][T][T][A][G][A][A][A][A][T][A][A][C][A][C][A][A][A][A][G][T][G][C][A][A][A][C][A][A][A][C][T][T][G][A][A][A][T][G][G][T][T][G][G][G][C][G][A][A][T][T][G][T][G][C][T][G][A][G][T][G][C][A][T][T][G][C][G][A][T][T][C][A][C][T][G][T][T][G][C][A][A][A][T][A][G][A][A][A]
Organism:Rotaria magnacalcarata
Marked Sequence:


In [19]:
sequence = model.tokenizer.encode(prompt, truncation=True, max_length=1024, add_special_tokens=True, padding=True, return_tensors="pt")

In [20]:
sequence = sequence.to("cuda")

In [21]:
attention_mask=[token != model.tokenizer.eos_token_id for token in sequence]

In [22]:
attention_mask = attention_mask[0].unsqueeze(0)

In [29]:
pred = model.model.generate(input_ids=sequence, attention_mask=attention_mask, repetition_penalty=0.4, temperature=0.5, top_k=50, max_new_tokens=512, pad_token_id=model.tokenizer.eos_token_id)

In [30]:
print(f"Prompt: {prompt}\n")
print(f"Target: {train_builded[idx]}\n")
result = model.tokenizer.decode(pred[0])
print(f"Result: {result}\n")

Prompt: Sequence:[G][T][T][G][T][T][G][T][T][C][A][A][T][A][G][C][T][T][C][C][A][T][C][C][A][T][T][C][A][C][T][T][T][T][T][G][T][T][C][C][T][G][T][A][T][G][A][T][G][A][A][C][T][G][G][T][G][T][T][A][A][A][A][C][A][T][G][A][C][T][T][T][C][A][C][A][T][G][T][T][G][A][C][G][A][T][G][C][T][G][A][T][G][A][T][G][T][A][T][T][C][G][A][T][A][A][A][G][C][C][A][T][G][G][C][T][G][C][T][G][C][G][A][C][G][G][C][T][G][C][C][T][A][A][A][T][C][A][T][A][G][A][A][T][A][G][A][A][A][A][T][T][A][A][C][T][T][A][G][A][A][A][A][T][A][A][C][A][C][A][A][A][A][G][T][G][C][A][A][A][C][A][A][A][C][T][T][G][A][A][A][T][G][G][T][T][G][G][G][C][G][A][A][T][T][G][T][G][C][T][G][A][G][T][G][C][A][T][T][G][C][G][A][T][T][C][A][C][T][G][T][T][G][C][A][A][A][T][A][G][A][A][A]
Organism:Rotaria magnacalcarata
Marked Sequence:

Target: (exon)CACCTGTTCTTCTTAAAGCAACTGTCATTGGAAAACCAACACCTCAT(exon)(intron)GTAAGATATCATGTACAAACAAACAAAAGCCTTTTTTCATTCATATTGCTTTTAG(intron)(exon)TTTATCTGGTTGAAAGATGCTGCTCCACTACCAGCTTCAAATCGTCTACGTACTCGTTA

In [27]:
# Predict Single

idx = random.randint(0, len(train_sequence))

print(f"Sequence to Predict: {train_sequence[idx]}")
print(f"Organism: {train_organism[idx]}")
print(f"True Ground: {train_builded[idx]}")
pred = model.predict_single({"sequence": sequence[idx]})
print(f"Prediction: {pred}")

Sequence to Predict: CACCTGTTCTTCTTAAAGCAACTGTCATTGGAAAACCAACACCTCATGTAAGATATCATGTACAAACAAACAAAAGCCTTTTTTCATTCATATTGCTTTTAGTTTATCTGGTTGAAAGATGCTGCTCCACTACCAGCTTCAAATCGTCTACGTACTCGTTATGATATCGGTACCAAACAAGTTTTATTACAAATAAATGATGCTCGCCCACAAGATATTGGGGAATATGTTGTAATCGCAACT
Organism: Rotaria magnacalcarata
True Ground: (exon)CACCTGTTCTTCTTAAAGCAACTGTCATTGGAAAACCAACACCTCAT(exon)(intron)GTAAGATATCATGTACAAACAAACAAAAGCCTTTTTTCATTCATATTGCTTTTAG(intron)(exon)TTTATCTGGTTGAAAGATGCTGCTCCACTACCAGCTTCAAATCGTCTACGTACTCGTTATGATATCGGTACCAAACAAGTTTTATTACAAATAAATGATGCTCGCCCACAAGATATTGGGGAATATGTTGTAATCGCAACT(exon)


IndexError: index 1979 is out of bounds for dimension 0 with size 1