In [None]:
import pandas as pd
import random

from BERT_based import SpliceBERT
from DNABERT_based import SpliceDNABERT
from GPT_based import SpliceGPT

## Datasets To Use

In [None]:
# Toggle read by the dataset-loading cells: True selects the "*_small.csv"
# file of a dataset, False selects the full CSV.
small_version = True

In [None]:
# Load 5M Seqs Dataset

# Choose the reduced CSV when `small_version` is set, the full one otherwise.
dataset_path = (
  "datasets/ExInSeqs_5M_small.csv" if small_version else "datasets/ExInSeqs_5M.csv"
)
# keep_default_na=False stops pandas from converting strings such as "NA" or
# empty cells into NaN, so every field stays a plain string.
df = pd.read_csv(dataset_path, keep_default_na=False)

# The first six columns are taken by position and converted to Python lists.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [None]:
# Load 100k Seqs Dataset

# Choose the reduced CSV when `small_version` is set, the full one otherwise.
dataset_path = (
  "datasets/ExInSeqs_100k_small.csv" if small_version else "datasets/ExInSeqs_100k.csv"
)
# keep_default_na=False stops pandas from converting strings such as "NA" or
# empty cells into NaN, so every field stays a plain string.
df = pd.read_csv(dataset_path, keep_default_na=False)

# The first six columns are taken by position and converted to Python lists.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [None]:
# Load 30k Seqs Dataset

# Choose the reduced CSV when `small_version` is set, the full one otherwise.
dataset_path = (
  "datasets/ExInSeqs_30k_small.csv" if small_version else "datasets/ExInSeqs_30k.csv"
)
# keep_default_na=False stops pandas from converting strings such as "NA" or
# empty cells into NaN, so every field stays a plain string.
df = pd.read_csv(dataset_path, keep_default_na=False)

# The first six columns are taken by position and converted to Python lists.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [None]:
# Load 3k Seqs Dataset

# Choose the reduced CSV when `small_version` is set, the full one otherwise.
dataset_path = (
  "datasets/ExInSeqs_3k_small.csv" if small_version else "datasets/ExInSeqs_3k.csv"
)
# keep_default_na=False stops pandas from converting strings such as "NA" or
# empty cells into NaN, so every field stays a plain string.
df = pd.read_csv(dataset_path, keep_default_na=False)

# The first six columns are taken by position and converted to Python lists.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

## Loading Model

In [None]:
# From Scratch
# Build the GPT-based model from the "gpt2" checkpoint name; logs go to "logs"
# under the alias "gpt".
model = SpliceGPT(
  checkpoint="gpt2",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="gpt",
  log_level="info",
)

In [None]:
# From Scratch
# Build the BERT-based model from the "bert-base-uncased" checkpoint name;
# logs go to "logs" under the alias "bert".
model = SpliceBERT(
  checkpoint="bert-base-uncased",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="bert",
  log_level="info",
)

In [None]:
# From Scratch
# Build the DNABERT-based model from the "zhihan1996/DNA_bert_6" checkpoint
# name; logs go to "logs" under the alias "dnabert".
model = SpliceDNABERT(
  checkpoint="zhihan1996/DNA_bert_6",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="dnabert",
  log_level="info",
)

In [None]:
# From Checkpoint
# NOTE(review): the other "From Checkpoint" cells pass the alias used by their
# "From Scratch" counterpart ("bert", "dnabert") as the checkpoint, whereas
# this one passes "gpt2" while the from-scratch alias is "gpt" - confirm that
# "gpt2" is the intended saved checkpoint and not the base model name.
model = SpliceGPT(
  checkpoint="gpt2",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="gpt2",
  log_level="info",
)

In [None]:
# From Checkpoint
# Presumably "bert" refers to a checkpoint saved earlier under that alias -
# confirm against SpliceBERT's checkpoint resolution.
#
# The instance is bound to `model` because every later cell (data loaders,
# training, evaluation, prediction) operates on `model`; binding only
# `splicebert` would leave those cells using a stale or undefined `model`.
# `splicebert` is kept as a second name so existing references still work.
model = splicebert = SpliceBERT(
  checkpoint="bert",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="bert",
  log_level="info",
)

In [None]:
# From Checkpoint
# Presumably "dnabert" refers to a checkpoint saved earlier under that alias -
# confirm against SpliceDNABERT's checkpoint resolution.
#
# The instance is bound to `model` because every later cell (data loaders,
# training, evaluation, prediction) operates on `model`; binding only
# `splicednabert` would leave those cells using a stale or undefined `model`.
# `splicednabert` is kept as a second name so existing references still work.
model = splicednabert = SpliceDNABERT(
  checkpoint="dnabert",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="dnabert",
  log_level="info",
)

## Setting Train/Test DataLoader

In [None]:
# Options forwarded as `data_config` to the add_train_data / add_test_data calls.
data_config = dict(flanks_len=10, feat_hide_prob=0.4)

In [None]:
# Bundle the loaded column lists under the keys handed to the model.
train_columns = dict(
  sequence=sequence,
  label=label,
  organism=organism,
  gene=gene,
  flank_before=flank_before,
  flank_after=flank_after,
)

# Register the data with batch size 32, sequence length 128 and a
# train_percentage of 0.8, together with the shared `data_config`.
model.add_train_data(
  train_columns,
  batch_size=32,
  sequence_len=128,
  train_percentage=0.8,
  data_config=data_config,
)

## Updating Test DataLoader

In [None]:
# This can be used for generalization tests with completely new data
# NOTE(review): as written this passes the same `sequence`/`label`/... lists
# used for training; presumably a different dataset cell should be re-run
# before this one - confirm.

test_columns = dict(
  sequence=sequence,
  label=label,
  organism=organism,
  gene=gene,
  flank_before=flank_before,
  flank_after=flank_after,
)

model.add_test_data(
  test_columns,
  batch_size=32,
  sequence_len=128,
  data_config=data_config,
)

## Training Pipeline

In [None]:
# Train for one epoch at a learning rate of 5e-5, with save_at_end, evaluation
# and keep_best all switched off and a save_freq of 1.
model.train(
  lr=5e-5,
  epochs=1,
  save_at_end=False,
  evaluation=False,
  keep_best=False,
  save_freq=1,
)

## Eval Pipeline

In [None]:
# Run the model's evaluation routine.
# NOTE(review): presumably scores the test data registered via
# add_train_data / add_test_data above - confirm.
model.evaluate()

## Prediction Pipeline

In [None]:
# Predict Single

# Pick a random valid position in the loaded dataset. random.randrange excludes
# its upper bound, so the index is always in [0, len(sequence) - 1];
# random.randint(0, len(sequence)) includes the upper bound and could produce
# an out-of-range index (IndexError on the lookups below).
idx = random.randrange(len(sequence))

# Show the chosen record's fields for reference before predicting.
print(f"Sequence to Predict: {sequence[idx]}")
print(f"True Ground: {label[idx]}")
print(f"Gene: {gene[idx]}")
print(f"Organism: {organism[idx]}")
print(f"Flank Before: {flank_before[idx]}")
print(f"Flank After: {flank_after[idx]}")

# Only the raw sequence is passed to the model for the prediction.
pred = model.predict_single({"sequence": sequence[idx]})
print(f"Prediction: {pred}")