In [1]:
import pandas as pd
import random

from BERT_based import SpliceBERT, SpliceDNABERT
from GPT_based import SpliceGPT

## Datasets To Use

In [2]:
# Switch used by the 100k / 30k / 3k loading cells: True reads the "*_small.csv"
# variant of each dataset, False reads the full-size CSV.
# The 11M loading cell does not consult this flag.
small_version = True

In [9]:
# Load the 11M-sequence dataset.
# keep_default_na=False disables pandas' default NaN markers, so empty fields
# are kept as empty strings.
df = pd.read_csv("datasets/ExInSeqs_11M.csv", keep_default_na=False)

# Columns are taken by position, in this order:
# 0=sequence, 1=label, 2=organism, 3=gene, 4=flank_before, 5=flank_after.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [2]:
# Load the 100k-sequence dataset; `small_version` selects the reduced file.
# keep_default_na=False disables pandas' default NaN markers, so empty fields
# are kept as empty strings.
df = pd.read_csv(
  "datasets/ExInSeqs_100k_small.csv" if small_version else "datasets/ExInSeqs_100k.csv",
  keep_default_na=False,
)

# Columns are taken by position, in this order:
# 0=sequence, 1=label, 2=organism, 3=gene, 4=flank_before, 5=flank_after.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [None]:
# Load the 30k-sequence dataset; `small_version` selects the reduced file.
# keep_default_na=False disables pandas' default NaN markers, so empty fields
# are kept as empty strings.
df = pd.read_csv(
  "datasets/ExInSeqs_30k_small.csv" if small_version else "datasets/ExInSeqs_30k.csv",
  keep_default_na=False,
)

# Columns are taken by position, in this order:
# 0=sequence, 1=label, 2=organism, 3=gene, 4=flank_before, 5=flank_after.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

In [3]:
# Load the 3k-sequence dataset; `small_version` selects the reduced file.
# keep_default_na=False disables pandas' default NaN markers, so empty fields
# are kept as empty strings.
df = pd.read_csv(
  "datasets/ExInSeqs_3k_small.csv" if small_version else "datasets/ExInSeqs_3k.csv",
  keep_default_na=False,
)

# Columns are taken by position, in this order:
# 0=sequence, 1=label, 2=organism, 3=gene, 4=flank_before, 5=flank_after.
sequence, label, organism, gene, flank_before, flank_after = (
  df.iloc[:, col].tolist() for col in range(6)
)

## Loading Model

In [None]:
# From scratch: build SpliceGPT starting from the base "gpt2" checkpoint.
# The result is bound to `model`, the name every later pipeline cell uses.
model = SpliceGPT(
  checkpoint="gpt2",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="gpt-001",
)

In [4]:
# From scratch: build SpliceBERT starting from the base "bert-base-uncased"
# checkpoint. The result is bound to `model`, the name every later pipeline
# cell uses.
model = SpliceBERT(
  checkpoint="bert-base-uncased",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="bert-001",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# From scratch: build SpliceDNABERT starting from the base
# "zhihan1996/DNA_bert_6" checkpoint. The result is bound to `model`, the name
# every later pipeline cell uses.
model = SpliceDNABERT(
  checkpoint="zhihan1996/DNA_bert_6",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="dnabert-001",
)

In [None]:
# From checkpoint: build SpliceGPT from the "gpt-001" checkpoint.
# NOTE(review): "gpt-001" matches the alias used here — presumably a checkpoint
# saved by an earlier run under that alias; confirm where it is resolved from.
model = SpliceGPT(
  checkpoint="gpt-001",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="gpt-001",
)

In [None]:
# From checkpoint: build SpliceBERT from the "bert-001" checkpoint.
# The instance is bound to `model` because every later cell (add_train_data,
# add_test_data, train, evaluate, predict_single) operates on `model`; binding
# only `splicebert` left those cells using a stale or undefined `model`.
# `splicebert` is kept as an alias so code using the old name still works.
# NOTE(review): "bert-001" matches the alias used here — presumably a checkpoint
# saved by an earlier run under that alias; confirm where it is resolved from.
model = splicebert = SpliceBERT(
  checkpoint="bert-001",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="bert-001",
)

In [None]:
# From checkpoint: build SpliceDNABERT from the "dnabert-001" checkpoint.
# The instance is bound to `model` because every later cell (add_train_data,
# add_test_data, train, evaluate, predict_single) operates on `model`; binding
# only `splicednabert` left those cells using a stale or undefined `model`.
# `splicednabert` is kept as an alias so code using the old name still works.
# NOTE(review): "dnabert-001" matches the alias used here — presumably a
# checkpoint saved by an earlier run under that alias; confirm where it is
# resolved from.
model = splicednabert = SpliceDNABERT(
  checkpoint="dnabert-001",
  device="cuda",
  seed=1234,
  notification=True,
  logs_dir="logs",
  alias="dnabert-001",
)

## Setting Train/Test DataLoader

In [5]:
# Options passed as `data_config` to both add_train_data and add_test_data.
# NOTE(review): exact meaning of each option is defined by the model classes —
# presumably flank length in bases and a feature-masking probability; confirm.
data_config = dict(
  flanks_len=10,
  feat_hide_prob=0.4,
)

In [6]:
# Register the currently loaded dataset lists with the model as training data.
# train_percentage=0.8 is passed through to the model
# (presumably an 80/20 train/held-out split — confirm in the model class).
model.add_train_data(
  dict(
    sequence=sequence,
    label=label,
    organism=organism,
    gene=gene,
    flank_before=flank_before,
    flank_after=flank_after,
  ),
  batch_size=32,
  sequence_len=128,
  train_percentage=0.8,
  data_config=data_config,
)

## Updating Test DataLoader

In [9]:
# Optional: replace the test data, e.g. for generalization tests on completely
# new data. As written, this passes whatever `sequence`, `label`, ... lists are
# currently loaded, so load a different dataset first if unseen data is intended.

model.add_test_data(
  dict(
    sequence=sequence,
    label=label,
    organism=organism,
    gene=gene,
    flank_before=flank_before,
    flank_after=flank_after,
  ),
  batch_size=32,
  sequence_len=128,
  data_config=data_config,
)

## Training Pipeline

In [7]:
# Single-epoch training run with learning rate 5e-5. End-of-run saving,
# evaluation and best-model keeping are all switched off via the flags below.
# NOTE(review): the effect of save_freq=1 when save_at_end=False is defined by
# the model class — confirm whether per-epoch checkpoints are still written.
model.train(
  lr=5e-5,
  epochs=1,
  save_at_end=False,
  evaluation=False,
  keep_best=False,
  save_freq=1,
)

Training Epoch 1/1:   0%|          | 0/75 [00:00<?, ?it/s]

## Eval Pipeline

In [10]:
# Run the model's evaluation pass. The recorded output reports average loss,
# overall accuracy, and separate exon / intron accuracy.
model.evaluate()

Evaluating:   0%|          | 0/94 [00:00<?, ?it/s]

Evaluation complete
Average loss: 0.6982
Overall Accuracy: 0.5000
Exon accuracy: 1.0000
Intron accuracy: 0.0000


## Prediction Pipeline

In [11]:
# Predict Single: pick one random example from the loaded dataset, print its
# metadata and ground-truth label, then print the model's prediction for it.

# random.randint(0, len(sequence)) includes the upper bound, so it could return
# len(sequence) and raise IndexError below. randrange excludes the upper bound,
# so idx is always a valid position.
idx = random.randrange(len(sequence))

print(f"Sequence to Predict: {sequence[idx]}")
print(f"True Ground: {label[idx]}")
print(f"Gene: {gene[idx]}")
print(f"Organism: {organism[idx]}")
print(f"Flank Before: {flank_before[idx]}")
print(f"Flank After: {flank_after[idx]}")
# Only the raw sequence is passed to the model; gene, organism and flanks are
# printed for context but are not part of the prediction input.
pred = model.predict_single({"sequence": sequence[idx]})
print(f"Prediction: {pred}")

Sequence to Predict: CTGCAAGAATTAGATCATAGTTCATTAAAACACAATCTTTACCCTCCACCTAC
True Ground: intron
Gene: 
Organism: Rhynchosporium commune
Flank Before: GGGTTATTAT
Flank After: AAGGATTCCG
Prediction: exon
