In [1]:
import pandas as pd
from SpliceGPT import SpliceGPT

from genbank_dataset_extraction import splicing_sites_extraction

## Dataset Creation

In [None]:
# Convert the GenBank file into the CSV consumed by the rest of the notebook.
# The output name must use the exact "ExInSeqs_11M.csv" spelling that the
# pd.read_csv cells load; the previous "ExinSeqs" spelling only worked on
# case-insensitive filesystems.
splicing_sites_extraction("datasets/ExInSeqs.gb", "datasets/ExInSeqs_11M.csv")

In [None]:
# Read the full extracted dataset. keep_default_na=False stops pandas from
# turning empty fields (or strings such as "NA") into NaN.
df = pd.read_csv(
    "datasets/ExInSeqs_11M.csv",
    keep_default_na=False,
)

In [4]:
# Shuffle all rows once and renumber the index. random_state pins the
# permutation so every subset derived from it is reproducible across runs.
shuffled_df = df.sample(frac=1, random_state=1234).reset_index(drop=True)

In [8]:
# Partition the shuffled frame by the value of its "label" column.
df_exons = shuffled_df.loc[shuffled_df["label"].eq("exon")]
df_introns = shuffled_df.loc[shuffled_df["label"].eq("intron")]

In [None]:
# "Small" variants: keep only rows whose sequence is shorter than 128 characters.
df_exons_small = df_exons.loc[df_exons["sequence"].str.len().lt(128)]
df_introns_small = df_introns.loc[df_introns["sequence"].str.len().lt(128)]

In [None]:
# Row counts, one per line: exons, small exons, introns, small introns.
print(
    len(df_exons),
    len(df_exons_small),
    len(df_introns),
    len(df_introns_small),
    sep="\n",
)

In [None]:
# Balanced 3k subset: 1,500 exons + 1,500 introns drawn from the full pools.
# random_state pins each draw so the exported CSV is reproducible.
df_3k = pd.concat([
    df_exons.sample(n=1500, random_state=1234),
    df_introns.sample(n=1500, random_state=1234),
])
# Interleave the two classes and renumber the index.
df_3k = df_3k.sample(frac=1, random_state=1234).reset_index(drop=True)

# Single quotes inside the replacement fields: reusing the outer double quote
# is a SyntaxError on Python versions before 3.12.
print(f"Exons: {len(df_3k[df_3k['label'] == 'exon'])}")
print(f"Introns: {len(df_3k[df_3k['label'] == 'intron'])}")
print(f"Total Len: {len(df_3k)}")

df_3k.to_csv("datasets/ExInSeqs_3k.csv", index=False)

In [None]:
# Balanced 3k subset restricted to the short (< 128 chars) sequences:
# 1,500 exons + 1,500 introns. random_state pins each draw so the exported
# CSV is reproducible.
df_3k_small = pd.concat([
    df_exons_small.sample(n=1500, random_state=1234),
    df_introns_small.sample(n=1500, random_state=1234),
])
# Interleave the two classes and renumber the index.
df_3k_small = df_3k_small.sample(frac=1, random_state=1234).reset_index(drop=True)

# Single quotes inside the replacement fields: reusing the outer double quote
# is a SyntaxError on Python versions before 3.12.
print(f"Exons: {len(df_3k_small[df_3k_small['label'] == 'exon'])}")
print(f"Introns: {len(df_3k_small[df_3k_small['label'] == 'intron'])}")
print(f"Total Len: {len(df_3k_small)}")

df_3k_small.to_csv("datasets/ExInSeqs_3k_small.csv", index=False)

In [None]:
# Balanced 100k subset: 50,000 exons + 50,000 introns drawn from the full pools.
# random_state pins each draw so the exported CSV is reproducible.
df_100k = pd.concat([
    df_exons.sample(n=50000, random_state=1234),
    df_introns.sample(n=50000, random_state=1234),
])
# Interleave the two classes and renumber the index.
df_100k = df_100k.sample(frac=1, random_state=1234).reset_index(drop=True)

# Single quotes inside the replacement fields: reusing the outer double quote
# is a SyntaxError on Python versions before 3.12.
print(f"Exons: {len(df_100k[df_100k['label'] == 'exon'])}")
print(f"Introns: {len(df_100k[df_100k['label'] == 'intron'])}")
print(f"Total Len: {len(df_100k)}")

df_100k.to_csv("datasets/ExInSeqs_100k.csv", index=False)

In [None]:
# Balanced 100k subset restricted to the short (< 128 chars) sequences:
# 50,000 exons + 50,000 introns. random_state pins each draw so the exported
# CSV is reproducible.
df_100k_small = pd.concat([
    df_exons_small.sample(n=50000, random_state=1234),
    df_introns_small.sample(n=50000, random_state=1234),
])
# Interleave the two classes and renumber the index.
df_100k_small = df_100k_small.sample(frac=1, random_state=1234).reset_index(drop=True)

# Single quotes inside the replacement fields: reusing the outer double quote
# is a SyntaxError on Python versions before 3.12.
print(f"Exons: {len(df_100k_small[df_100k_small['label'] == 'exon'])}")
print(f"Introns: {len(df_100k_small[df_100k_small['label'] == 'intron'])}")
print(f"Total Len: {len(df_100k_small)}")

df_100k_small.to_csv("datasets/ExInSeqs_100k_small.csv", index=False)

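## Datasets To Use

Run only one of the loading cells below: each one reassigns `df`, `sequence`, `label`, `organism`, `gene`, `flank_before` and `flank_after`.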

In [9]:
# Full 11M-sequence dataset.
df = pd.read_csv(
    "datasets/ExInSeqs_11M.csv",
    keep_default_na=False,
)

# The first six columns are read by position, in this order, into plain lists.
sequence, label, organism, gene, flank_before, flank_after = (
    df.iloc[:, col].tolist() for col in range(6)
)

In [13]:
# 100k-sequence dataset (the "_small" file).
df = pd.read_csv(
    "datasets/ExInSeqs_100k_small.csv",
    keep_default_na=False,
)

# The first six columns are read by position, in this order, into plain lists.
sequence, label, organism, gene, flank_before, flank_after = (
    df.iloc[:, col].tolist() for col in range(6)
)

In [2]:
# First 30,000 rows of the 100k "_small" dataset.
df = pd.read_csv(
    "datasets/ExInSeqs_100k_small.csv",
    keep_default_na=False,
)

# Only the lists are truncated; `df` itself still holds every row of the file.
sequence, label, organism, gene, flank_before, flank_after = (
    df.iloc[:30000, col].tolist() for col in range(6)
)

In [2]:
# 3k-sequence dataset (the "_small" file).
df = pd.read_csv(
    "datasets/ExInSeqs_3k_small.csv",
    keep_default_na=False,
)

# The first six columns are read by position, in this order, into plain lists.
sequence, label, organism, gene, flank_before, flank_after = (
    df.iloc[:, col].tolist() for col in range(6)
)

## Loading Model

Run only one of the two cells below: both assign `splicegpt`.

In [3]:
# From scratch: build the model from the "gpt2" checkpoint.

splicegpt = SpliceGPT(
    checkpoint="gpt2",
    device="cuda",
    seed=1234,
    notification=True,
    logs_dir="logs",
    alias="batatinha123",
)

In [3]:
# From the default checkpoint stored under models/SpliceGPT.

splicegpt = SpliceGPT(
    checkpoint="models/SpliceGPT",
    device="cuda",
    seed=1234,
    notification=True,
    logs_dir="logs",
    alias="SpliceGPT",
)

## Setting Train/Test DataLoader

In [4]:
# Hand the currently loaded column lists to the model as training data.
splicegpt.add_train_data(
    dict(
        sequence=sequence,
        label=label,
        organism=organism,
        gene=gene,
        flank_before=flank_before,
        flank_after=flank_after,
    ),
    sequence_len=256,
    flanks_len=10,
    batch_size=8,
    train_percentage=0.8,
    feat_hide_prob=0.2,
)

## Updating Test DataLoader

In [15]:
# Optional: swap in a test set of entirely new data for generalization checks.
# batch_size and feat_hide_prob here differ from the values passed to
# add_train_data in the training cell.

splicegpt.add_test_data(
    dict(
        sequence=sequence,
        label=label,
        organism=organism,
        gene=gene,
        flank_before=flank_before,
        flank_after=flank_after,
    ),
    sequence_len=256,
    flanks_len=10,
    batch_size=32,
    feat_hide_prob=0.8,
)

Detected a different test dataloader configuration of the one used during training. This may lead to suboptimal results.


## Training Pipeline

In [5]:
# Three epochs at a learning rate of 5e-4.
splicegpt.train(
    lr=5e-4,
    epochs=3,
    save_at_end=True,
    evaluation=True,
    keep_best=True,
    save_freq=1,
)

Training Epoch 1/3:   0%|          | 0/300 [00:00<?, ?it/s]

Validating:   0%|          | 0/75 [00:00<?, ?it/s]

Training Epoch 2/3:   0%|          | 0/300 [00:00<?, ?it/s]

Validating:   0%|          | 0/75 [00:00<?, ?it/s]

Training Epoch 3/3:   0%|          | 0/300 [00:00<?, ?it/s]

Validating:   0%|          | 0/75 [00:00<?, ?it/s]

Model Successful Saved at models/batatinha123


## Eval Pipeline

In [16]:
# NOTE(review): the recorded output below reports "Exon accuracy: 0.0000" while
# overall and intron accuracy are both 0.8050. Identical overall/intron values
# suggest either that no exon samples reached this evaluation or that the
# per-class metric is mis-computed — confirm before relying on these numbers.
splicegpt.evaluate()

Evaluating:   0%|          | 0/94 [00:00<?, ?it/s]

Evaluation complete. Average loss: 0.3399
Overall Accuracy: 0.8050
Exon accuracy: 0.0000
Intron accuracy: 0.8050


## Prediction Pipeline

In [14]:
# Single-example prediction: show one record, then classify it.

idx = 8

print(
    f"Sequence to Predict: {sequence[idx]}",
    f"True Ground: {label[idx]}",
    f"Gene: {gene[idx]}",
    f"Organism: {organism[idx]}",
    f"Flank Before: {flank_before[idx]}",
    f"Flank After: {flank_after[idx]}",
    sep="\n",
)
# Only the raw sequence is passed to the model; the metadata printed above is not.
pred = splicegpt.predict_single({"sequence": sequence[idx]})
print(f"Prediction: {pred}")

Sequence to Predict: GTTTGTTTTTAGTTTTTTTTTTTTATTTTATTTAAAAATAAATAAATAAATATTAATATAG
True Ground: intron
Gene: 
Organism: Rotaria sp. Silwood2
Flank Before: CACCTTATTG
Flank After: ATGGCACCTG
Prediction: exon
