<a href="https://colab.research.google.com/github/MiqG/ColabSplice/blob/main/notebooks/Borzoi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Borzoi**

## Model information
- original repository: https://github.com/calico/borzoi
- original paper: https://doi.org/10.1038/s41588-024-02053-6
- pytorch version:
  - repository: https://github.com/johahi/borzoi-pytorch
  - paper: https://doi.org/10.1101/2024.12.18.629121

## Tips and Instructions
- Click the little ▶ play icon to the left of each cell below, or use Runtime > Run all to give it a try. This will produce a zip file called `test_SMN2-outputs.zip`, containing:
  - splice site strength predictions:
    - table: `test_SMN2-scores.tsv`
    - plot: `test_SMN2-coverage-line.pdf`
  - bigwig files that can be visualized through [IGV](https://igv.org/):
    - `test_SMN2-RNA:adipose_tissue.bw`
    - `test_SMN2-RNA:adrenal_gland.bw`
    - `test_SMN2-RNA:bladder.bw`
    - `test_SMN2-RNA:blood.bw`
    - `test_SMN2-RNA:blood_vessel.bw`
    - `test_SMN2-RNA:brain.bw`
    - `test_SMN2-RNA:breast.bw`
    - `test_SMN2-RNA:cervix_uteri.bw`
    - `test_SMN2-RNA:colon.bw`
    - `test_SMN2-RNA:esophagus.bw`
    - `test_SMN2-RNA:fallopian_tube.bw`
    - `test_SMN2-RNA:heart.bw`
    - `test_SMN2-RNA:kidney.bw`
    - `test_SMN2-RNA:liver.bw`
    - `test_SMN2-RNA:lung.bw`
    - `test_SMN2-RNA:muscle.bw`
    - `test_SMN2-RNA:nerve.bw`
    - `test_SMN2-RNA:ovary.bw`
    - `test_SMN2-RNA:pancreas.bw`
    - `test_SMN2-RNA:pituitary.bw`
    - `test_SMN2-RNA:prostate.bw`
    - `test_SMN2-RNA:salivary_gland.bw`
    - `test_SMN2-RNA:skin.bw`
    - `test_SMN2-RNA:small_intestine.bw`
    - `test_SMN2-RNA:spleen.bw`
    - `test_SMN2-RNA:stomach.bw`
    - `test_SMN2-RNA:testis.bw`
    - `test_SMN2-RNA:thyroid.bw`
    - `test_SMN2-RNA:uterus.bw`
    - `test_SMN2-RNA:vagina.bw`
     
- The notebook runs fine without a GPU.



In [None]:
%%time
#@title ## Install packages (~1min 30s)
import os, time
# The marker file makes re-running this cell cheap: packages are installed only once
# per runtime. It is written only after every install command succeeded, so a failed
# install is retried on the next run instead of being silently skipped.
if not os.path.isfile("finished_install"):
  install_commands = [
      # genomic file handling
      "pip install -q pyfaidx==0.8.1.1 pyranges pyBigWig pyrle biopython",
      # install Borzoi
      "pip install -q borzoi-pytorch",
      # install visualization
      "pip install -q igv-notebook",
  ]
  for command in install_commands:
    # os.system returns the command's exit status; anything non-zero means failure
    if os.system(command) != 0:
      raise RuntimeError(f"Installation step failed: {command}")

  os.system("touch finished_install")

# imports
import pyfaidx
import pyranges as pr
import hashlib, re, os
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import igv_notebook
import shutil
from google.colab import files
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from borzoi_pytorch import Borzoi
from borzoi_pytorch.config_borzoi import BorzoiConfig
from borzoi_pytorch.pytorch_borzoi_helpers import predict_tracks
import random

# notebook-wide variables
# Total number of nucleotides every model input is padded/extended to
# (get_margins() computes the flanks needed to reach this length).
MAX_SEQ_LEN = 524288

# Track metadata table, downloaded from the borzoi-pytorch repository when this cell runs.
targets = pd.read_table("https://github.com/johahi/borzoi-pytorch/raw/refs/heads/main/borzoi_pytorch/precomputed/targets.txt")
# Keep only rows whose identifier contains "GTEX". `.loc` with a boolean mask keeps the
# original row labels, which predict() later uses (targets.index) to select output channels.
targets = targets.loc[targets["identifier"].str.contains("GTEX")]
# Same object as `targets`; this is the name the later cells use for the track table.
TISSUES = targets

# Genome label shown in the form -> URL of its gzipped FASTA.
GENOME_FASTA_URLS = {
    "homo_sapiens (hg38-GencodeV44)": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz"
}

# functions
def install_genome(species, genome_fasta_urls, fasta_file):
  """Download and decompress the genome FASTA once per runtime.

  Does nothing when the marker file "finished_install_genome" already exists.
  The marker is written only after both the download and the decompression
  succeeded, so a failed attempt is retried on the next call.

  Args:
    species (str): key into `genome_fasta_urls`.
    genome_fasta_urls (dict): maps species to the URL of a gzipped FASTA.
    fasta_file (str): local path for the downloaded (still gzipped) file;
      gunzip then produces the same path without its ".gz" suffix.

  Raises:
    RuntimeError: if the download or the decompression command fails.
  """
  if os.path.isfile("finished_install_genome"):
    return

  print(f"Downloading genome for {species}...")
  # os.system returns the exit status of the shell command; non-zero means failure
  if os.system(f"wget {genome_fasta_urls[species]} -O {fasta_file}") != 0:
    raise RuntimeError(f"Could not download genome for {species}.")

  print("Uncompressing genome fasta...")
  # -f overwrites a decompressed file left behind by an earlier, interrupted attempt
  if os.system(f"gunzip -f {fasta_file}") != 0:
    raise RuntimeError(f"Could not uncompress {fasta_file}.")

  os.system("touch finished_install_genome")

def get_hash(x):
  """Return the hexadecimal SHA-1 digest of the string `x`."""
  digest = hashlib.sha1()
  digest.update(x.encode())
  return digest.hexdigest()

def save_fasta(sequence, filename, header="sequence"):
  """Write `sequence` to `filename` as a single-record FASTA file.

  Args:
    sequence (str): sequence to store.
    filename (str): output path; an existing file is overwritten.
    header (str): record id; the description is left empty.
  """
  fasta_record = SeqRecord(Seq(sequence), id=header, description="")
  with open(filename, "w") as handle:
    SeqIO.write(fasta_record, handle, "fasta")

def reverse_complement(seq, padval_seq="N"):
  """Return the reverse complement of a DNA string.

  A/C/G/T are complemented in both upper and lower case; the characters in
  `padval_seq` map to themselves and any other character is left unchanged.
  """
  bases = padval_seq + "ACGTacgt"
  partners = padval_seq + "TGCAtgca"
  table = str.maketrans(dict(zip(bases, partners)))
  return "".join(reversed(seq.translate(table)))

def load_models():
  """Load the pretrained 'johahi/borzoi-replicate-0' model.

  The configuration is loaded first so that `return_center_bins_only` can be
  switched off before the weights are instantiated.

  Returns:
    list: a one-element list holding the model, so callers can iterate over it.
  """
  checkpoint = 'johahi/borzoi-replicate-0'
  config = BorzoiConfig.from_pretrained(checkpoint)
  config.return_center_bins_only = False
  return [Borzoi.from_pretrained(checkpoint, config=config)]

def dna_1hot(
    seq: str, seq_len: int = None, n_uniform: bool = False, n_sample: bool = False
):
  """Convert a DNA sequence to a 1-hot encoding.

  Columns are ordered A, C, G, T. The sequence is upper-cased first. When
  `seq_len` is shorter than the sequence, the sequence is trimmed around its
  centre; when it is longer, the sequence is centred and the flanking rows
  stay all-zero.

  Args:
    seq (str): DNA sequence.
    seq_len (int): length to extend/trim sequences to (default: len(seq)).
    n_uniform (bool): represent N's (any non-ACGT character) as 0.25,
      forcing float16. Takes precedence over `n_sample`.
    n_sample (bool): sample ACGT for N using the `random` module.

  Returns:
    seq_code (np.array): (seq_len, 4) 1-hot encoding of DNA sequence; dtype is
      float16 when `n_uniform` is set, bool otherwise.
  """
  if seq_len is None:
    seq_len = len(seq)
    seq_start = 0
  elif seq_len <= len(seq):
    # trim the sequence
    seq_trim = (len(seq) - seq_len) // 2
    seq = seq[seq_trim : seq_trim + seq_len]
    seq_start = 0
  else:
    seq_start = (seq_len - len(seq)) // 2

  seq = seq.upper()

  # map nt's to a matrix seq_len x 4 of 0's and 1's.
  seq_code = np.zeros((seq_len, 4), dtype="float16" if n_uniform else "bool")

  # One byte per character; non-ASCII characters become "?" and are therefore
  # treated like any other unknown nucleotide. The slice guards against the
  # rare case where upper-casing lengthened the string.
  nt_bytes = np.frombuffer(seq.encode("ascii", errors="replace"), dtype=np.uint8)
  nt_bytes = nt_bytes[: seq_len - seq_start]

  # byte value -> column index, -1 for anything that is not A/C/G/T
  lookup = np.full(256, -1, dtype=np.int8)
  for column, nt in enumerate("ACGT"):
    lookup[ord(nt)] = column

  columns = lookup[nt_bytes]
  rows = np.arange(seq_start, seq_start + nt_bytes.size)
  is_acgt = columns >= 0
  seq_code[rows[is_acgt], columns[is_acgt]] = 1

  other_rows = rows[~is_acgt]
  if n_uniform:
    seq_code[other_rows, :] = 0.25
  elif n_sample:
    # one draw per unknown base, left to right, so seeded runs are reproducible
    for row in other_rows:
      seq_code[row, random.randint(0, 3)] = 1

  return seq_code

def one_hot_encode(seq):
  """One-hot encode `seq` with dna_1hot() using its default options."""
  return dna_1hot(seq)

def get_margins(seq_len, max_seq_len=None):
  """Split the padding needed to reach the model input length into two flanks.

  Args:
    seq_len (int): length of the sequence of interest.
    max_seq_len (int): total length to reach; defaults to MAX_SEQ_LEN.

  Returns:
    (margin_upstream, margin_downstream): two ints such that
    margin_upstream + seq_len + margin_downstream == max_seq_len. When the
    padding is odd, the extra base goes downstream.

  Raises:
    ValueError: if `seq_len` is longer than the total length, which would
      otherwise produce negative margins.
  """
  if max_seq_len is None:
    max_seq_len = MAX_SEQ_LEN

  total_padding = max_seq_len - seq_len
  if total_padding < 0:
    raise ValueError(
        f"Sequence length ({seq_len}) exceeds the maximum input length ({max_seq_len})."
    )

  # integer arithmetic avoids the float division and the separate odd/even branches
  margin_upstream, remainder = divmod(total_padding, 2)
  margin_downstream = margin_upstream + remainder

  return margin_upstream, margin_downstream

def predict(sequence, strand, models):
  """Run every model on one sequence and return per-position tissue tracks.

  Args:
    sequence (str): nucleotide sequence; it is one-hot encoded with
      one_hot_encode() before being passed to the models.
    strand (str): "-" reverse-complements the input and flips the predicted
      bins back afterwards; any other value leaves the input as is.
    models (list): callables (torch modules) applied to the encoded sequence.

  Returns:
    list: one DataFrame per model. Columns are the distinct
    TISSUES["description"] values (mean over all tracks sharing a description);
    each predicted bin is repeated 32 times and the index is reset.
  """

  print("Computing prediction across different tissues...")

  if strand == '-':
    sequence = reverse_complement(sequence)

  # one_hot_encode returns (length, 4); transpose to (4, length) and add a leading batch axis
  sequence = one_hot_encode(sequence).T
  sequence = torch.from_numpy(sequence).float().unsqueeze(0)
  if torch.cuda.is_available():
    sequence = sequence.cuda()

  scores = []
  for fold_idx, model in enumerate(models):
    model.eval()
    with torch.no_grad():
      # predict
      output = model(sequence).cpu().detach()
      # indexed below as (batch, track, bin)
      # NOTE(review): the bin count was previously documented here as 6144, but the
      # notebook trims 512 bases per side elsewhere, which suggests more bins when
      # return_center_bins_only is False — confirm against borzoi-pytorch.
      output = output.numpy(force=True)
      # subset features of interest; relies on targets.index holding the row numbers
      # of the selected tracks in the full targets table
      output = output[0,targets.index,:]
      scores.append(output)

  if strand == '-':
    # reverse the bins
    scores = [o[:,::-1] for o in scores]

  # prepare outputs
  # rows = bins, columns = track identifiers
  scores = [pd.DataFrame(score.T, columns=TISSUES["identifier"]) for score in scores]
  for idx in range(len(scores)):
    # attach each track's description, average the tracks that share one,
    # then transpose back to rows = bins, columns = descriptions
    scores[idx] = pd.merge(
        scores[idx].T, TISSUES[["identifier","description"]],
        left_index=True, right_on="identifier", how="left"
    ).drop(
        "identifier",
        axis=1
    ).groupby(
        "description"
    ).mean().T

    # repeat every bin 32 times (presumably the bin width in bases — TODO confirm)
    scores[idx] = scores[idx].loc[scores[idx].index.repeat(32)].reset_index(drop=True)

  return scores

In [None]:
#@title ##run **Borzoi**
%%time

# user inputs
# @markdown Run settings
jobname = "test_SMN2" #@param {type:"string"}
jobname = re.sub(r'\W+', '', jobname)[:50]

predict_from = "genomic_coordinates" #@param ["custom_sequence","genomic_coordinates"]

download_outputs = True #@param {type:"boolean"}
igv_view = True #@param {type:"boolean"}

# @markdown ---

# Number of bases at each end of the model input that get no prediction row.
# NOTE(review): 512 is the value this notebook already used for its slicing; it
# presumably reflects the model's output cropping — confirm against borzoi-pytorch.
edge_crop = 512

if predict_from == "genomic_coordinates":
  # @markdown Genomic coordinates (NOTE: requires installing genome at first run (~3min)):
  # coordinates
  chrom = "chr5" #@param {type:"string"}
  start = 70049669 #@param {type:"integer"}
  end = 70077595 #@param {type:"integer"}
  strand = "+" #@param ["-","+"]
  margin_upstream, margin_downstream = get_margins(end - start)

  # genome
  species = "homo_sapiens (hg38-GencodeV44)" #@param ["homo_sapiens (hg38-GencodeV44)"]
  fasta_file = os.path.basename(GENOME_FASTA_URLS[species])
  install_genome(species, GENOME_FASTA_URLS, fasta_file)
  fasta_file = fasta_file.replace(".gz","")

  print("Loading genome data...")
  genome = pyfaidx.Fasta(fasta_file)
  # extend the requested region on both sides up to the model input length
  sequence = genome[chrom][(start-margin_upstream):(end+margin_downstream)].seq

  # coordinate assigned to each prediction row
  positions = np.arange(start-margin_upstream+edge_crop, end+margin_downstream-edge_crop)

elif predict_from == "custom_sequence":
  # @markdown ---
  # @markdown Custom pre-mRNA sequence (try copy-pasting [FAS sequence](https://www.ncbi.nlm.nih.gov/nuccore/NC_000010.11?report=fasta&from=88964050&to=89017059)):
  sequence = "" # @param {"type":"string","placeholder":"Paste nucleotide sequence here"}
  # normalise before padding so that the saved FASTA matches what the model sees
  sequence = sequence.replace(" ","").upper().replace("U","T")
  chrom = f"{jobname}"
  strand = "+"
  insert_len = len(sequence)
  margin_upstream, margin_downstream = get_margins(insert_len)
  # upstream padding goes in front of the sequence, downstream padding behind it
  sequence = margin_upstream*"N" + sequence + margin_downstream*"N"

  # save custom sequence as fasta
  fasta_file = f"{jobname}.fa"
  save_fasta(sequence[edge_crop:-edge_crop], fasta_file, header=f"{jobname}")
  genome = pyfaidx.Fasta(fasta_file)

  # Coordinates are 1-based positions within the saved (cropped) FASTA: one per
  # prediction row, so the table, the bigwigs and the IGV reference all agree.
  positions = np.arange(1, len(sequence) - 2*edge_crop + 1)
  # where the pasted sequence sits inside that FASTA (used as the IGV locus)
  start = max(margin_upstream - edge_crop, 0) + 1
  end = min(margin_upstream - edge_crop + insert_len, len(positions))

else:
  raise ValueError(f"Unknown predict_from option: {predict_from}")

sequence = sequence.upper().replace("U","T")
invalid_characters = set(sequence) - {"A", "C", "T", "G", "N"}
if invalid_characters:
  raise ValueError(f"Sequence contains unsupported characters: {sorted(invalid_characters)}")
# a window that runs off a chromosome end, or an over-long custom sequence, ends up
# with the wrong length and could not be matched to `positions` afterwards
if len(sequence) != MAX_SEQ_LEN:
  raise ValueError(
      f"Model input has {len(sequence)} bases but {MAX_SEQ_LEN} are required; "
      "check that the region (plus margins) fits inside the chromosome."
  )

ID = jobname+"_"+get_hash(sequence)[:5]

# load model
models = load_models()
if torch.cuda.is_available():
  for model in models:
    model.cuda()

# make prediction
print(f"Using {predict_from} to make prediction.")
scores = predict(sequence, strand, models)

# add more info
if strand=="-":
  # predict() already flipped the '-' strand tracks back into input order, so keep
  # that order here and only complement the bases
  sequence = reverse_complement(sequence)[::-1]

for score in scores:
  score["chrom"] = chrom
  score["position"] = positions
  score["sequence"] = list(sequence[edge_crop:-edge_crop])

# save
scores = scores[0]
filename_scores = f"{jobname}-scores.tsv"
scores.to_csv(filename_scores, index=False, sep="\t")

In [None]:
#@title PDF plot

# Reshape the wide score table (one column per tissue) into long format:
# one row per (position, tissue) with the prediction in the "value" column.
X = scores.melt(id_vars=["chrom","position","sequence"], var_name="tissue")

# One line panel per tissue, stacked vertically.
g = sns.relplot(data=X, x="position", y="value", hue="tissue", row="tissue", kind="line", height=1.5, aspect=10)
# Blank the per-panel axis labels and write a single shared label for each axis instead.
g.set_xlabels("")
g.set_ylabels("", clear_inner=False)
g.figure.text(0.5, 0.02, "Nucleotide Position", ha="center", fontsize=10)
g.figure.text(0.0, 0.5, "Predicted Splice Site Strength", va="center", rotation="vertical", fontsize=10)

# The file name is reused by the download cell.
filename_plt = f"{jobname}-coverage-line.pdf"
g.figure.savefig(filename_plt, format="pdf", bbox_inches="tight")

plt.show()

In [None]:
#@title Convert scores to BigWig (and IGV view)

# make bigwigs
# Sequence lengths are the same for every tissue, so look them up once.
chromosome_sizes = {name: len(genome[name]) for name in genome.keys()}

for tissue in TISSUES["description"].unique():
  # one interval of width 1 per prediction row, carrying that tissue's score
  ranges = scores[["chrom","position",tissue]].copy()
  ranges.columns = ["Chromosome","Start","Score"]
  # shift positions down by one to get the interval start; End is Start + 1
  ranges["Start"] = ranges["Start"] - 1
  ranges["End"] = ranges["Start"] + 1
  ranges["Name"] = "U0"
  ranges["Strand"] = strand
  ranges = ranges[["Chromosome","Start","End","Score","Name","Strand"]].copy()
  ranges = pr.PyRanges(ranges)
  ranges.to_bigwig(f"{jobname}-{tissue}.bw", chromosome_sizes, value_col="Score", rpm=False)

# make tracks
# Embedded IGV browser: the reference is the local FASTA used for the prediction,
# and one bigwig track is added per tissue.
if igv_view:
  igv_notebook.init()

  if predict_from=="genomic_coordinates":
    # reference = the downloaded genome FASTA and its .fai index
    b = igv_notebook.Browser({
        "reference": {
            "name": f"{species}",
            "fastaPath": f"./{fasta_file}",
            "indexPath": f"./{fasta_file}.fai"
        },
        "locus": f"{chrom}:{start}-{end}"
    })

    # remote gene annotation tracks (URLs point at hg38 resources)
    b.load_track({
          "name": "Gencode v44",
          "url": "https://igv-genepattern-org.s3.amazonaws.com/genomes/hg38/gencode.v44.basic.annotation.gff3.gz",
          "indexURL": "https://igv-genepattern-org.s3.amazonaws.com/genomes/hg38/gencode.v44.basic.annotation.gff3.gz.tbi",
          "order": 1,
          "format": "gff3",
          "type": "annotation"
    })

    b.load_track({
          "name": "Refseq Select",
          "format": "refgene",
          "url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/ncbiRefSeqSelect.txt.gz",
          "indexed": False,
          "order": 2,
          "infoURL": "https://www.ncbi.nlm.nih.gov/gene/?term=$$",
          "type": "annotation",
          "height": 70
    })

  else:
    # custom sequence: the reference is the FASTA written by the run cell; no annotation tracks
    b = igv_notebook.Browser({
      "reference": {
          "name": "custom",
          "fastaPath": f"./{fasta_file}",
          "indexPath": f"./{fasta_file}.fai"
      },
      "locus": f"{chrom}:{start}-{end}"
    })

  # one colour per tissue, paired with the tissues in the order unique() returns them
  colors = sns.color_palette(n_colors=len(TISSUES["description"].unique())).as_hex()[:]
  for tissue, color in zip(TISSUES["description"].unique(), colors):
    # file names match the bigwigs written in the loop above
    b.load_track({
        "name": tissue,
        "path": f"./{jobname}-{tissue}.bw",
        "format": "bigwig",
        "type": "wig",
        "color": color,
        "graphType": "line",
    })

  b.zoom_in()

In [None]:
#@title Download data

# prep outputs
## gather the score table, the plot and one bigwig per tissue in a single folder
output_folder = f"{jobname}-outputs"
os.makedirs(output_folder, exist_ok=True)

bigwig_files = [f"{jobname}-{tissue}.bw" for tissue in TISSUES["description"].unique()]
for output_file in [filename_scores, filename_plt, *bigwig_files]:
  shutil.copy(output_file, os.path.join(output_folder, output_file))

## zip the folder (creates "<output_folder>.zip" next to it)
shutil.make_archive(output_folder, 'zip', output_folder)

## download folder if user wants to
if download_outputs:
  files.download(f"{output_folder}.zip")