In [1]:
# Load the autoreload extension only when it is not already registered with
# this kernel, so the cell can be re-run without loading it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2 presumably reloads every imported module before each cell runs
# (useful while editing the `eugene` package) — confirm against the IPython docs.
%autoreload 2

# Explore

## Biopython

In [3]:
from Bio import motifs 

In [3]:
from Bio.Seq import Seq

# Build a toy motif from three 4-mers and print the result.
DNA_motif = [Seq(site) for site in ("AGCT", "TCGA", "AACT")]
seq = motifs.create(DNA_motif)
print(seq)

AGCT
TCGA
AACT



In [4]:
seq.counts

{'A': [2, 1, 0, 1], 'C': [0, 1, 2, 0], 'G': [0, 1, 1, 0], 'T': [1, 0, 0, 2]}

In [12]:
seq.pwm

{'A': (0.6666666666666666, 0.3333333333333333, 0.0, 0.3333333333333333),
 'C': (0.0, 0.3333333333333333, 0.6666666666666666, 0.0),
 'G': (0.0, 0.3333333333333333, 0.3333333333333333, 0.0),
 'T': (0.3333333333333333, 0.0, 0.0, 0.6666666666666666)}

In [13]:
seq.consensus

Seq('AACT')

In [14]:
seq.anticonsensus

Seq('CTAC')

In [18]:
seq.reverse_complement()

<Bio.motifs.Motif at 0x7f8cc320a990>

In [19]:
seq.instances

[Seq('AGCT'), Seq('TCGA'), Seq('AACT')]

In [20]:
seq.pssm

{'A': [1.4150374992788437, 0.4150374992788437, -inf, 0.4150374992788437],
 'C': [-inf, 0.4150374992788437, 1.4150374992788437, -inf],
 'G': [-inf, 0.4150374992788437, 0.4150374992788437, -inf],
 'T': [0.4150374992788437, -inf, -inf, 1.4150374992788437]}

In [23]:
# Motif.weblogo() downloads the logo from the remote WebLogo service (network
# access required) and saves it as an image; its default format is PNG, so the
# output file is given a matching ".png" extension instead of ".logo".
seq.weblogo("test_logo.png")



In [7]:
# Read the sample ".sites" file with the "sites" parser of `motifs.read`.
with open("../eugene/dataload/motif/resources/sample.sites", mode="r") as handle:
    data = motifs.read(handle, "sites")

In [9]:
print(data)

TF name	None
Matrix ID	None
Matrix:
        0      1      2      3      4      5
A:   2.00   5.00   0.00   0.00   0.00   1.00
C:   3.00   0.00   5.00   0.00   0.00   0.00
G:   0.00   1.00   1.00   6.00   0.00   5.00
T:   1.00   0.00   0.00   0.00   6.00   0.00





## PyJaspar

In [11]:
from pyjaspar import jaspardb

In [65]:
from eugene.dataload.motif._io import get_motifs_pyjaspar

In [74]:
# NOTE(review): this rebinds `motifs`, which an earlier cell imported as the
# `Bio.motifs` module (`from Bio import motifs`). The `motifs?` and
# `motifs.jaspar.Motif?` cells below display the module, so they were evidently
# run while `motifs` still referred to it; on a top-to-bottom run they would see
# this return value instead. Consider a distinct name (e.g. `jaspar_motifs`).
motifs = get_motifs_pyjaspar(collection="CORE", release="JASPAR2018")

In [81]:
motifs?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'Bio.motifs' from '/Users/adamklie/miniconda3/envs/eugene/lib/python3.7/site-packages/Bio/motifs/__init__.py'>
[0;31mFile:[0m        ~/miniconda3/envs/eugene/lib/python3.7/site-packages/Bio/motifs/__init__.py
[0;31mDocstring:[0m  
Tools for sequence motif analysis.

Bio.motifs contains the core Motif class containing various I/O methods
as well as methods for motif comparisons and motif searching in sequences.
It also includes functionality for parsing output from the AlignACE, MEME,
and MAST programs, as well as files in the TRANSFAC format.


In [80]:
motifs.jaspar.Motif?

[0;31mInit signature:[0m
[0mmotifs[0m[0;34m.[0m[0mjaspar[0m[0;34m.[0m[0mMotif[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmatrix_id[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malphabet[0m[0;34m=[0m[0;34m'ACGT'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstances[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcounts[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcollection[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtf_class[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtf_family[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mspecies[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtax_group[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0macc[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_type[0m[0;34m=

In [45]:
motif = motifs[0]

In [42]:
motif.pwm

{'A': (0.09529025191675794,
  0.1829134720700986,
  0.307776560788609,
  0.06133625410733844,
  0.008762322015334063,
  0.8148959474260679,
  0.04381161007667032,
  0.11732456140350878,
  0.9331140350877193,
  0.005488474204171241,
  0.36553238199780463,
  0.059275521405049394,
  0.013186813186813187,
  0.06153846153846154,
  0.11441144114411442,
  0.40924092409240925,
  0.09030837004405286,
  0.1288546255506608,
  0.44273127753303965),
 'C': (0.3187294633077766,
  0.1588170865279299,
  0.05366922234392114,
  0.8762322015334063,
  0.9890470974808324,
  0.014238773274917854,
  0.5783132530120482,
  0.47478070175438597,
  0.01206140350877193,
  0.0,
  0.003293084522502744,
  0.013172338090010977,
  0.0,
  0.008791208791208791,
  0.8063806380638063,
  0.014301430143014302,
  0.5308370044052864,
  0.35462555066079293,
  0.19933920704845814),
 'G': (0.0832420591456736,
  0.4534501642935378,
  0.49178532311062434,
  0.023001095290251915,
  0.0,
  0.07119386637458927,
  0.3658269441401972,
  

## PyMemeSuite

In [82]:
from pymemesuite.common import MotifFile

In [83]:
from eugene.dataload.motif._io import load_meme_pymemesuite

In [86]:
mot = load_meme_pymemesuite("../eugene/dataload/motif/resources/CPEs.meme")

In [91]:
one = mot[0][0]

In [102]:
one

<pymemesuite.common.Motif at 0x7f8ccb936780>

In [94]:
test = one.frequencies

In [101]:
test[1]

pymemesuite.common.Array([0.1574698456407955, 0.39823201677801773, 0.1989122543297929, 0.24538588325139368])

## EUGENe Motif

In [2]:
import numpy as np
from eugene.dataload.motif._Motif import Motif, MotifSet

Global seed set to 13
2023-01-03 20:44:46.285091: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### From `MEME` file

In [3]:
from eugene.dataload.motif._io import read_meme

In [4]:
motif_set = read_meme("../eugene/dataload/motif/resources/CPEs.meme")

In [5]:
motif_set["TATA"].pfm

array([[0.1275, 0.3765, 0.1195, 0.3765],
       [0.1575, 0.3985, 0.199 , 0.2455],
       [0.249 , 0.303 , 0.197 , 0.251 ],
       [0.1235, 0.655 , 0.0755, 0.1455],
       [0.01  , 0.002 , 0.002 , 0.986 ],
       [0.968 , 0.    , 0.    , 0.032 ],
       [0.002 , 0.014 , 0.006 , 0.978 ],
       [0.992 , 0.    , 0.002 , 0.006 ],
       [0.653 , 0.012 , 0.002 , 0.333 ],
       [0.974 , 0.    , 0.008 , 0.018 ],
       [0.341 , 0.028 , 0.036 , 0.5955],
       [0.6955, 0.0815, 0.1195, 0.1035],
       [0.1255, 0.432 , 0.3165, 0.1255],
       [0.291 , 0.418 , 0.175 , 0.1155],
       [0.263 , 0.3445, 0.1755, 0.2175],
       [0.307 , 0.3085, 0.2365, 0.1475]])

### From Biopython motifs

In [7]:
from eugene.dataload.motif._io import _load_jaspar
from eugene.dataload.motif._convert import _from_biopython

In [8]:
motifs = _load_jaspar(collection="CORE", release="JASPAR2018")

In [9]:
len(motifs)

1404

In [10]:
motif_set = _from_biopython(motifs)

In [11]:
motif_set["P53762"].pfm.shape

(6, 4)

In [13]:
from eugene.dataload.motif._io import load_jaspar

In [15]:
load_jaspar(collection="CORE", release="JASPAR2022")

Motif with 1787 motifs

### From PyMemeSuite motifs

In [16]:
from eugene.dataload.motif._io import _read_meme
from eugene.dataload.motif._convert import _from_pymemesuite

In [17]:
motifs, bg = _read_meme("../eugene/dataload/motif/resources/CPEs.meme")

In [19]:
motifs, bg

([<pymemesuite.common.Motif at 0x7fdaa5ed7f50>,
  <pymemesuite.common.Motif at 0x7fdaa5ed7190>,
  <pymemesuite.common.Motif at 0x7fdaa5ed71e0>,
  <pymemesuite.common.Motif at 0x7fdaa5ed70f0>,
  <pymemesuite.common.Motif at 0x7fdaa5ed7140>,
  <pymemesuite.common.Motif at 0x7fdaa5ed7be0>],
 pymemesuite.common.Array([0.2817739364521271, 0.22202005595988805, 0.22887604224791547, 0.2673299653400693]))

In [20]:
motif_set = _from_pymemesuite(motifs)

In [21]:
motif_set["TATA"].pfm.shape

(16, 4)

In [22]:
motif_set["TATA"].pfm

array([[1.27560240e-01, 3.76439680e-01, 1.19542708e-01, 3.76457372e-01],
       [1.57469846e-01, 3.98232017e-01, 1.98912254e-01, 2.45385883e-01],
       [2.49012797e-01, 3.02968380e-01, 1.97012447e-01, 2.51006376e-01],
       [1.23623558e-01, 6.55158469e-01, 7.55976433e-02, 1.45620329e-01],
       [1.01061202e-02, 2.08591178e-03, 2.08858885e-03, 9.85719379e-01],
       [9.67732048e-01, 8.66927200e-05, 8.93697939e-05, 3.20918899e-02],
       [2.10924402e-03, 1.40812261e-02, 6.08702696e-03, 9.77722503e-01],
       [9.91722676e-01, 8.66927200e-05, 2.08858885e-03, 6.10204216e-03],
       [6.52855046e-01, 1.20820071e-02, 2.08858885e-03, 3.32974358e-01],
       [9.73729705e-01, 8.66927200e-05, 8.08624601e-03, 1.80973565e-02],
       [3.40806526e-01, 2.80617720e-02, 3.60573288e-02, 5.95074374e-01],
       [6.95338451e-01, 8.15548692e-02, 1.19542708e-01, 1.03563971e-01],
       [1.25623778e-01, 4.32134032e-01, 3.16624053e-01, 1.25618138e-01],
       [2.91141913e-01, 4.18132498e-01, 1.75108547e

### From PFM/Motif files

In [23]:
#filename = "../eugene/dataload/motif/resources/known.motifs"
filename = "../eugene/dataload/motif/resources/MA0037.3.pfm"

In [24]:
from eugene.preprocess import decode_seq
from eugene.preprocess._utils import _token2one_hot

In [25]:
from eugene.dataload.motif._io import read_motifs

In [26]:
# Reuse the `filename` variable defined in the cell above instead of repeating
# the path literal, so switching input files only requires editing one place.
# NOTE(review): counts=True presumably marks the matrix as raw counts — confirm in read_motifs.
motif_set = read_motifs(filename, counts=True)

In [27]:
# NOTE(review): the lookup key keeps a leading ">" — presumably carried over
# verbatim from the header line of the .pfm file by `read_motifs`; confirm
# whether the motif name should be stripped. Other cells index the motif set
# directly (e.g. `motif_set["TATA"]`) rather than through `.motifs`.
pfm = motif_set.motifs[">MA0037.3"].pfm

In [30]:
pfm.shape

(8, 4)

### To kernels

In [31]:
from eugene.dataload.motif._convert import _to_array

In [32]:
motifs = read_meme("../eugene/dataload/motif/resources/CPEs.meme")

In [50]:
tensor = _to_array(
    size=(10, 4, 16),
    motifs=motifs,
    convert_to_pwm=False
)

In [51]:
tensor[0].transpose(1, 0)

tensor([[0.1275, 0.3765, 0.1195, 0.3765],
        [0.1575, 0.3985, 0.1990, 0.2455],
        [0.2490, 0.3030, 0.1970, 0.2510],
        [0.1235, 0.6550, 0.0755, 0.1455],
        [0.0100, 0.0020, 0.0020, 0.9860],
        [0.9680, 0.0000, 0.0000, 0.0320],
        [0.0020, 0.0140, 0.0060, 0.9780],
        [0.9920, 0.0000, 0.0020, 0.0060],
        [0.6530, 0.0120, 0.0020, 0.3330],
        [0.9740, 0.0000, 0.0080, 0.0180],
        [0.3410, 0.0280, 0.0360, 0.5955],
        [0.6955, 0.0815, 0.1195, 0.1035],
        [0.1255, 0.4320, 0.3165, 0.1255],
        [0.2910, 0.4180, 0.1750, 0.1155],
        [0.2630, 0.3445, 0.1755, 0.2175],
        [0.3070, 0.3085, 0.2365, 0.1475]])

### To `MEME` file

### To PFM/Motif files

# Model convolutional layers

In [201]:
from eugene.models._sequence_to_function import DeepSTARR 

2022-12-20 19:05:35.855539: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [203]:
model = DeepSTARR(input_len=249, output_dim=2)

None None regression
r2score
{}
R2Score()
{'regression': {}, 'binary_classification': {'task': 'binary'}, 'multiclass_classification': {'task': 'multiclass'}, 'multilabel_classification': {'task': 'multilabel'}}


In [204]:
def get_layer(model, layer_name):
    """Return the first entry of ``model.layers`` whose ``name`` equals ``layer_name``.

    Parameters
    ----------
    model
        Any object exposing an iterable ``layers`` attribute whose items have a
        ``name`` attribute.
    layer_name
        Name to look up; compared with ``==``.

    Returns
    -------
    The first matching layer, in iteration order.

    Raises
    ------
    IndexError
        If no layer has the requested name. This is the same exception type
        the previous list-indexing implementation raised; the message now
        names the missing layer.
    """
    # Stop at the first match instead of materialising every match.
    for layer in model.layers:
        if layer.name == layer_name:
            return layer
    raise IndexError(f"no layer named {layer_name!r} in model.layers")

DeepSTARR(
  (train_metric): R2Score()
  (val_metric): R2Score()
  (test_metric): R2Score()
  (conv1d_tower): Conv1DTower(
    (layers): Sequential(
      (0): Conv1d(4, 246, kernel_size=(7,), stride=(1,), padding=same)
      (1): BatchNorm1d(246, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (4): Conv1d(246, 60, kernel_size=(3,), stride=(1,), padding=same)
      (5): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (8): Conv1d(60, 60, kernel_size=(5,), stride=(1,), padding=same)
      (9): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU()
      (11): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (12): Conv1d(60, 120, kernel_size=(3,), str

---