In [2]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np
from nn import nn, preprocess, io
import matplotlib.pyplot as plt

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the two sequence collections through the project's `io` helpers.
# NOTE(review): the file names suggest Rap1 binding-site positives (plain text)
# and yeast upstream negatives (FASTA) -- confirm against the data source.
positives = io.read_text_file("data/rap1-lieb-positives.txt")
negatives = io.read_fasta_file("data/yeast-upstream-1k-negative.fa")

In [5]:
# Number of positive sequences that were loaded.
len(positives)

137

In [6]:
# Flatten both collections into a single list: positives first, negatives after.
seqs = [*positives, *negatives]
len(seqs)

3300

In [7]:
# One boolean label per sequence: True for every positive, then False for
# every negative.
labels = [True] * len(positives)
labels.extend([False] * len(negatives))
len(labels)

3300

In [8]:
# NumPy copies of the sequence and label lists.
seqs_array, labels_array = np.array(seqs), np.array(labels)

In [9]:
# The labels act as a mask: its nonzero positions are the positive sequences,
# and the nonzero positions of the negated mask are the negative sequences.
pos_seqs = np.flatnonzero(labels_array)
neg_seqs = np.flatnonzero(np.logical_not(labels_array))

# Empty accumulators for the sampled sequences and their labels.
sampled_seqs, sampled_labels = [], []

# Display the indices of the negative sequences.
neg_seqs

array([ 137,  138,  139, ..., 3297, 3298, 3299])

In [10]:
# Exploratory walk-through of class-balanced sampling with replacement:
# each draw picks the positive or the negative pool with equal probability,
# then picks one index uniformly at random from that pool. Every intermediate
# value is printed so the logic can be checked by eye.
N_DEBUG_DRAWS = 10  # kept small on purpose because every draw prints its state

# Start from empty lists inside this cell so that re-running it does not keep
# appending to the results of a previous run.
sampled_seqs = []
sampled_labels = []

for seq_idx in range(N_DEBUG_DRAWS):
    print('Loop ', seq_idx)

    # Fair coin flip: below 0.5 selects the positives, otherwise the negatives.
    if np.random.uniform() < 0.5:
        pool_name, pool = 'pos', pos_seqs
        print('Sample positive')
    else:
        pool_name, pool = 'neg', neg_seqs
        print('Sample negative')

    # Position inside the chosen pool, drawn uniformly from [0, len(pool)).
    sample = np.random.randint(len(pool))
    # Translate the pool position into an index into `seqs` / `labels`.
    chosen = pool[sample]

    print('Sample: ', sample)
    print(pool_name.capitalize() + '_seqs[sample]: ', chosen)
    print('labels[ ' + pool_name + '_seqs[sample]: ', labels[chosen])
    print('seqs[ ' + pool_name + '_seqs[sample]: ', seqs[chosen])

    sampled_labels.append(labels[chosen])
    sampled_seqs.append(seqs[chosen])
    print('Sampled_labels: ', sampled_labels)
    print('Sampled_seqs: ', sampled_seqs)

Loop  0
Sample positive
Sample:  100
Pos_seqs[sample]:  100
labels[ pos_seqs[sample]:  True
seqs[ pos_seqs[sample]:  GAACCCAAACATTATAG
Sampled_labels:  [True]
Sampled_seqs:  ['GAACCCAAACATTATAG']
Loop  1
Sample negative
Sample:  2909
Neg_seqs[sample]:  3046
labels[ neg_seqs[sample]:  False
seqs[ neg_seqs[sample]:  TGTCACAGAAATCTGCTACAAAAACTGCCGCTACATGCTCTGCAGTTGGTGCAACAGGAGGGTTTAATAAGACCGCATGAACTGCATCTAGCGCTCTCTTTCTAACTTTTGGTCTTACATCTAAAGAAAGTTCAAGTATACCCTGCAAACCTCTTTTTGGAGTGACGTTTAGATCATAAGTATTGTTCCAGGCTTGTGCATCTTGAGCAATAAGAAGAGATTCTAAACAACCAATAGCAGCTCTAATTAGAGGGGCATTAGCCTTTTCTGCAGTGATACATGGGGCAATTTTCGTTAGGATTTCAGAAAACTTCGATCTTAATAATACCTTAGGCGAATAATGAAATATCAGATCTAGAAGATATGTAGATGAGGAAGCCAATTGAATATCTTTTATTTCATGAGTAGATGGGTCTACTGCTTGATCCAATAGTGACATAAAGGAAATGATATAATTTACAATGTTTTTTGAAACATCATTTGTAGCTTGACCTGCAATATTTTCCTCAACGGCAGTTAATATTATAGCAATATGCTTTTGATTCTCCAATTTAGAGTTTACTTGGGAACGAATTTTAGCCAGCTTGTCCTCCAGCTCTAAAAGAAAAGCAACTTTGTCTTGATCCATGATGAACTCTGAGATTTGCTGATTCAATTCGCGAGATGCTGTTCAAAATCTAGAAA

In [11]:
# Replace the exploratory sampling results with the output of the project's
# sampling routine, applied to the full sequence and label lists.
# NOTE(review): presumably this draws a class-balanced sample with
# replacement, mirroring the walk-through loop -- confirm in nn/preprocess.
sampled_seqs, sampled_labels = preprocess.sample_seqs(seqs, labels)

In [12]:
# Number of sequences returned by preprocess.sample_seqs.
len(sampled_seqs)

3300

In [13]:
# Number of labels returned by preprocess.sample_seqs; expected to equal the
# number of sampled sequences.
len(sampled_labels)

3300

In [34]:
# Distinct characters occurring in the third sampled sequence, next to the
# four-letter DNA alphabet they can be compared against.
# NOTE(review): for a string this yields a set of single characters; a
# recorded output showing the whole sequence as one element would be stale.
sampled_set = {*sampled_seqs[2]}
alphabet = set('ATCG')
sampled_set

{'AGACCCATACATCATGA'}

In [33]:
# Display the third sampled sequence wrapped in a one-element list.
[sampled_seqs[2]]

['AGACCCATACATCATGA']

In [26]:
# Number of distinct elements in sampled_set.
len(sampled_set)

4

In [25]:
# Number of symbols in the reference alphabet.
len(alphabet)

4

In [37]:
# Display the sampled sequences at positions 1 through 9.
sampled_seqs[1:10]

['ATACGACTGCGTTTAACGATTCCTGTGTAAACCTGAGTCTTTTTGATGCTCGGTTTGAGAGGAAAAATCCACATTGATCTCAGAATATATCCAAATGGATAAATTATAAATTTACCAATAACAGTAATTATGTGTCAGTTTTAATACCCAACCAATTGATTCCTCTAAATGATACCGTACCAAATGACATGTTTGCCACCGATTATAAAACTGGGGTTTATGAGCGATGGTTAAGGAGGATTAAGACAAAGTGCTCATCCGCTTACAAGATCAGTACTAGCGTGTACCAGTCTATTTAAACTGGAATGATCACCCCAAAGGGAACGCACGATGCTGTGGCCAAGTTTCAAAAAACTGACCTGCATCAAGATCTCGATTACATCGTACTGCAACAACGAAGAACACAGCTAGAGACGCTTATTAACGAAAGAGAATCTTTTGTCAAGAATCTGTGTTCTCTTTTCCACAAAATTCAAAATACCAAGAATTACCAGGAATTTGTTGATGTATTGGCGGAGAATAGGGATTTACTGCGAGAAATTTTCACTGTAGAGAACGGATTTCAAAAGCAAAAATGGATTAGCAACGACGACATTCCCCAGATAGACTGGGACAAGTTTGCCCTAGATATCAACGCTTATATAGCAGAGAACGATCAATTGTTGGCTTTGTATGAAGATGGCTTATTATGATCAAATAGCGGCTATGGACATTTTTAAACATATATTTATATATATATACATATATGCGTATATTGCATAAATCACAAAGAAGAACAACGCCCTAGATATAGTGACCCAAAATATTATGTTTAAGTTACTGGTTGGGGTTCATGTACATTTTTCACTATCTTTTAATTCCAAATGTGGATTGTGCTTCTGCAGATTTTGTTGCCTAGTATTCTGTGATGGAAAAAATTGCCCCGATGAGATATAAAATGAGCTAGCCCCCTAACCAAAGAAAGCAGGATTGGTTAGTACATAGAGAAACCAAAGCTG

In [38]:
# One-hot encode the sampled sequences at positions 1 through 9 with the
# project's encoder, then display the result.
# NOTE(review): displaying the full encoding produces a very large output;
# consider showing only a short slice or the lengths instead.
sample_encode = preprocess.one_hot_encode_seqs(sampled_seqs[1:10])
sample_encode

ATACGACTGCGTTTAACGATTCCTGTGTAAACCTGAGTCTTTTTGATGCTCGGTTTGAGAGGAAAAATCCACATTGATCTCAGAATATATCCAAATGGATAAATTATAAATTTACCAATAACAGTAATTATGTGTCAGTTTTAATACCCAACCAATTGATTCCTCTAAATGATACCGTACCAAATGACATGTTTGCCACCGATTATAAAACTGGGGTTTATGAGCGATGGTTAAGGAGGATTAAGACAAAGTGCTCATCCGCTTACAAGATCAGTACTAGCGTGTACCAGTCTATTTAAACTGGAATGATCACCCCAAAGGGAACGCACGATGCTGTGGCCAAGTTTCAAAAAACTGACCTGCATCAAGATCTCGATTACATCGTACTGCAACAACGAAGAACACAGCTAGAGACGCTTATTAACGAAAGAGAATCTTTTGTCAAGAATCTGTGTTCTCTTTTCCACAAAATTCAAAATACCAAGAATTACCAGGAATTTGTTGATGTATTGGCGGAGAATAGGGATTTACTGCGAGAAATTTTCACTGTAGAGAACGGATTTCAAAAGCAAAAATGGATTAGCAACGACGACATTCCCCAGATAGACTGGGACAAGTTTGCCCTAGATATCAACGCTTATATAGCAGAGAACGATCAATTGTTGGCTTTGTATGAAGATGGCTTATTATGATCAAATAGCGGCTATGGACATTTTTAAACATATATTTATATATATATACATATATGCGTATATTGCATAAATCACAAAGAAGAACAACGCCCTAGATATAGTGACCCAAAATATTATGTTTAAGTTACTGGTTGGGGTTCATGTACATTTTTCACTATCTTTTAATTCCAAATGTGGATTGTGCTTCTGCAGATTTTGTTGCCTAGTATTCTGTGATGGAAAAAATTGCCCCGATGAGATATAAAATGAGCTAGCCCCCTAACCAAAGAAAGCAGGATTGGTTAGTACATAGAGAAACCAAAGCTGTT

[[1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
