In [1]:
import os
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
import re

In [14]:
def genbank_to_non_coding_intervals(file_path):
    
    raw_p = raw_positions_strings_from_genbank(file_path)
    
    raw_plus, raw_neg = split_strands(raw_p)
    
    
    valid_plus = make_into_valid_pos(raw_plus)
    valid_neg = make_into_valid_pos(raw_neg)

    
    non_coding_plus = get_non_coding_intervals(valid_plus)
    non_coding_neg = get_non_coding_intervals(valid_neg)
    
    return non_coding_plus, non_coding_neg










In [15]:
b = genbank_to_non_coding_intervals("test_data/eco_genbank.gb")
b

([(0, 188),
  (256, 335),
  (5021, 5232),
  (5531, 8236),
  (9192, 9304),
  (9894, 10828),
  (11316, 12161),
  (14080, 14166),
  (15299, 15443),
  (16558, 17487),
  (18656, 18713),
  (19621, 21179),
  (21400, 21405),
  (22349, 22389),
  (25702, 25824),
  (27228, 27291),
  (28208, 28372),
  (29196, 29649),
  (30800, 30815),
  (34039, 34298),
  (34696, 42401),
  (43174, 43186),
  (44130, 44178),
  (45751, 45805),
  (47139, 47244),
  (49632, 49821),
  (50303, 57362),
  (58180, 58472),
  (59280, 70385),
  (71266, 71349),
  (72116, 77386),
  (77520, 77619),
  (78800, 84366),
  (85313, 85628),
  (87849, 88026),
  (89033, 89632),
  (91398, 91411),
  (100712, 100763),
  (105245, 105303),
  (106457, 106555),
  (107475, 107703),
  (108218, 108277),
  (110985, 111042),
  (111434, 113242),
  (113289, 113442),
  (114488, 118731),
  (120136, 122090),
  (122857, 123015),
  (125681, 125693),
  (127588, 127910),
  (129337, 131613),
  (134213, 134386),
  (134751, 137081),
  (138634, 141429),
  (141968, 

In [2]:
# get all sequence records for the specified genbank file

def raw_positions_strings_from_genbank(file_path):

    recs = [rec for rec in SeqIO.parse(file_path, "genbank")]


    raw_positions_str = []

    for rec in recs:
        feats = [feat for feat in rec.features if feat.type == "CDS"]
        for feat in feats:
            x = str((feat.location))
            raw_positions_str.append(x)
    
    
    return raw_positions_str


In [3]:
def split_strands(raw_pos_list):
    plus_pos = []
    neg_pos  = []
    
    for row in raw_pos_list:
        
        
        if row.find('+') == -1:
            neg_pos.append(row)
            
        if row.find('-') == -1:
            plus_pos.append(row)
    
    return plus_pos, neg_pos

#plus, neg = split_strands(raw_positions_str)

In [4]:
# print the CDS sequence feature summary information for each feature in each
# sequence record



In [5]:
#raw_positions_str



In [6]:
'''


# a FUNCTION THAT INPUTS A RAW POSINTION SGRING LIST 
AND OUTPUTS TWO LISTS, ONE FOR EACH STRAND. tHE CONTENTS OF BOTH ARE 
ALLSO RW STRINGS'
'''




"\n\n\n# a FUNCTION THAT INPUTS A RAW POSINTION SGRING LIST \nAND OUTPUTS TWO LISTS, ONE FOR EACH STRAND. tHE CONTENTS OF BOTH ARE \nALLSO RW STRINGS'\n"

In [7]:
#neg

In [8]:

def make_into_valid_pos(strand_pos_raw):
    
    interval_list = []
    
    for row in strand_pos_raw:
        m = re.findall(r'\d+', row)
        
        #print(len(m))
            
        start_p = int(m[0])
        stop_p = int(m[len(m)-1])
        interval_list.append((start_p, stop_p))
            
    return interval_list

In [9]:
#z = make_into_valid_pos(plus)

In [11]:


def get_non_coding_intervals(coding_intervals):
    non_coding_intervals = []
    
    
    
    if coding_intervals[0][0] != 0:
        
        ith_non_coding_start = 0
        ith_non_coding_stop = coding_intervals[0][0]-1
        
        non_coding_intervals.append((ith_non_coding_start, ith_non_coding_stop))
        
    
    for i in range(1, len(coding_intervals)):
        ith_non_coding_start = coding_intervals[i-1][1]+1
        ith_non_coding_stop = coding_intervals[i][0]-1
        
        non_coding_length = ith_non_coding_stop-ith_non_coding_start
        if non_coding_length>100:
            non_coding_intervals.append((ith_non_coding_start, ith_non_coding_stop))
        
    return non_coding_intervals

#z

In [None]:
#ecoli_nc_intervals = get_non_coding_intervals(z)

In [None]:
#z

In [12]:

def extract_seq_from_non_coding_intervals(non_coding_intervals, seq):
    
    non_coding_seqs = []
    
    for interval in non_coding_intervals:
        
        non_coding_seqs.append(seq[interval[0]:interval[1]])
        
    return non_coding_seqs
        



In [13]:
import oritelib as orite

In [None]:
#ecoli_seq = orite.seq_from_fasta('test_data/eco_k12.fasta')

In [None]:
#extract_seq_from_non_coding_intervals(ecoli_nc_intervals, ecoli_seq)

In [16]:
a = raw_positions_strings_from_genbank("test_data/eco_genbank.gb")
plus, neg = split_strands(a)
plus_valid = make_into_valid_pos(plus)
neg_valid = make_into_valid_pos(neg)
plus_non_cod = get_non_coding_intervals(plus_valid)

In [21]:
plus_non_cod = get_non_coding_intervals(plus_valid)

In [22]:
plus_valid

[(0, 255),
 (336, 2799),
 (2800, 3733),
 (3733, 5020),
 (5233, 5530),
 (8237, 9191),
 (9305, 9893),
 (10829, 11315),
 (12162, 14079),
 (14167, 15298),
 (15444, 16557),
 (17488, 18655),
 (18714, 19620),
 (21180, 21399),
 (21406, 22348),
 (22390, 25207),
 (25206, 25701),
 (25825, 26275),
 (26276, 27227),
 (27292, 28207),
 (28373, 29195),
 (29650, 30799),
 (30816, 34038),
 (34299, 34695),
 (42402, 43173),
 (43187, 44129),
 (44179, 45466),
 (45462, 45750),
 (45806, 47138),
 (47245, 47776),
 (47768, 49631),
 (49822, 50302),
 (57363, 58179),
 (58473, 59124),
 (59120, 59279),
 (70386, 71265),
 (71350, 72115),
 (77387, 77519),
 (77620, 78799),
 (84367, 85312),
 (85629, 87354),
 (87356, 87848),
 (88027, 89032),
 (89633, 90092),
 (90093, 91035),
 (91031, 91397),
 (91412, 93179),
 (93165, 94653),
 (94649, 96008),
 (96001, 97084),
 (97086, 98403),
 (98402, 99647),
 (99643, 100711),
 (100764, 102240),
 (102232, 103153),
 (103154, 103985),
 (103981, 105244),
 (105304, 106456),
 (106556, 107474),
 (1

In [23]:
plus_non_cod

[(256, 335),
 (5021, 5232),
 (5531, 8236),
 (9192, 9304),
 (9894, 10828),
 (11316, 12161),
 (14080, 14166),
 (15299, 15443),
 (16558, 17487),
 (18656, 18713),
 (19621, 21179),
 (21400, 21405),
 (22349, 22389),
 (25702, 25824),
 (27228, 27291),
 (28208, 28372),
 (29196, 29649),
 (30800, 30815),
 (34039, 34298),
 (34696, 42401),
 (43174, 43186),
 (44130, 44178),
 (45751, 45805),
 (47139, 47244),
 (49632, 49821),
 (50303, 57362),
 (58180, 58472),
 (59280, 70385),
 (71266, 71349),
 (72116, 77386),
 (77520, 77619),
 (78800, 84366),
 (85313, 85628),
 (87849, 88026),
 (89033, 89632),
 (91398, 91411),
 (100712, 100763),
 (105245, 105303),
 (106457, 106555),
 (107475, 107703),
 (108218, 108277),
 (110985, 111042),
 (111434, 113242),
 (113289, 113442),
 (114488, 118731),
 (120136, 122090),
 (122857, 123015),
 (125681, 125693),
 (127588, 127910),
 (129337, 131613),
 (134213, 134386),
 (134751, 137081),
 (138634, 141429),
 (141968, 142777),
 (144473, 144575),
 (145018, 145079),
 (146311, 146966),


In [24]:
plus_non_cod_seq = extract_seq_from_non_coding_intervals(plus_non_cod, orite.seq_from_fasta("test_data/eco_k12.fasta"))

In [26]:
print(len(plus_non_cod))

1711


In [25]:
print(plus_non_cod_seq)

['TACAGGAAACACAGAAAAAACCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATG', 'AAAAATGACAGGGAAAAGGAGAAATTCTCAATAAATGCGGTAACTTAGAGATTAGGATTGCGGAGAATAACAACCGCCGTTCTCACGAGTAATCTCCGGATATCGACCCATAACGGGCAATGATAAAAGGAGTAACCTGTGAAAAAGATGCAATTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCAGCACAGGCTGCGGA', 'AATATATTGAATCTGCATGCTTTTGTAGGCAGGATAAGGCGTTCACGCCGCATCCGGCTTGACTGCAAACTTAACGCTGCTCGTAGCGTTTAAACACCAGTTCGCCATTGCTGGAGGAATCTTCATCAAGAAGTAACCTTCGCTATTAAAACCAGTCAGTTGCTCTGGTTTGGTCAGCCGATTTTCAATAATGAAAGACTCATCAGACCGCGTGCTTTCTTAGCGTAGAAGCTGATGATCTTAAATTTGCCGTTCTTCTCATCGAGAACACCGGCTTGATAATCTCGGCATTCAATTTCTTCGGCTTCACCGATTTAAAATACTCATCTGACGCAGATTAATCACCACATTATCGCCTTGTGCTGCGAGCGCCTCGTTCAGCTTGTTGGTGATGATATCTCCCAGAATTGATACAGATCTTTCCCTCGGGCATTCTCAAGACGGATCCCCATTTCCAGACGATAAGGCTGCATAAATCGAGCGGGCGGAGTACGCCATACAAGCCGGAAAGCATTCGCAAATGCTGTTGGGCAAAATCGAATCGTCTTCGCTGAAGGTTTCGGCCTGCAAGCCGGTGTAGACATCACCTTTAAACGCCAGAATCGCCTGGGGGCATTCGCCGGCGTGAAATCTGGCTGCCAGTCATGAAAGCGAGCGGCGTTGATACCCGCCAGTTTGTGCTGATGCGCATCAGCGTGCT