In [18]:
import os
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
import re
import oritelib as orite
import matplotlib.pyplot as plt

In [41]:
'''
Input: file path to a GenBank file.
Output: a tuple (intervals, positions). The first element is a list of (start, stop) tuples
for the regions that are non-coding on both strands; the second is a sorted list of the same
regions with every individual base position as an element.

The function utilizes various other functions contained in oritelib.
'''

def genbank_to_non_coding_intervals(file_path):
    """Find the regions of a GenBank file that are non-coding on both strands.

    Every step is delegated to a helper defined outside this function:
    raw position strings -> split per strand -> valid positions ->
    per-strand non-coding intervals -> positions shared by both strands ->
    intervals built from those shared positions.

    Args:
        file_path: path to the GenBank file; forwarded unchanged to
            raw_positions_strings_from_genbank.

    Returns:
        A tuple (intervals, positions): the result of
        position_list_to_intervals applied to the shared positions, and the
        shared positions themselves as returned by get_true_nc_positions.
    """
    plus_raw, minus_raw = split_strands(raw_positions_strings_from_genbank(file_path))

    # Both strands are validated before any interval is derived, plus strand first.
    plus_valid, minus_valid = [make_into_valid_pos(raw) for raw in (plus_raw, minus_raw)]
    plus_nc, minus_nc = [get_non_coding_intervals(valid) for valid in (plus_valid, minus_valid)]

    shared_positions = get_true_nc_positions(plus_nc, minus_nc)
    shared_intervals = position_list_to_intervals(shared_positions)
    return shared_intervals, shared_positions


575115


In [30]:
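'''
get_true_nc_positions (defined in a later cell):
Input: two lists of non-coding intervals.
Output: sorted list of all true non-coding positions, i.e. the positions present in both lists.

HELPER FUNCTION 1 (interval_list_to_position_set):
Input: a list of intervals. Output: a set of all positions in the intervals (start and stop inclusive).
'''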
def interval_list_to_position_set(interval_list):
    """Expand inclusive intervals into the set of positions they cover.

    Args:
        interval_list: iterable of indexable intervals; element 0 is the
            start and element 1 is the stop, and both ends are included.

    Returns:
        A set holding every integer position of every interval. An interval
        whose stop is below its start contributes nothing.
    """
    covered_ranges = (
        range(interval[0], interval[1] + 1) for interval in interval_list
    )
    # union() with no arguments yields an empty set, matching the empty-input case.
    return set().union(*covered_ranges)

        

In [31]:
def get_true_nc_positions(nc_plus_intervals, nc_neg_intervals):
    """Return the sorted positions covered by both interval lists.

    Each list is expanded to a position set with
    interval_list_to_position_set; only positions present in both sets are
    kept.

    Args:
        nc_plus_intervals: list of intervals, expanded first.
        nc_neg_intervals: list of intervals, expanded second.

    Returns:
        A list of the shared positions in ascending order.
    """
    plus_positions = interval_list_to_position_set(nc_plus_intervals)
    neg_positions = interval_list_to_position_set(nc_neg_intervals)
    return sorted(plus_positions.intersection(neg_positions))
    
    

In [34]:
def position_list_to_intervals(pos_list):
    """Collapse a sorted list of positions into inclusive (start, stop) intervals.

    Consecutive positions (each exactly one greater than the previous) are
    merged into a single interval. A position with no adjacent neighbour is
    returned as a one-position interval (p, p), so expanding the result with
    inclusive ends gives back exactly the input positions.

    Args:
        pos_list: iterable of integer positions. Assumed to be in ascending
            order without duplicates; any position that is not exactly
            previous + 1 starts a new interval.

    Returns:
        A list of (start, stop) tuples with both ends inclusive, in input
        order. An empty input yields an empty list.
    """
    intervals = []

    remaining = iter(pos_list)
    try:
        run_start = next(remaining)
    except StopIteration:
        # No positions at all: nothing to group.
        return intervals

    previous = run_start
    for position in remaining:
        if position != previous + 1:
            # Gap found: close the run that ended at the previous position.
            intervals.append((run_start, previous))
            run_start = position
        previous = position

    # The last run is never closed inside the loop, so emit it here.
    intervals.append((run_start, previous))
    return intervals
            
        
        
        
    
    
    
    
    

In [39]:
def interval_list_to_range_list(interval_list):
    """Expand each inclusive interval into its own list of positions.

    Args:
        interval_list: iterable of indexable intervals; element 0 is the
            start and element 1 is the stop, and both ends are included.

    Returns:
        A list with one inner list per interval, in input order, holding
        every integer from start to stop. An interval whose stop is below
        its start produces an empty inner list.
    """
    # No per-interval printing: on genome-sized input it floods the cell output.
    return [
        list(range(interval[0], interval[1] + 1)) for interval in interval_list
    ]



In [45]:
# Read every record of the GenBank file into a list up front.
recs = list(SeqIO.parse("test_data/eco_genbank.gb", "genbank"))

In [46]:
# Print the sequence length of each parsed record, one per line.
for rec in recs:
    print(len(rec.seq))

4641652
