**Working with sequences**
Import the FASTA file

In [None]:
import Bio
from fileinput import close

#we read the FASTA file if the current line is a header we add that as a key to the dictionary if not we
#add the cuurent line as the value
try:
    f=open("dna2.fasta")
except IOError:
    print("File does not exist!!")


We have sequences and a corresponding ID to the FASTA file. Creating a dictionary where the id is the key and the corresponding sequence is the value

In [320]:

#initialize dictionary
seqs={}
for line in f:
#discard the newline at the end if any
    line=line.rstrip()
    if line[0]=='>': #or line.startswith('>')
        words=line.split()
        name=words[0][1:]
        seqs[name]=''
    else :
        seqs[name]= seqs[name]+line




No of entries in the dictionary

In [310]:
len(seqs)# no of dictionary items


18

Length of each sequence

In [339]:
#lengths of sequences
seq_length=[]
for name,seq in seqs.items():
    seq_length.append(len(seqs[name]))
   
min(seq_length)



115

Check whether there are multiple sequences that have the maximum or minimum lengths

In [290]:
#checking whether multiple sequences with max length
indices_max=[index for index, item in enumerate(seq_length) if item==max(seq_length)]
print(indices_max)

#checking whether multiple sequences with min length
indices_min=[index for index, item in enumerate(seq_length) if item==min(seq_length)]
print(indices_min)

[2]
[16]


Print sequence IDs of sequences corresponding to maximum or minimum lengths

In [69]:
min_len_seq=min(seqs.values(), key=len) # Id corresponding to the sequence with minimum length

gi|142022655|gb|EQ086233.1|521


In [70]:
max_len_seq=max(seqs.values(), key=len) # Id corresponding to the sequence with maximum length


gi|142022655|gb|EQ086233.1|323


Writing a function to identify open reading frames in a given sequence

In [None]:

#cheking whether a given sequence has a start or stop codon

def startstop_codon(dna, frame):
    dna = dna.lower() #make all the letter lower case
    for i in range(frame, len(dna), 3): #iterate through 3 letters at a time
        codon1 = dna[i:i+3]
        if codon1 == 'atg':
            position1 = i # marking the position of the start codon
            for j in range(position1, len(dna), 3):
                codon2 = dna[j:j+3]
                if codon2 in ['taa', 'tag', 'tga']:
                    position2 = j  # position of the stop codon
                    yield (position1,position2-position1+3, dna[position1:position2+3]) # return the position of ORF, its length and the values
                    break




Function to iterate through all the sequences in the FASTA file and store the lengths and starting position of each of the open reading frame

In [341]:
#iterating through each sequence to find all the ORF in each sequence

def finding_ORF(frame,seqs):
    sequences=list(seqs.values()) #take the values of the sequences as a list to iterate through it
    ORF_lengths={}  #dictionary to store length of the ORF of each sequence and its ID
    positions={}  #dictionary to store position of the ORF of each sequence and its ID
    for read in range(len(sequences)): #loop for accessing each sequence read from the list
        seq=str(sequences[read])
        a=(list(startstop_codon(seq, frame))) #calling the codon reading function
        orf_lengths= [item[1] for item in a]  #get the length
        orf_position=[item[0] for item in a]  # get the position
        if  len(orf_lengths)== 0: #assigning value for reads with no codons
             orf_lengths = [0] #assigning value for reads with no codons
             orf_position = ['NULL']
        ORF_lengths[list(seqs.keys())[read]]=orf_lengths#creating a dictionary of ORF lengths(value) and sequence IDs(key)
        positions[list(seqs.keys())[read]]=orf_position
    return (ORF_lengths, positions)

ORF_lengths, positions= finding_ORF(0,seqs)


Maximum lengths of ORF corresponding to each sequence

In [342]:

#maximum length ORF for each sequence
position=[]
for name,seq in ORF_lengths.items():
    
    max_value=max(seq)
    print("Longest ORF in Frame 1 with sequence id %s is of length %d " % (name ,max_value))
    position.append(seq.index(max_value))

    

Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|91 is of length 1296 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|304 is of length 105 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|255 is of length 1443 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|45 is of length 2394 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|396 is of length 1059 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|250 is of length 1560 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|322 is of length 0 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|88 is of length 120 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|594 is of length 42 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|293 is of length 312 
Longest ORF in Frame 1 with sequence id gi|142022655|gb|EQ086233.1|75 is of length 180 
Longest ORF in Frame 1 

In [343]:
#Get the value and the id of the ORF with the maximum length
max_value = max(max(ORF_lengths.values()))
print("The length of the longest ORF is ",max_value)
max_id=max(ORF_lengths, key=ORF_lengths.get)
print("The corresponding sequence id is ", max_id)


The length of the longest ORF is  2394
The corresponding sequence id is  gi|142022655|gb|EQ086233.1|45


In [None]:
Positions of maximum lengths of ORF corresponding to each sequence

In [317]:
# position of maximum length ORF for each sequence
for ((name,seq),i) in zip(positions.items(),position):
        print("The position  corresponding to the longest ORF in sequence id %s is  %s" % (name , seq[i]))
    

The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|91 is  2855
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|304 is  620
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|255 is  1640
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|45 is  698
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|396 is  428
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|250 is  1373
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|322 is  89
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|88 is  NULL
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|594 is  65
The position  corresponding to the longest ORF in sequence id gi|142022655|gb|EQ086233.1|293

In [250]:
#For a given sequence identifier, what is the longest ORF contained in the sequence represented by that identifier? 


def find_id(id):
    max_length= max(ORF_lengths[id])
    return(max_length)

find_id('gi|142022655|gb|EQ086233.1|16')


1509