In [1]:
#-------------------------------------------------------------------------------
# utilise ONLY ONE stop codons, returns ONLY ONE ORF
def OpenReadingFrame(DNASequence,minLen,maxLen):
    '''
    Finds first start codon, starts translating to peptide.
    Stops and rejects sequence if upper length limit hit.
    Stops translation at first in-frame stop codon.
    Rejects peptide if below lower length limit.
    If acceptable length peptide found, returns as tuple: (CDS, peptide)
    Otherwise, truncates sequence to remove initial start codon, looks for the next one, then repeats translation attempt
    '''
    #setting up codon table
    translation = {
    "TTT": "F", "TCT": "S", "TAT": "Y", "TGT": "C",
    "TTC": "F", "TCC": "S", "TAC": "Y", "TGC": "C",
    "TTA": "L", "TCA": "S", "TAA": "*", "TGA": "*",
    "TTG": "L", "TCG": "S", "TAG": "*", "TGG": "W",

    "CTT": "L", "CCT": "P", "CAT": "H", "CGT": "R",
    "CTC": "L", "CCC": "P", "CAC": "H", "CGC": "R",
    "CTA": "L", "CCA": "P", "CAA": "Q", "CGA": "R",
    "CTG": "L", "CCG": "P", "CAG": "Q", "CGG": "R",

    "ATT": "I", "ACT": "T", "AAT": "N", "AGT": "S",
    "ATC": "I", "ACC": "T", "AAC": "N", "AGC": "S",
    "ATA": "I", "ACA": "T", "AAA": "K", "AGA": "R",
    "ATG": "M", "ACG": "T", "AAG": "K", "AGG": "R",

    "GTT": "V", "GCT": "A", "GAT": "D", "GGT": "G",
    "GTC": "V", "GCC": "A", "GAC": "D", "GGC": "G",
    "GTA": "V", "GCA": "A", "GAA": "E", "GGA": "G",
    "GTG": "V", "GCG": "A", "GAG": "E", "GGG": "G"}
    
    
    #setting start codon to look for
    StartCodon = 'ATG'
    
    #making copy of input (need to trim to start codon without affecting final output indexing)
    SubString = DNASequence
    print(SubString)
    #looping over every available start codon, from the start, until acceptable peptide found
    while StartCodon in SubString:
#        print 'found start codon'
        #find first start codon
        StartIndex = SubString.find(StartCodon)
        #trim sequence to start at start codon
        SubString = SubString[SubString.find(StartCodon):]
        #set up empty peptide sequence
        PeptideSequence = ''
        #making copy of input (need to trim during translation without affecting position of start codon search)
        translationString = SubString
        #scanning down every three letter set
        while len(translationString) > 3:
#            print 'current string: ' +str(translationString[0:3])+' '+str(translationString[3:])
#            print 'translated to: '+ str(translation[translationString[0:3]])
            #adding codon translation to peptide sequence
            if 'N' in translationString[0:3]:
                PeptideSequence += 'X'
            else:
                PeptideSequence += translation[translationString[0:3]]
            #trimming sequence by last translated codon
            translationString = translationString[3:]
#            print 'current peptide: '+str(PeptideSequence)
            #if upper length limit hit, stop translating
            print(PeptideSequence)
            if (len(PeptideSequence)>maxLen):
#                print '--length limit hit--'
                return SubString[0:(len(PeptideSequence)*3)],PeptideSequence
                break
            #if in frame stop codon found, check if length is ok. Either accept and stop or reject and continue
            if '+' in PeptideSequence:
                if len(PeptideSequence)>minLen:
                    return SubString[0:(len(PeptideSequence)*3)],PeptideSequence
                    break
                else:
                    break
            elif '*' in PeptideSequence:
                if len(PeptideSequence)>minLen:
                    return SubString[0:(len(PeptideSequence)*3)],PeptideSequence
                    break
                else:
                    break
        #remove first 3 bases(start codon) and continue search
        SubString = SubString[3:]

#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# read data file (.fastq extension)
#RawDataFile = open("/Users/NikitaLoik/Documents/R6.fastq", 'r')
#Lines = RawDataFile.readlines()
#RawDataFile.close
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# creat a .CSV file with sequence list, DNA sequences and peptide sequences
#DNASequenceFile = open('TestR6.csv', 'w')
#SelectionRound = 6
#SequenceCounter = 0
#for Line in Lines:
#    ORF = OpenReadingFrame(Line)
#    if ORF != None:
#        PeptideSequence = Translation(ORF)
#        SequenceCounter += 1
#        DNASequenceFile.write('selection round # ' + str(SelectionRound) + ',' +
#                            'ORF ' + str(SequenceCounter) + ',' +
#                            ORF + ',' +
#                            PeptideSequence + '\n')
#DNASequenceFile.close 
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# return a list of lists with peptide-sequences and their frequencies, sorted by frequency in descending order
def SortedPeptideSequencesList(fastqFileLocation):
    RawDataFile = open(fastqFileLocation, 'r')
    Lines = RawDataFile.readlines()
    RawDataFile.close
    PeptideSequences = {}
    PeptideSequencesList = []
    
    SelectionRoundNumber = fastqFileLocation[fastqFileLocation.find('.')-1]
    
    # populate the dictionary, so that Peptides are the keys and 

    #minLen = input("Minimum sequence length desired: ")
    #maxLen = input("Maximum sequence length desired: ")
    # minLen = 38
    # maxLen = 42

    # 5S5 (LazBF) primers (d.t.a.F95)
    beginning = 'TTGCCGGAAAACGGGGCG' # codons for LPENGA
    end = 'GGAGGATACCCATACGACGTGCCCGACTATGCAGTGTTAAATGAA' # codons for GGYPYDVPDYAVLNE
    minLen = 114
    maxLen = 116
    
    for Line in Lines:
        if Line.startswith('@'): # skip non-seq lines
            continue
        # filter by expected beginning/end sequences
        Line = OpenReadingFrame(Line, minLen, maxLen)
        print(Line)
        if (beginning in Line) and (end in Line):
            print(Line)
            break
            # Line = OpenReadingFrame(Line,minLen,maxLen)
            if Line != None:
                ORF = Line[0]
                #print str(ORF)
                Peptide = Line[1]
                #print str(Peptide)
                if Peptide not in PeptideSequences:
                    PeptideSequences[str(Peptide)] = 1
                else:
                    PeptideSequences[str(Peptide)] = PeptideSequences[str(Peptide)] + 1

    # convert the dictionary into the list of lists
    for key, value in PeptideSequences.items():
        PeptideSequencesList.append([str(SelectionRoundNumber), key, value])
    # sort the PeptideSequenceList by peptide sequence occurence in descendent order
    SortedPeptideSequences = sorted(PeptideSequencesList, key = lambda x: x[2], reverse = True)
    return SortedPeptideSequences
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# creat a .CSV file with peptide-sequences list and their frequency, sorted by frequency in descending order
def SelectionRoundSortedSequenceListGenerator(SortedPeptideSequecesFileName, fastqFileLocation):
    
    SelectionRoundNumber = fastqFileLocation[fastqFileLocation.find('.')-1]
    
    SortedSequenceFile = open(SortedPeptideSequecesFileName, 'w')
    SortedPeptideSequences = SortedPeptideSequencesList(fastqFileLocation)
    
    TotalPeptideNumber = 0
    for Data in SortedPeptideSequences:
        TotalPeptideNumber = TotalPeptideNumber + Data[2]
        
    SortedSequenceFile.write('selection round # ' + str(SelectionRoundNumber) + '\n' +
                            'total sequence # ' + str(TotalPeptideNumber) + '\n')
                                            
    UniqueSequenceNumber = 0
    for Data in SortedPeptideSequences:
        UniqueSequenceNumber += 1
        PeptideSequence = Data[1]    
        SequenceFraction = float(Data[2])/float(TotalPeptideNumber)
        
        SortedSequenceFile.write('ORF ' + str(UniqueSequenceNumber) + ',' +
                                str(SelectionRoundNumber) + ',' +
                                PeptideSequence + ',' +
                                str(Data[2]) + ',' +
                                '{:.3%}'.format(SequenceFraction) + '\n')
    SortedSequenceFile.close    
#-------------------------------------------------------------------------------


#_____________________________RUNNING THE FUNCTION_____________________________#
#___SortedPeptideSequecesFileName, fastqFileLocation___

out_loc = '/work/users/d/i/dieckhau/mRNA-display-data/tmp.csv'
in_loc = '/work/users/d/i/dieckhau/mRNA-display-data/DRR337076.fastq'

SelectionRoundSortedSequenceListGenerator(out_loc, in_loc)

# SelectionRoundSortedSequenceListGenerator('ES-Ace_R1_001.csv', 'ES-Ace_R1_001.fastq')


TTGCCGGAAAATGGGGCGGATAAGGAGTAGCTGAGTACTCCGCTGTCTTGGGGAGGATACCCATACGACGTGCCCGACTATGCAGTGTTAAATGAATAGGACGGGGGGCGGAAA

M
MG
MGR
MGRI
MGRIR
MGRIRS
MGRIRSS
MGRIRSS*
M
MQ
MQC
MQC*
M
MN
MNR
MNRT
MNRTG
MNRTGG
MNRTGGG
None


TypeError: argument of type 'NoneType' is not iterable

In [12]:
import sys
import os
with open ('B7H3b_ND_-A2R6.csv') as f:
    file = f.readlines()
    
Lst1 = []

for row in file:
    Lst1.append(row.split(','))

Lst = []  
counter = 0

for line in Lst1:
    counter += 1
    x = counter
    y = line[2]
    z = line[3]
    n = line[4]
    Lst. append(">"+str(n))
    Lst. append(y)
    
orig_stdout = sys.stdout
Output = open("B7H3b_ND_-A2R6.fasta", 'w')
sys.stdout = Output

for c in Lst:
    print (c)
    


FileNotFoundError: [Errno 2] No such file or directory: 'B7H3b_ND_-A2R6.csv'

In [7]:
# Henry's attempt

in_file = '/work/users/d/i/dieckhau/mRNA-display-data/DRR337076.fastq'
out_file = '/work/users/d/i/dieckhau/mRNA-display-data/tmp.csv'

import numpy as np

def read_fastq(file, params):
    with open(file, 'r') as fopen:
        lines = fopen.readlines()

    lines = np.array(lines) # L x 4
    lines = np.reshape(lines, (lines.shape[0] // 4, 4)) # [L, 4]
    # 0 is header
    # 1 is seq
    # 2 is duplicate header
    # 3 is quality score
    print('TOTAL RAW READS:', lines.shape[0])
    
    # filter by length
    lengths = np.array([len(l) for l in lines[:, 1]])
    
    lines = lines[lengths == PARAMS['RAW_LENGTH'], :]
    print('READS OF CORRECT LENGTH:', lines.shape[0])
    
    u, c = np.unique(lengths, return_counts=True)
    print(np.unique(lengths, return_counts=True))
    print(u[np.argmax(c)], 'best length')
    print(lines.shape)
    

    return

PARAMS = {
    'RAW_LENGTH': 115
}

read_fastq(in_file, PARAMS)




TOTAL RAW READS: 3517658
READS OF CORRECT LENGTH: 3351776
(array([ 36,  39,  42,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,  70,  71,  72,  75,  79,  81,  82,  83,  84,  85,  86,
        87,  88,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
       127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 139, 140, 141,
       142, 145, 146, 147, 148, 149, 150, 151, 152]), array([      2,       2,       8,       8,       4,       5,      17,
            16,      11,      40,      22,      55,     123,      16,
            14,      24,      36,      17,      23,      10,     200,
           134,      32,      39,      30,       7,       2,      13,
             1,       2,       2,       1,       1,       6,       2,
             5,       2,       4,