In [1]:
import re
import math

#### DAY 1

# Exercise 1:
We will write a function that:
 - Receives the path of a fasta file with the sequences we want to align
 - Returns a dictionary where keys are the name of the sequence and the value is the sequence


In [2]:
def readFastaFile(file):
    sequences = {}
    
    with open(file) as f:
        for line in f:
            # If this line is name line
            if re.match(r'^>', line):
                matchObject = re.match(r'^>(\S*)\s*(.*)', line)
                
                if (matchObject):
                    name = matchObject.group(1)
                    sequences[name] = ''
                inseq = 0
            # Otherwise
            else:
                if inseq == 0:
                    inseq = 1
                    sequences[name] = line.strip()
                else:
                    sequences[name] += line.strip()
    return sequences

In [14]:
def readFastaFile(file):
    sequences = {}
    with open(file) as f:
        for line in f:
            # If this line is name line
            if re.match(r'^>', line):
                matchObject = re.match(r'^>(\S*)\s*(.*)', line)

                if (matchObject):
                    name = matchObject.group(1)
                    feat = matchObject.group(1)
                    sequences[name] = \
                        Sequence(name=name, sequence='', features=feat) 
            # Otherwise
            else:
                sequences[name].appendSequence(line.strip())
    return sequences

In [3]:
sequences = readFastaFile('cyc1_sequences.txt')

#### DAY 2

# Exercise 2:

We will write a function that:
 - Receives the path of a fasta file with the alignment of unbiased sequences
 - Returns a 2D dictionary where keys are the name of the residues and the value is the score


In [4]:
alp="ACDEFGHIKLMNPQRSTVWY" # this is the AA alphabet

def generateMatrix(alignment):
    totResidues = 0
    totSubtitut = 0
    
    # Initialize to zero the total residue counting
    freqr = {}
    for res in alp:
        freqr[res] = 0
    
    # Initialize to zero the subtition matrix
    matrix = {}
    for res1 in alp:
        matrix[res1] = {}
        for res2 in alp:
            matrix[res1][res2] = 0
            
    
    # Count how many time each residue is found
    # Mind the Gap!
    for s in alignment:
        for aa in alignment[s]:
            if aa != '-': 
                freqr[aa] += 1
                totResidues += 1
    
    # Get the frequency of each residue
    for aa in freqr:
        freqr[aa] /= totResidues
    
    # Since dictionaries are not good to numerate in a fix manner
    # Lets transform the values into a tuple
    
    sequences = tuple(alignment[i] for i in alignment)
    nseq = len(sequences)
    
    for i in range(0, nseq):
        for j in range(i+1, nseq):
            seqI = sequences[i]
            seqJ = sequences[j]
            if len(seqI) != len(seqJ): return "ERROR!!"
            
            for aa in range(0, len(seqI)):
                aaI = seqI[aa]
                aaJ = seqJ[aa]
                if aaI != '-' and aaJ != '-':
                    matrix[aaJ][aaI] += 1
                    matrix[aaI][aaJ] += 1
                    totSubtitut += 2
    
    # mathematical transformation 
    for aa1 in alp:
        for aa2 in alp:
            num = matrix[aa1][aa2] / totSubtitut
            den = freqr[aa1]*freqr[aa2]
            matrix[aa1][aa2] = math.log10(num/den) * 10
        
    return matrix

In [5]:
msa = readFastaFile('../msa1.fasta')
matrix = generateMatrix(msa)

# Exercise 3:
We will write a function that:
 - Receives the path of a matrix (.mat) file with a substitution matrix
 - Returns a 2D dictionary where keys are the name of the residues and the value is the score


In [6]:
def readMatrix(filename):
    # Read the file
    handle = open(filename, 'r')
    content= handle.readlines()
    handle.close()
    
    # Set up the matrix file
    matrix  = {}
    letters = []
    numline = len(content) 
    
    for nl in range(0, numline):
        line = content[nl]
        splt = line.split()
        a = splt[0]
        if a not in matrix:
            matrix[a] = {}
            letters.append(a)
            
    # Go throug the file and save the values
    for nl in range(0, numline):
        line = content[nl]
        splt = line.split()
        l = len(splt)
        aa1 = splt[0]
        for a in range(0, len(letters)):
            aa2 = letters[a]
            matrix[aa1][aa2] = splt[a]
            matrix[aa2][aa1] = splt[a]
    
    return matrix

# Exercise 4:
We will write a function that:
 - Receives two sequences
 - Returns both sequences aligned

In [7]:
seqI = 'GELFANDTHEFASTCAT'
seqJ = 'GARFIELDTHEFATCAT'

def pairAlignment(seqI, seqJ, matrix):
    
    # Check that there are no gaps in this sequences
    # I am paranoid
    seqI.replace('-', '')
    seqJ.replace('-', '')

    # Get the length of the sequences
    lenI=len(seqI)
    lenJ=len(seqJ)

    # Penalty for putting a gap
    gep=-4

    
    # Initiallize to zero the matrices
    smat = [[0 for x in range(lenJ+1)] for y in range(lenI+1)]
    tb   = [[0 for x in range(lenJ+1)] for y in range(lenI+1)]

    # Base cases
    for i in range (0, lenI+1):
        smat[i][0]=0
        tb[i][0]=-2

    for j in range (0, lenJ+1):
        smat[0][j]=0
        tb[0][j]=-2

    # Fill the table
    bscore=0
    for i in range (1, lenI+1):
        for j in range (1, lenJ+1):
            if seqI[i-1]!='-' and seqJ[j-1]!='-':
                s=int(matrix[seqI[i-1]][seqJ[j-1]])
            else:
                s=0

            Sub=smat[i-1][j-1]+ s
            Del=smat[i][j-1]+ gep
            Ins=smat[i-1][j]+ gep

            if Sub>Del and Sub>Ins:
                smat[i][j]= Sub
                tb  [i][j]= 0
            elif Del>Ins:
                smat[i][j]= Del
                tb[i][j]= -1
            else:
                smat[i][j]= Ins
                tb[i][j]= 1

            if smat[i][j]>bscore:
                bscore=smat[i][j]

    # Traceback
    alnI=''
    alnJ=''
    while (tb[i][j]!=-2):
        if (tb[i][j]==0):
            i-=1
            j-=1
            alnI += seqI[i]
            alnJ += seqJ[j]
        elif (tb[i][j]==-1):
            j-=1
            alnI += '-'
            alnJ += seqJ[j]
        elif (tb[i][j]==1):
            i-=1
            alnI += seqI[i]
            alnJ += '-'


    # This is used to reverse the sequences
    alnI=alnI[::-1]
    alnJ=alnJ[::-1]

    return alnI, alnJ, bscore

In [8]:
seq1, seq2, score = pairAlignment(seqI, seqJ, matrix)
print(seq1, seq2, "The score is "+str(score), sep='\n')

GELFAN-DTHEFASTCAT
GARFIELDTHEFA-TCAT
The score is 88


In [9]:
class Seq:
    def __init__(self):
        self.id=''
        self.seq=''
        self.features="z"

class Node:
    def __init__(self):
        self.name=""
        self.left=0
        self.right=0
        self.parent=0

    def printNode(self):
        print('Name: '   + str(self.name))
        print('Parent: ' + str(self.parent))
        print('Right: '  + str(self.right))
        print('Left: '   + str(self.left))
        print('.......')
        


In [11]:
def declare_new_tree_node(nodes, nn):
    nodes[nn] = Node()
    return (nn, nn+1)


def scan_name_and_dist (i, l):
    name=""
    number=""
    if l[i]==';': return ("",-1,i)
     
    while l[i]!=':' and i<len(l) and l[i]!=')' and l[i]!=';' and l[i]!=',':
        name+=l[i]
        i+=1
     
    if l[i]!=':':
        distance=float(0)
        return (name,distance, i)
    else:
        i+=1
     
    while  str.isdigit(l[i]) or l[i]=='e' or l[i]=='-' or l[i]=='.': 
        number+=l[i]
        i+=1

    number=float(number)
    return (name, number,i)

def newick2nodes (line):
    nodes={}
    nodes[0]=-1
    nn=1 # root starts at 1
    (N,nn)=declare_new_tree_node(nodes, nn)
    T=R=N
   
    c=pi=i=0
    while (line[i])!=';':
        c=line[i]
        i+=1
        if c=='(':
            (N, nn)=declare_new_tree_node(nodes,nn)
            nodes[N].parent=T

            if nodes[T].right==0:
                nodes[T].right=N
            elif nodes[T].left==0:
                nodes[T].left=N
            else:
                nodes[N].right=nodes[T].right
                nodes[nodes[T].right].parent=N

                nodes[N].left=nodes[T].left
                nodes[nodes[T].left].parent=N

                nodes[T].right=N

                (N,nn)=declare_new_tree_node(nodes,nn)

                nodes[T].left=N
                nodes[N].parent=T

            T=N
            lastc=0
        
        elif c==')':
            T=nodes[T].parent
            (nodes[T].name,nodes[T].distance,i)=scan_name_and_dist (i,line)
            if nodes[T].name and nodes[T].name[0]:
                nodes[T].bootstrap=float(nodes[T].name)
                nodes[T].name=""
            lastc=0;
        
        elif c==',':
            T=nodes[T].parent;
            lastc+=1
        else:
            (N,nn)=declare_new_tree_node(nodes,nn)
            nodes[N].parent=T

            if nodes[T].right==0:
                nodes[T].right=N
            elif nodes[T].left==0:
                nodes[T].left=N    
            else:
                nodes[N].right=nodes[T].right
                nodes[nodes[T].right].parent=N

                nodes[N].left=nodes[T].left
                nodes[nodes[T].left].parent=N

                nodes[T].right=N


                (N,nn)=declare_new_tree_node(nodes,nn)
                nodes[T].left=N
                nodes[N].parent=T

            T=N
            i=i-1
        
            (nodes[T].name,nodes[T].distance,i)=scan_name_and_dist (i,line);
            lastc=0
        
    T=nodes[T].parent
   
    if nodes[T].right==0 and nodes[T].left!=0:
        T=nodes[T].left

    elif nodes[T].right!=0 and nodes[T].left==0:
        T=nodes[T].right

    nodes[T].parent=-1
    return (nodes,nn)
#The main code starts here

# Exercise 5:
We will write a function that:
 - Receives dicionary of a tree 
 - Returns when we have to align

In [12]:
def node2splits(N, nodes):#, seq, matrix, gep):
     
    ## Base Case
    ## We are a leaf
    lst=[]
    if nodes[N].name != '':
        lst.append(nodes[N].name)
        
    else:
        left_list=[]
        right_list=[]
        if nodes[N].left:
            left_list=node2splits(nodes[N].left, nodes)#,seq,matrix,gep)
        if nodes[N].right:
            right_list=node2splits(nodes[N].right, nodes)#,seq,matrix,gep)
                        
        lst=left_list+right_list
        
    return lst

In [13]:
tree=""
with open ('tree.new') as f:
    for line in f:
        tree+=line

tree=re.sub(r'[ \n\t\r]',"",tree)

nodes = newick2nodes(tree)[0]
print(nodes)


for i in nodes:
    if i==0:
        continue
    print('id', i)
    print(nodes[i].printNode())
    print()

{0: -1, 1: <__main__.Node object at 0x7fa5d0240278>, 2: <__main__.Node object at 0x7fa5d02405c0>, 3: <__main__.Node object at 0x7fa5d02401d0>, 4: <__main__.Node object at 0x7fa5d0240208>, 5: <__main__.Node object at 0x7fa5d0240898>, 6: <__main__.Node object at 0x7fa5d0240400>, 7: <__main__.Node object at 0x7fa5d02409e8>, 8: <__main__.Node object at 0x7fa5d0240908>}
id 1
Name: 
Parent: 0
Right: 2
Left: 0
.......
None

id 2
Name: 
Parent: -1
Right: 7
Left: 8
.......
None

id 3
Name: 
Parent: 7
Right: 4
Left: 5
.......
None

id 4
Name: hmgb_chite
Parent: 3
Right: 0
Left: 0
.......
None

id 5
Name: hmgl_wheat
Parent: 3
Right: 0
Left: 0
.......
None

id 6
Name: hmgl_trybr
Parent: 7
Right: 0
Left: 0
.......
None

id 7
Name: 
Parent: 2
Right: 3
Left: 6
.......
None

id 8
Name: hmgt_mouse
Parent: 2
Right: 0
Left: 0
.......
None



# Exercise 6: The group align function

In [15]:
 def align (seq,Igroup, Jgroup, matrix, gep):
    
    lenI=len(Igroup[0])
    lenJ=len(Jgroup[0])
    nI = len(Igroup)
    nJ = len(Jgorup)
    
    smat = [[0 for x in range(lenJ+1)] for y in range(lenI+1)]
    tb   = [[0 for x in range(lenJ+1)] for y in range(lenI+1)]

    for i in range (0, lenI+1):
        smat[i][0]=i*gep
        tb[i][0]=j

    for j in range (0, lenJ+1):
        smat[0][j]=j*gep
        tb[0][j]=-1


    for i in range (1, lenI+1):
        for j in range (1, lenJ+1):
            s = 0 
            nsub = 0
            for ni in range (0,nI):
                for nj in range (0, nJ):
                    a1=seq[Igroup[ni]][i-1]
                    a2=seq[Igroup[nj]][j-1]
                    if a1!='-' and a2!='-':
                        s += int(matrix[a1.upper()][a2.upper()])
                        nsub+=1
            if (nsub>0):
                s/=nsub

            Sub=smat[i-1][j-1]+s
            Del=smat[i][j-1]+gep
            Ins=smat[i-1][j]+gep

            if Sub>Del and Sub >Ins:
                smat[i][j]=Sub
                tb  [i][j]=0  
            elif Del>Ins:
                smat[i][j]=Del
                tb[i][j]=-1
            else:
                smat[i][j]=Ins
                tb[i][j]=1


    #print "Optimal Score: %d\n"%(int(smat[lenI][lenJ]))
    i=lenI
    j=lenJ
    lenA=0
    alnI=[]
    alnJ=[]
    new_seq={}
    for ni in range (0,len (Igroup)):
        new_seq[Igroup[ni]]=""
    for nj in range (0,len (Jgroup)):
        new_seq[Jgroup[nj]]=""
        
    while ((i==0 and j==0)!=1):
        if (tb[i][j]==0):
            i-=1
            j-=1
            for ni in range (0,len (Igroup)):
                new_seq[Igroup[ni]]+=seq[Igroup[ni]][i]
            for nj in range (0,len (Jgroup)):
                new_seq[Jgroup[nj]]+=seq[Jgroup[nj]][j]    

        elif (tb[i][j]==-1):
            j-=1
            for ni in range (0,len (Igroup)):
                new_seq[Igroup[ni]]+='-'
            for nj in range (0,len (Jgroup)):
                new_seq[Jgroup[nj]]+=seq[Jgroup[nj]][j]
                
        elif (tb[i][j]==1):
            i-=1
            for ni in range (0,len (Igroup)):
                new_seq[Igroup[ni]]+=seq[Igroup[ni]][i]
            for nj in range (0,len (Jgroup)):
                new_seq[Jgroup[nj]]+="-"
                
        lenA+=1
            
    for ni in range (0, nI):
        seq[Igroup[ni]]=new_seq[Igroup[ni]][::-1]
    for nj in range (0, nJ):
        seq[Jgroup[nj]]=new_seq[Jgroup[nj]][::-1]
    return seq    

In [74]:
def declare_new_tree_node(nodes, nn):
    print("Created node", nn)
    nodes[nn] = Node()
    return (nn, nn+1)

def newick2nodes(line):
        nodes   = {}
        nodes[0]= -1
        
        nn = 1
        N, nn = declare_new_tree_node(nodes,nn)
        T, R = 1, 1

        i = 0
        while (line[i])!=';':
            c=line[i]
            i+=1
            if c=='(':
                N, nn = declare_new_tree_node(nodes,nn)
                nodes[N].parent = T
                #print('N:', N, '\t', 'T:', T)
                # N is this new node
                # T is its parent
                
                # Modify parent's child
                if nodes[T].right==0: 
                    nodes[T].right=N
                elif nodes[T].left==0:
                    nodes[T].left=N
                else:
                    nodes[N].right = nodes[T].right
                    nodes[nodes[T].right].parent=N

                    nodes[N].left=nodes[T].left
                    nodes[nodes[T].left].parent=N

                    nodes[T].right=N

                    N, nn=declare_new_tree_node(nodes,nn)

                    nodes[T]._left=N
                    nodes[N]._parent=T

                T=N

            elif c==')':
                T=nodes[T].parent
                nodes[T].name, nodes[T].distance, i =scan_name_and_dist (i,line)
                if nodes[T].name and nodes[T].name[0]:
                    nodes[T].name=""

            
            # 
            elif c==',':
                T = nodes[T].parent;

            else: ## Closing!
                (N,nn)=declare_new_tree_node(nodes,nn)
                nodes[N].parent=T

                if nodes[T].right==0:
                    nodes[T].right=N
                elif nodes[T].left==0:
                    nodes[T].left=N    
                else:
                    nodes[N].right=nodes[T].right
                    nodes[nodes[T].right].parent=N

                    nodes[N].left=nodes[T].left
                    nodes[nodes[T].left].parent=N

                    nodes[T].right=N


                    N,nn=declare_new_tree_node(nodes,nn)
                    nodes[T].left=N
                    nodes[N].parent=T

                T=N
                i=i-1

                nodes[T].name, nodes[T].distance,i = scan_name_and_dist(i,line);


        T=nodes[T].parent

        if nodes[T].right==0 and nodes[T].left!=0:
            T=nodes[T].left

        elif nodes[T].right!=0 and nodes[T].left==0:
            T=nodes[T].right

        nodes[T].parent=-1
        return (nodes,nn)


In [73]:
newick2nodes(tree)

Created node 1
Created node 2
Created node 3
Created node 4
Created node 5
Created node 6
Created node 7
Created node 8


({0: -1,
  1: <__main__.Node at 0x7f402c320c50>,
  2: <__main__.Node at 0x7f402c3072b0>,
  3: <__main__.Node at 0x7f402c313ef0>,
  4: <__main__.Node at 0x7f402c330080>,
  5: <__main__.Node at 0x7f402c30f710>,
  6: <__main__.Node at 0x7f402c3061d0>,
  7: <__main__.Node at 0x7f402c31d390>,
  8: <__main__.Node at 0x7f402c326a90>},
 9)

In [54]:
last

({0: -1,
  1: <__main__.Node at 0x7f402c517630>,
  2: <__main__.Node at 0x7f402c517828>,
  3: <__main__.Node at 0x7f402c517b38>,
  4: <__main__.Node at 0x7f402c517d30>,
  5: <__main__.Node at 0x7f402c5174e0>,
  6: <__main__.Node at 0x7f402c517c50>,
  7: <__main__.Node at 0x7f402c517550>,
  8: <__main__.Node at 0x7f402c517208>},
 9)

In [76]:
def readNeweck(newick):
    nodes = {}
    stack = []
    i = 0
    while newick[i] != ';':
        
        element = newick[i]
        
        if element = '(':
            stack.append('(')
            
        if elemnt

ImportError: No module named 'stack'