In [4]:
import numpy as np
import pandas as pd

In [2]:
# Compute pairwise sequence alignment
# Both sequences must possess identical lengths.
def pairwise_sequence_alignment(seq_1,seq_2):
    count_missmatch = 0
    for i in range(len(seq_1)):
        if seq_1[i] != seq_2[i]:
            count_missmatch += 1
    return count_missmatch

In [3]:
# Count the mismatches and records them in the mismatche matrix
# min of mismatche_matrix is added in this function since we will use it later 
# outuput: dataframe 
def  mismatche_matrix(seq_list, seq_names):
    n = len(seq_list)
    mismatch_m = np.zeros((n,n))

    for i in range(n):
        sub_seq_list = seq_list[i+1:]
        for j in range(len(sub_seq_list)):
            matrix_j = j+i+1
            pairwise_seq = pairwise_sequence_alignment(seq_list[i],sub_seq_list[j])
            mismatch_m[i][matrix_j] = mismatch_m[matrix_j][i] = pairwise_seq

    mismatch_matrix_df = pd.DataFrame(data=mismatch_m, columns=seq_names ,index=seq_names)
    
    return mismatch_matrix_df

In [5]:
# This function return Minimum value greater than 0 in a dataframe
def min_val_index_df(df): 
    # Finding the minimum value greater than 0
    min_val = df[df > 0].min().min()

    # Getting the indices of the minimum value
    indices = df[df == min_val].stack().index.tolist()
    
    return min_val, list(indices[-1])

In [6]:
# This function calculate the distance between the new cluster and the others
def cal_cluster_distance(df,min_indices):
    cluster_missmatch_socre = []
    cluster_i = min_indices[0]
    cluster_j = min_indices[1]
    for col in df.columns: 
        if col is not cluster_i and col is not cluster_j :
            cluster_missmatch_socre.append((df[col][cluster_i]+df[col][cluster_j]) / 2)
    return cluster_missmatch_socre

In [17]:
def upgma_algorithm(seq_list,seq_names):
    # Compute mismatch 
    df = mismatche_matrix(seq_list,seq_names)

    dict_tree_newick_format = {} 
    while len(df) > 1: 
        # Find the pairs (i,j)
        min_val, min_indices = min_val_index_df(df)

        # Create new cluster  for pairs (i,j)
        ## intialize the new cluster name u 
        new_cluster_name = min_indices[0]+min_indices[1]
        ## calculate the distance between u_i and u_j
        cluster_branch_distance = min_val/2
        ## update the tree_newick_format 
        i = str(min_indices[0])
        j = min_indices[1]

        if len(min_indices[0]) > 1:
            i = dict_tree_newick_format[min_indices[0]] 
        if len(min_indices[1]) > 1:
            j = dict_tree_newick_format[min_indices[1]] 

        tree_newick_format = "("+i+":"+str(cluster_branch_distance)+","+j+":"+str(cluster_branch_distance)+")"

        if len(new_cluster_name) > 1:
            dict_tree_newick_format[new_cluster_name] = tree_newick_format
            
        ## Compute the distance between the new cluster and the others 
        cluster_missmatch_socre = cal_cluster_distance(df,min_indices)


        # Upadte the mismatch matrix
        ## Delete pairs i j from the df 
        df.drop(min_indices, axis=1, inplace=True)
        df.drop(min_indices, axis=0, inplace=True)

        ## create new col with new cluster name and a new row 
        ### add col
        df[new_cluster_name] = cluster_missmatch_socre
        ### add row 
        cluster_missmatch_socre.append(0)
        df.loc[new_cluster_name] = cluster_missmatch_socre
    
    return list(dict_tree_newick_format.items())[-1][1]

In [18]:
# Sequences 
a = "ATCGATCG"
b = "GTAGACGA"
c = "ACCGTACG"
d = "TCAGTCAG"
e = "GCCTACAG"

seq_list = [a,b,c,d,e]
seq_names = ['A','B','C','D','E']
upgma_algorithm(seq_list,seq_names)

'(((E:2.0,D:2.0):2.5,(C:1.5,A:1.5):2.5):2.75,B:2.75)'

In [19]:
m3 = [[0,19,27,8,33,18,13],
      [19,0,31,18,36,1,13],
      [27,31,0,26,41,32,29],
      [8,18,26,0,31,17,14],
      [33,36,41,31,0,35,28],
      [18,1,32,17,35,0,12],
      [13,13,29,14,28,12,0]]
seq_names = ['A','B','C','D','E','F','G']
upgma_algorithm(m3,seq_names)

'((((((B:3.0,A:3.0):3.5,G:3.5):3.5,F:3.5):3.5,E:3.5):3.5,D:3.5):3.5,C:3.5)'