In [1]:
import numpy as np 
import pandas as pd
import itertools as it

For a given tree T, we can calculate a distance between two leafs $i$ and $j$, noted as $d_{ij}(T)$

In [2]:
# distance matrix 

m1 = np.array([[0,8,7,12],
               [8,0,9,14],
               [7,9,0,11],
               [12,14,11,0]])

m2 = np.array([[0,2,3,8,14,18],
               [2,0,3,8,14,18],
               [3,3,0,8,14,18],
               [8,8,8,0,14,18],
               [14,14,14,14,0,18],
               [18,18,18,18,18,0]])

In [3]:
def convert_matrix_df(matrix,columns_name=None):
    return pd.DataFrame(matrix, columns=columns_name)

In [4]:
m1_df = convert_matrix_df(m1)
m1_df

Unnamed: 0,0,1,2,3
0,0,8,7,12
1,8,0,9,14
2,7,9,0,11
3,12,14,11,0


In [5]:
m2_df = convert_matrix_df(m2)
m2_df

Unnamed: 0,0,1,2,3,4,5
0,0,2,3,8,14,18
1,2,0,3,8,14,18
2,3,3,0,8,14,18
3,8,8,8,0,14,18
4,14,14,14,14,0,18
5,18,18,18,18,18,0


In the context of molecular phylogenetics, an additive matrix is a technique for displaying the evolutionary distances between sequences. This matrix shows the evolutionary changes that have taken place between various biological sequences, including sequences of DNA, RNA, and proteins.
Based on:
* Buneman’s 4-point condition Theorem:  M is additive if and only if the 4-point condition is satisfied
* 3-point condition Theorem: M is ultrametric if and only if the 3-point condition is satisfied

In [6]:
def is_additive(matrix):    
    comb = it.combinations(range(len(matrix)),4)
    for groupe in comb:
        i,j,k,l=groupe
        if not (matrix[i,j]+matrix[k,l]<=max(matrix[i,k]+matrix[j,l],matrix[i,l]+matrix[j,k])):
            return False
    return True

def is_ultrametrix(matrix):
    comb=it.combinations(range(len(matrix)),3)
    for groupe in comb:
        i,j,k=groupe
        if not(matrix[i,k] <= max(matrix[i,j], matrix[j,k])):
            return False
    return True

In [7]:
print("is M1 additive", is_additive(m1))
print("is M2 ultrametrix", is_ultrametrix(m2))

is M1 additive True
is M2 ultrametrix True


In [8]:
def cluster(df_matrix,i):
    return sum(df_matrix.iloc[:, i])

def all_cluster(df_matrix):
    for column in df_matrix.columns:
        print("Number of cluster in ",column,"is",cluster(df_matrix,column)) 

In [9]:
all_cluster(m1_df)

Number of cluster in  0 is 27
Number of cluster in  1 is 31
Number of cluster in  2 is 27
Number of cluster in  3 is 37


In [10]:
all_cluster(m2_df)

Number of cluster in  0 is 45
Number of cluster in  1 is 45
Number of cluster in  2 is 46
Number of cluster in  3 is 56
Number of cluster in  4 is 74
Number of cluster in  5 is 90


---

The Newick format is a technique for representing hierarchical tree structures. It is frequently used in computer science to depict hierarchical connections and in biology to describe phylogenetic trees, which show the evolutionary links between species.

UPGMA steps: 
1. Align & name 
2. Compare sequences using pairwise sequence alignment 
3. Count the mismatches and records them in the mismatche matrix 
4. Create a new cluster $u$ that joins the Closest Pair $(i,j)$ with the smallest distance $d_{i,j}$
5. Update the MatriX replace the rows and columns that correspond to the two clustered items with a new row and column. Based on the average distance from the newly.
6. Repeat step 4 and 5 until we get one cluster  


In [11]:
# Sequences 
a = "ATCGATCG"
b = "GTAGACGA"
c = "ACCGTACG"
d = "TCAGTCAG"
e = "GCCTACAG"

In [12]:
# Compute pairwise sequence alignment
# Both sequences must possess identical lengths.
def pairwise_sequence_alignment(seq_1,seq_2):
    count_missmatch = 0
    for i in range(len(seq_1)):
        if seq_1[i] != seq_2[i]:
            count_missmatch += 1
    return count_missmatch

In [13]:
# Count the mismatches and records them in the mismatche matrix
# min of mismatche_matrix is added in this function since we will use it later 
def  mismatche_matrix(seq_list):
    n = len(seq_list)
    mismatch_m = np.zeros((n,n))
    min = np.inf

    for i in range(n):
        sub_seq_list = seq_list[i+1:]
        for j in range(len(sub_seq_list)):
            matrix_j = j+i+1
            pairwise_seq = pairwise_sequence_alignment(seq_list[i],sub_seq_list[j])
            mismatch_m[i][matrix_j] = mismatch_m[matrix_j][i] = pairwise_seq
            if pairwise_seq > 0 and pairwise_seq < min:
                min = pairwise_seq
                min_index = [i,matrix_j]

    return mismatch_m, min, min_index

In [14]:
seq_list = [a,b,c,d,e]
seq_names = ['A','B','C','D','E']
tree_newick_format = ""
mismatch_m,min_value, min_index = mismatche_matrix(seq_list)
mismatch_matrix_df = pd.DataFrame(data=mismatch_m, columns=seq_names ,index=seq_names)
print(mismatch_matrix_df)
i_min = min_index[0]
j_min = min_index[1]
print("The closest pairs are",seq_names[i_min],seq_names[j_min],"with mismatch score equal to ", mismatch_m[i_min,j_min])
min_len = min_value / 2
tree_newick_format = "("+str(seq_names[i_min])+":"+str(min_len)+","+str(seq_names[j_min])+":"+str(min_len)+")"
print(tree_newick_format)

     A    B    C    D    E
A  0.0  5.0  3.0  6.0  5.0
B  5.0  0.0  7.0  5.0  5.0
C  3.0  7.0  0.0  4.0  5.0
D  6.0  5.0  4.0  0.0  4.0
E  5.0  5.0  5.0  4.0  0.0
The closest pairs are A C with mismatch score equal to  3.0
(A:1.5,C:1.5)


In [15]:
mismatch_m

array([[0., 5., 3., 6., 5.],
       [5., 0., 7., 5., 5.],
       [3., 7., 0., 4., 5.],
       [6., 5., 4., 0., 4.],
       [5., 5., 5., 4., 0.]])

In [16]:
# compute the new 
mismatch_matrix_df_t = mismatch_matrix_df.copy()
mismatch_matrix_df_t

Unnamed: 0,A,B,C,D,E
A,0.0,5.0,3.0,6.0,5.0
B,5.0,0.0,7.0,5.0,5.0
C,3.0,7.0,0.0,4.0,5.0
D,6.0,5.0,4.0,0.0,4.0
E,5.0,5.0,5.0,4.0,0.0


In [17]:
# add new cluster 
i_min, j_min = min_index[0], min_index[1]
cluster_i = str(seq_names[i_min])
cluster_j = str(seq_names[j_min])
new_cluster_name = cluster_i+cluster_j
print(new_cluster_name)

AC


In [18]:
new_dist = []
for col in mismatch_matrix_df_t.columns: 
    if col is not cluster_i and col is not cluster_j :
        new_dist.append((mismatch_matrix_df_t[col][cluster_i]+mismatch_matrix_df_t[col][cluster_j]) / 2)
new_dist

[6.0, 5.0, 5.0]

In [19]:
# delete 
mismatch_matrix_df_t.drop(mismatch_matrix_df_t.columns[min_index], axis=1, inplace=True)
mismatch_matrix_df_t.drop(mismatch_matrix_df_t.index[min_index], axis=0, inplace=True)
mismatch_matrix_df_t

Unnamed: 0,B,D,E
B,0.0,5.0,5.0
D,5.0,0.0,4.0
E,5.0,4.0,0.0


In [20]:
mismatch_matrix_df_t[new_cluster_name] = new_dist
mismatch_matrix_df_t

Unnamed: 0,B,D,E,AC
B,0.0,5.0,5.0,6.0
D,5.0,0.0,4.0,5.0
E,5.0,4.0,0.0,5.0


In [21]:
# add row 
new_dist.append(0)
mismatch_matrix_df_t.loc[new_cluster_name] = new_dist
mismatch_matrix_df_t

Unnamed: 0,B,D,E,AC
B,0.0,5.0,5.0,6.0
D,5.0,0.0,4.0,5.0
E,5.0,4.0,0.0,5.0
AC,6.0,5.0,5.0,0.0


----

In [24]:
# update mismatch matrix
def update_mismatch_matrix(mismatch_matrix_df,min,min_index):
    # new cluster name 
    # delete 
    mismatch_matrix_df.drop(mismatch_matrix_df.columns[min_index], axis=1, inplace=True)
    mismatch_matrix_df.drop(min_index, axis=0, inplace=True)
    
    # add new cluster 
    i_min, j_min = min_index[0], min_index[1]
    new_cluster_name = str(seq_names[i_min])+str(seq_names[j_min])
    distance_cluster = []
    for in : 
     

    mismatch_matrix_df[new_cluster_name] = []
   


SyntaxError: invalid syntax (2618018363.py, line 12)

In [None]:
if len(mismatch_matrix_df) == 2:
    # joint directly the two clusters 
    pass
else: 
    # Find the closest pairs 
    # update the mismatch 
        
    pass