In [2]:
import numpy as np
import pandas as pd

def upgma(matrix, labels):
    # Initialisation
    n = len(matrix)
    clusters = {i: [labels[i]] for i in range(n)}
    distances = matrix.copy()

    # Convertir en DataFrame pour une gestion facile des distances
    df_distances = pd.DataFrame(distances, index=range(n), columns=range(n))

    while len(clusters) > 1:
        # Trouver les deux clusters les plus proches
        min_dist = np.inf
        to_merge = (None, None)
        for i in df_distances.index:
            for j in df_distances.columns:
                if i != j and df_distances.at[i, j] < min_dist:
                    min_dist = df_distances.at[i, j]
                    to_merge = (i, j)

        # Fusionner les deux clusters
        i, j = to_merge
        new_cluster = clusters[i] + clusters[j]
        clusters[i] = new_cluster
        del clusters[j]

        # Mettre à jour la matrice des distances
        for k in df_distances.index:
            if k != i and k in df_distances.columns:
                dist_i = df_distances.at[i, k]
                dist_j = df_distances.at[j, k]
                df_distances.at[i, k] = df_distances.at[k, i] = (dist_i + dist_j) / 2

        # Supprimer la colonne et la ligne du cluster fusionné
        df_distances = df_distances.drop(index=j, columns=j)

    # Obtenir le cluster final
    return list(clusters.values())

# Exemple de matrice de distances et labels
labels = ['Bsu', 'Bst', 'Lvi', 'Amo', 'Mlu']
data = [
    [0, 0.1715, 0.2147, 0.3091, 0.2326],
    [0.1715, 0, 0.2991, 0.3399, 0.2058],
    [0.2147, 0.2991, 0, 0.2795, 0.3943],
    [0.3091, 0.3399, 0.2795, 0, 0.4289],
    [0.2326, 0.2058, 0.3943, 0.4289, 0]
]

matrix = np.array(data)

# Test de l'algorithme UPGMA
resultat = upgma(matrix, labels)
print("Résultat de l'UPGMA:", resultat)


Résultat de l'UPGMA: [['Bsu', 'Bst', 'Mlu', 'Lvi', 'Amo']]


In [3]:
import numpy as np
import pandas as pd

def upgma(matrix, labels):
    # Initialisation
    n = len(matrix)
    clusters = {i: [labels[i]] for i in range(n)}
    distances = matrix.copy()

    # Convertir en DataFrame pour une gestion facile des distances
    df_distances = pd.DataFrame(distances, index=range(n), columns=range(n))
    history = []  # Liste pour sauvegarder les distances de chaque étape

    while len(clusters) > 1:
        # Trouver les deux clusters les plus proches
        min_dist = np.inf
        to_merge = (None, None)
        for i in df_distances.index:
            for j in df_distances.columns:
                if i != j and df_distances.at[i, j] < min_dist:
                    min_dist = df_distances.at[i, j]
                    to_merge = (i, j)

        # Sauvegarder les distances des clusters avant fusion
        history.append({
            'merged_clusters': (to_merge[0], to_merge[1]),
            'distances': df_distances.copy()
        })

        # Fusionner les deux clusters
        i, j = to_merge
        new_cluster = clusters[i] + clusters[j]
        clusters[i] = new_cluster
        del clusters[j]

        # Mettre à jour la matrice des distances
        for k in df_distances.index:
            if k != i and k in df_distances.columns:
                dist_i = df_distances.at[i, k]
                dist_j = df_distances.at[j, k]
                df_distances.at[i, k] = df_distances.at[k, i] = (dist_i + dist_j) / 2

        # Supprimer la colonne et la ligne du cluster fusionné
        df_distances = df_distances.drop(index=j, columns=j)

    # Sauvegarder les distances finales
    history.append({
        'merged_clusters': None,
        'distances': df_distances
    })

    # Obtenir le cluster final
    return list(clusters.values()), history

# Exemple de matrice de distances et labels
labels = ['Bsu', 'Bst', 'Lvi', 'Amo', 'Mlu']
data = [
    [0, 0.1715, 0.2147, 0.3091, 0.2326],
    [0.1715, 0, 0.2991, 0.3399, 0.2058],
    [0.2147, 0.2991, 0, 0.2795, 0.3943],
    [0.3091, 0.3399, 0.2795, 0, 0.4289],
    [0.2326, 0.2058, 0.3943, 0.4289, 0]
]

matrix = np.array(data)

# Test de l'algorithme UPGMA
resultat, historique_distances = upgma(matrix, labels)

# Afficher le résultat final
print("Résultat de l'UPGMA:", resultat)

# Afficher l'historique des distances
for step in historique_distances:
    print(f"Étape de fusion : {step['merged_clusters']}")
    print("Distances :")
    print(step['distances'])
    print()


Résultat de l'UPGMA: [['Bsu', 'Bst', 'Mlu', 'Lvi', 'Amo']]
Étape de fusion : (0, 1)
Distances :
        0       1       2       3       4
0  0.0000  0.1715  0.2147  0.3091  0.2326
1  0.1715  0.0000  0.2991  0.3399  0.2058
2  0.2147  0.2991  0.0000  0.2795  0.3943
3  0.3091  0.3399  0.2795  0.0000  0.4289
4  0.2326  0.2058  0.3943  0.4289  0.0000

Étape de fusion : (0, 4)
Distances :
        0       2       3       4
0  0.0000  0.2569  0.3245  0.2192
2  0.2569  0.0000  0.2795  0.3943
3  0.3245  0.2795  0.0000  0.4289
4  0.2192  0.3943  0.4289  0.0000

Étape de fusion : (2, 3)
Distances :
        0       2       3
0  0.0000  0.3256  0.3767
2  0.3256  0.0000  0.2795
3  0.3767  0.2795  0.0000

Étape de fusion : (0, 2)
Distances :
         0        2
0  0.00000  0.35115
2  0.35115  0.00000

Étape de fusion : None
Distances :
     0
0  0.0



In [3]:
import numpy as np
import pandas as pd


In [139]:
def search_min(matrix, col_name=None):
    print(type(matrix.columns[0]))
    print("matrix.columns ", matrix.columns)
    print("col_name ", col_name)
    print(matrix)

    if col_name != None:
        val_min = matrix[col_name].min()
        return val_min, (matrix[matrix[col_name]==val_min].index.to_list()[0], col_name)
    else:
        val_min = matrix.min(skipna=True).min()
        return val_min, matrix.stack().idxmin()

In [101]:
def fusion_seq(matrix, dict_dist):
    # dist_min, row_col = search_min(matrix)
    dict_dist[row_col] = dist_min

In [147]:
def calc_dist(matrix, row_col, flag):
    name_col_matrix = matrix.columns
    new_dist_list = []

    row, col = row_col

    # matrix = matrix.transpose()
    for i in name_col_matrix: 
        if i not in row_col:
            # print(matrix.loc[row, i])
            print("col-i ", col, i)
            print("row-i ", row, i)
            print("matrix.loc[row, i] [i, row] ",matrix.loc[row, i], matrix.loc[i, row])
            print("matrix.loc[col, i] [i, col] ",matrix.loc[col, i], matrix.loc[i, col])
            
            nb_groupe = str(row_col).split(',')
            
            if flag % 2 == 0:
                distance = (matrix.loc[row, i] + (matrix.loc[col, i]*(len(nb_groupe)-1))) / len(nb_groupe) #si transpose (triangle inf)
            else :
                distance = (matrix.loc[i, row] + (matrix.loc[i, col]*(len(nb_groupe)-1))) / len(nb_groupe) #si normal (triangle sup)
            new_dist_list.append(distance)
            print("\n")

    matrix = matrix.drop(index=[row, col])
    matrix = matrix.drop(columns=[col, row])
    
    # sinon
    if flag % 2 == 0:
        print("new_dist_list ", (new_dist_list))
        new_dist_list.extend([np.nan]*(len(matrix)-len(new_dist_list)+1))
        matrix.loc[str(row_col)] = [np.nan] * len(matrix)
        matrix[str(row_col)] = new_dist_list
    else:
        # si transpose
        matrix.loc[str(row_col)] = new_dist_list
        matrix[str(row_col)]=[np.nan] * len(matrix)
        
    
    print(matrix)
    return str(row_col), matrix

In [150]:
matrix = pd.read_csv("../matrice_dist_UPGMA.csv", index_col=0)
print(len(matrix))

dist_min, row_col = search_min(matrix)

dict_dist = dict()
fusion_seq(matrix, dict_dist)
print(dict_dist)

i = 1
new_target, matrix = calc_dist(matrix, row_col, i)

print(len(matrix))
while len(matrix) > 1:
    print("new_target ", new_target)
    dist_min, row_col = search_min(matrix, new_target)
    fusion_seq(matrix, dict_dist)
    new_target, matrix = calc_dist(matrix, row_col, i)
    print("i ",i)
    i+=1

# alterne entre transposer et normal ... : pq ? 


5
<class 'str'>
matrix.columns  Index(['Bsu', 'Bst', 'Lvi', 'Amo', 'Mlu'], dtype='object')
col_name  None
     Bsu     Bst     Lvi     Amo     Mlu
Bsu  NaN  0.1715  0.2147  0.3091  0.2326
Bst  NaN     NaN  0.2991  0.3399  0.2058
Lvi  NaN     NaN     NaN  0.2795  0.3943
Amo  NaN     NaN     NaN     NaN  0.4289
Mlu  NaN     NaN     NaN     NaN     NaN
{('Bsu', 'Bst'): 0.1715}
col-i  Bst Lvi
row-i  Bsu Lvi
matrix.loc[row, i] [i, row]  0.2147 nan
matrix.loc[col, i] [i, col]  0.2991 nan


col-i  Bst Amo
row-i  Bsu Amo
matrix.loc[row, i] [i, row]  0.3091 nan
matrix.loc[col, i] [i, col]  0.3399 nan


col-i  Bst Mlu
row-i  Bsu Mlu
matrix.loc[row, i] [i, row]  0.2326 nan
matrix.loc[col, i] [i, col]  0.2058 nan


                Lvi     Amo     Mlu  ('Bsu', 'Bst')
Lvi             NaN  0.2795  0.3943             NaN
Amo             NaN     NaN  0.4289             NaN
Mlu             NaN     NaN     NaN             NaN
('Bsu', 'Bst')  NaN     NaN     NaN             NaN
4
new_target  ('Bsu', 'Bst'

IndexError: list index out of range

In [111]:
matrix2 = pd.read_csv("../matrice_test.csv", index_col=0)

dist_min, row_col = search_min(matrix2, "(Bst, Bsu)")

dict_dist = dict()
fusion_seq(matrix2, dict_dist)
print(dict_dist)

calc_dist(matrix2, row_col)

            Lvi     Amo     Mlu  (Bst, Bsu)
Lvi         NaN  0.2795  0.3943      0.2569
Amo         NaN     NaN  0.4289      0.3245
Mlu         NaN     NaN     NaN      0.2192
(Bst, Bsu)  NaN     NaN     NaN         NaN
{('Mlu', '(Bst, Bsu)'): 0.2192}
col-i  (Bst, Bsu) Lvi
row-i  Mlu Lvi
matrix.loc[row, i] [i, row]  nan 0.3943
matrix.loc[col, i] [i, col]  nan 0.2569


col-i  (Bst, Bsu) Amo
row-i  Mlu Amo
matrix.loc[row, i] [i, row]  nan 0.4289
matrix.loc[col, i] [i, col]  nan 0.3245


                       Lvi     Amo  ('Mlu', '(Bst, Bsu)')
Lvi                    NaN  0.2795                    NaN
Amo                    NaN     NaN                    NaN
('Mlu', '(Bst, Bsu)')  NaN     NaN                    NaN


"('Mlu', '(Bst, Bsu)')"

In [112]:
matrix = pd.read_csv("../matrice_dist_UPGMA.csv", index_col=0)

dist_min, row_col = search_min(matrix)

dict_dist = dict()
fusion_seq(matrix, dict_dist)
print(dict_dist)

calc_dist(matrix, row_col)

     Bsu     Bst     Lvi     Amo     Mlu
Bsu  NaN  0.1715  0.2147  0.3091  0.2326
Bst  NaN     NaN  0.2991  0.3399  0.2058
Lvi  NaN     NaN     NaN  0.2795  0.3943
Amo  NaN     NaN     NaN     NaN  0.4289
Mlu  NaN     NaN     NaN     NaN     NaN
{('Bsu', 'Bst'): 0.1715}
col-i  Bst Lvi
row-i  Bsu Lvi
matrix.loc[row, i] [i, row]  0.2147 nan
matrix.loc[col, i] [i, col]  0.2991 nan


col-i  Bst Amo
row-i  Bsu Amo
matrix.loc[row, i] [i, row]  0.3091 nan
matrix.loc[col, i] [i, col]  0.3399 nan


col-i  Bst Mlu
row-i  Bsu Mlu
matrix.loc[row, i] [i, row]  0.2326 nan
matrix.loc[col, i] [i, col]  0.2058 nan


                Lvi     Amo     Mlu  ('Bsu', 'Bst')
Lvi             NaN  0.2795  0.3943          0.2569
Amo             NaN     NaN  0.4289          0.3245
Mlu             NaN     NaN     NaN          0.2192
('Bsu', 'Bst')  NaN     NaN     NaN             NaN


"('Bsu', 'Bst')"