In [None]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import leidenalg
import math
import igraph as ig
import matplotlib.pyplot as plt


In [None]:
def create_graph_and_partition(X, k=25, seed=None):
    """
    Build a weighted k-nearest-neighbour graph and cluster it with Leiden.

    Each point is connected to the points returned by a k-nearest-neighbour
    query against the same data. An undirected edge between two points is
    added once (the first time the pair is encountered) and weighted with
    ``exp(-distance)``, so closer points are tied more strongly. The graph is
    then partitioned by modularity optimisation using those edge weights.

    Parameters:
    -----------
    X : array-like
        The dimensionality reduced data; must expose ``.shape[0]`` as the
        number of points.
    k : int, default=25
        Number of nearest neighbors requested per point. The query points are
        the fitted points themselves, so a point that shows up in its own
        neighbour list is skipped rather than turned into a self-loop.
    seed : int or None, default=None
        Seed forwarded to ``leidenalg.find_partition``. Pass an integer to
        make the clustering reproducible between runs; ``None`` keeps
        leidenalg's default (random) seeding.

    Returns:
    --------
    g : igraph.Graph
        The created graph, with the edge attribute ``'weight'`` set.
    partition : leidenalg.VertexPartition
        The partition result from Leiden algorithm
    """
    # Find nearest neighbors
    neighbors = NearestNeighbors(n_neighbors=k).fit(X)
    distances, indices = neighbors.kneighbors(X)

    # Build edge list with weighted edges
    edges = []
    weights = []
    # Set mirror of `edges` so the reverse-pair check is O(1) instead of a
    # linear scan over every edge collected so far.
    seen = set()
    num_points = X.shape[0]

    for i in range(num_points):
        for j, d in zip(indices[i], distances[i]):
            j = int(j)  # plain Python int for hashing and for igraph
            if i == j:
                continue
            if (j, i) in seen:
                # The pair was already added when processing point j.
                continue
            seen.add((i, j))
            edges.append((i, j))
            weights.append(math.exp(-d))

    # Create an igraph Graph, add vertices and edges
    g = ig.Graph()
    g.add_vertices(num_points)
    g.add_edges(edges)

    # Set the edge attribute 'weight' for our weighted graph
    g.es['weight'] = weights

    # Find partition using Leiden algorithm. The weights must be passed
    # explicitly; otherwise every edge counts as 1 and the 'weight'
    # attribute set above is ignored.
    partition = leidenalg.find_partition(
        g,
        leidenalg.ModularityVertexPartition,
        weights='weight',
        seed=seed,
    )
    print("Clusters:", partition)

    return g, partition


In [None]:
# Run Leiden modularity clustering on the graph `g` and show the result.
# NOTE(review): `g` is not assigned in any cell visible here — presumably it
# comes from create_graph_and_partition(...); confirm before Restart & Run All.
partition = leidenalg.find_partition(
    g,
    partition_type=leidenalg.ModularityVertexPartition,
)
print("Clusters:", partition)


In [None]:
# Store the Leiden partition's membership list as a new 'leiden' column.
# NOTE(review): assumes df_z has one row per graph vertex, in the same order
# as the data the graph was built from — confirm against the graph-building cell.
df_z['leiden'] = partition.membership


In [None]:
# Copy clinical annotations from df_meta onto df_z, looking rows up by the
# patient ids stored in df_z (note the column name has a leading space).
patient_ids = df_z[' patient_id'].values.tolist()
for meta_col in ('gender', 'ajcc_pathologic_tumor_stage'):
    # .values drops df_meta's index so the assignment is purely positional.
    df_z[meta_col] = df_meta.loc[patient_ids, meta_col].values
df_z.head()


In [None]:
# Encode the Leiden labels of df_z as integer codes for plotting.
# Labels are compared as strings, so codes follow lexicographic label order
# (e.g. '10' sorts before '2').
data_clusters = df_z['leiden'].astype(str)
data_clusters_set = set(data_clusters.values)
dict_types = {
    label: code
    for code, label in enumerate(sorted(data_clusters_set))
}
cell_clusters = [dict_types[label] for label in data_clusters.values]


In [None]:
# Replace missing values in df_z with 0 (in place), then build a NumPy matrix
# without the first two and the last two columns.
# NOTE(review): the column drops are positional. If 'leiden', 'gender' and
# 'ajcc_pathologic_tumor_stage' have all been appended by the cells above,
# only the last two are removed and 'leiden' stays in np_z — confirm intent.
df_z.fillna(0, inplace=True)
np_z = df_z.to_numpy()
for drop_cols in ([0, 1], [-2, -1]):
    # Second pass indexes from the end of the already-trimmed array.
    np_z = np.delete(np_z, drop_cols, axis=1)


In [None]:
# Encode the Leiden labels of df_log as integer codes for plotting; codes
# follow the lexicographic order of the string labels.
# NOTE(review): this reassigns data_clusters / dict_types / cell_clusters that
# an earlier cell derived from df_z, and no visible cell adds a 'leiden'
# column to df_log — confirm which frame the plot should use.
data_clusters = df_log['leiden'].astype(str)
data_clusters_set = set(data_clusters.values)
dict_types = {
    label: code
    for code, label in enumerate(sorted(data_clusters_set))
}
cell_clusters = [dict_types[label] for label in data_clusters.values]


In [None]:

# Scatter the first two columns of X, coloured by the integer cluster codes
# in `cell_clusters` (built from the 'leiden' column via `dict_types`).
scatter = plt.scatter(X[:, 0], X[:, 1], c=cell_clusters, cmap='Set1')
# num=None makes legend_elements return one handle per distinct colour code.
# The default num="auto" switches to a tick locator above 9 distinct values,
# so the handles would no longer line up with the labels below.
# Handles come back in ascending code order, which matches the insertion
# order of dict_types (label -> 0, 1, 2, ...).
legend1 = plt.legend(
    handles=scatter.legend_elements(num=None)[0],
    labels=list(dict_types.keys()),
    title="Leiden Cluster",  # colours encode Leiden clusters, not tumor stage
    loc="upper right",
    fontsize='small',
    title_fontsize='small',
)
plt.gca().add_artist(legend1)
