In [None]:
# -*- coding: UTF-8 -*-
import os,re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
# 这两行代码解决 plt 中文显示的问题
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.cm as cm
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score # CH 指标，DBI 
from sklearn.metrics import silhouette_samples, silhouette_score # 轮廓系数
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cdist#cdist(XA, XB, 'correlation')
from dcor import distance_correlation as dcorr
from scipy.stats import pearsonr
#    1）输入：x为特征，y为目标变量.
#    2）输出：r： 相关系数 [-1，1]之间，p-value: p值。
#         注： p值越小，表示相关系数越显著，一般p值在500个样本以上时有较高的可靠性。
import time
import warnings
warnings.filterwarnings("ignore")
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')#日志只显示error级别的错误

from pckmeans import PCKMeans
from mpckmeans import MPCKMeans

##my
from labels2color import Labels2Color
#from topological_overlap_measure import TOMsimilarity
from topological_overlap_measure import *
#from soothsayer.networks import topological_overlap_measure

from my_functions import *

# ---------------------------------------------------------------------------
# Run configuration: which dataset to analyse and where inputs/outputs live.
# ---------------------------------------------------------------------------
# Datasets this notebook is run on (kept for reference):
# cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD",  "PAAD"]
distance = "dcorr"  # similarity measure; "pearson" is the alternative noted by the author
power = 20  # forwarded to pickSoftThreshold() when the network is built
filename = 'LUAD.tcga_gtex.tpm.updown.feature_selection.csv'
# filename = 'BLCA.tumor.tpm.updown.feature_selection.csv'

# The file name is dot-separated: field 0 becomes the per-dataset folder name
# (`prefix`), field 1 an extra folder level appended to most base directories.
_name_fields = filename.split('.')
prefix = _name_fields[0]
_source_tag = _name_fields[1]

_project_root = "E:/Project/Project001 WGCNA/main"
data_indir = _project_root + "/step-3-FeatureSelection/outdir"
out_dir = _project_root + "/step-7-clustering/semi_supervised Kmeans/outdir"
mldir = _project_root + "/step-6-FactorAnalyzer/outdir" + "/" + _source_tag
labelsdir = _project_root + "/step-4-wgcna/outdir" + "/" + _source_tag
dcorr_dir = (_project_root
             + "/step-7-clustering/Calculate the correlation coefficient/outdir"
             + "/" + _source_tag)
outdir = out_dir + "/" + _source_tag + "/" + distance

# Make sure the per-dataset result folder exists before anything is written.
_result_dir = os.path.join(outdir, prefix)
if not os.path.exists(_result_dir):
    os.makedirs(_result_dir)

# Score board filled in by the later cells: one independent dict per method.
assess = {
    method: {metric: 0 for metric in ("SI", "CH", "DBI", "module_density_max")}
    for method in ("wgcna", "pckmeans")
}

In [None]:
#####################################################################
### Build the co-expression network: load the similarity matrix   ###
#####################################################################
# Obtain the gene-by-gene similarity matrix `sim` for the configured `distance`.
if distance in ("pearson", "snn"):
    # Both measures are computed here from the feature-selected expression table.
    data = pd.read_csv(os.path.join(data_indir, prefix, filename), sep=',', index_col=0)
    print("Input: data.shape", data.shape)
    print("NAN check for data: ", data.isnull().values.sum())
    if distance == "pearson":
        # DataFrame.corr() correlates columns, so transpose to correlate the rows of `data`.
        sim = data.T.corr()
    else:
        sim = snn_sim_matrix(data, k=5)
elif distance == "dcorr":
    # The distance-correlation matrix is read from a CSV under `dcorr_dir`.
    # NOTE(review): str.rstrip('.csv') strips trailing characters from the set
    # {'.', 'c', 's', 'v'}, not the literal suffix. It yields the expected stem for
    # the current file name and is kept so the name matches the file on disk.
    sim = pd.read_csv(os.path.join(dcorr_dir, prefix, filename.rstrip('.csv')+".dcorr.csv"), sep=',')
    # The CSV is read without an index column, so row labels are taken from the header.
    sim.index = sim.columns
else:
    # Fail fast: continuing would only crash later with a NameError on `sim`.
    raise ValueError("input parameter error: undefined distance type %r" % (distance,))

print("NAN check: ", sim.isnull().values.sum())
if sim.isnull().values.any():
    # First remove rows, then columns, that consist entirely of NaN ...
    sim = sim.loc[~sim.isnull().all(axis=1)]
    sim = sim.loc[:, ~sim.isnull().all(axis=0)]
    # ... then replace the remaining missing similarities with 0.
    sim = sim.fillna(0)
print("After processing NA: sim.shape", sim.shape)
sim.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".sim.csv"), index=False, sep=',')


# Call the R script to estimate the soft-thresholding power.
# NOTE(review): pickSoftThreshold is not defined in this file (it arrives through a
# star import); the code below relies on the result exposing "powerEstimate" and
# "fitIndices" (the latter with a to_csv method).
sft = pickSoftThreshold(sim, power=power, RsquaredCut = 0.75, prefix=prefix, outdir=outdir)
print('powerEstimate:%s' % sft["powerEstimate"])
# NOTE(review): str.rstrip('.csv') strips a trailing character set, not the suffix;
# it gives the expected stem for the current file name. Also note that no separator
# is inserted before "sft_fitIndices.csv" in the resulting file name.
sft["fitIndices"].to_csv(os.path.join(outdir, prefix, filename.rstrip('.csv')+"sft_fitIndices.csv"), index=False, sep=',')


# Check scale-free topology.
# Create the adjacency network by soft thresholding: the absolute similarity is
# raised to the estimated power (not a fixed beta) to approximate a scale-free network.
ADJ = abs(sim)**sft["powerEstimate"] # approximate scale-free transformation
print("Input: ADJ", "Max: {}".format(ADJ.values.ravel().max()), "Min: {}".format(ADJ.values.ravel().min()), sep="\n")
print("NAN check: ", ADJ.isnull().values.sum())
ADJ.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".ADJ.csv"), index=False, sep=',')

# Connectivity matrix: the adjacency values with the diagonal (self-connections) set to 0.
# NOTE(review): `ADJ.values` can return a view of ADJ's own data; in that case the
# assignment below also zeroes the diagonal of ADJ before it is passed to
# TOMsimilarity(ADJ). The ADJ CSV above is written before this happens. Confirm
# whether the aliasing is intended before refactoring.
Connectivity = ADJ.values
row, col = np.diag_indices_from(Connectivity)
Connectivity[row, col] = 0
# Rebuilt without an explicit index, so rows get a default integer index while the
# columns keep ADJ's labels; later cells select from it with boolean masks.
Connectivity = pd.DataFrame(Connectivity, columns=ADJ.columns)
Connectivity.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".Connectivity.csv"), index=False, sep=',')

# Compute topological overlap
#from topological_overlap_measure import TOMsimilarity
from topological_overlap_measure import *
#from soothsayer.networks import topological_overlap_measure
TOM = TOMsimilarity(ADJ)# convert the adjacency matrix into the topological overlap matrix
# Dissimilarity used as the clustering input.
dissTOM = 1 - TOM

TOM.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".TOM.csv"), index=False, sep=',')
dissTOM.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".dissTOM.csv"), index=False, sep=',')

# Work on a copy so later cells cannot modify dissTOM through X.
X = dissTOM.copy()


# Embed the data into 3 dimensions for the scatter plots drawn by the later cells.
# The import sits outside the try block on purpose: the fallback in the handler
# also needs `umap`, so a failed import must surface as an ImportError here
# instead of as a NameError raised from inside the except branch.
import umap.umap_ as umap

try:
    X_Dim = umap.UMAP(n_components=3).fit_transform(X)
    # Alternative embedding with t-SNE:
    # X_Dim = TSNE(n_components=3, learning_rate=100, random_state=0).fit_transform(X)
except Exception as e:
    print("\n==============================")
    print("Data dimension reduction failed.\n")
    print(e)
    print("==============================\n")
    # Best-effort fallback: embed the adjacency matrix instead of the dissTOM matrix.
    # X_Dim = TSNE(n_components=2, learning_rate=100, random_state=0).fit_transform(ADJ)
    X_Dim = umap.UMAP(n_components=3).fit_transform(ADJ)


In [None]:
###############################################################
### Score the WGCNA clustering result (used as reference)   ###
###############################################################
print("WGCNA==========>")
method = 'WGCNA'

# Load the WGCNA module assignment and put it in the same gene order as X.
wgcna_y_pred = pd.read_csv(os.path.join(labelsdir, prefix, prefix+".wgcna.result.csv"), index_col=False)
wgcna_y_pred.index = wgcna_y_pred["gene"].values
wgcna_y_pred = wgcna_y_pred.loc[X.index]
wgcna_labels = wgcna_y_pred["dynamicMods"].values

_result_stem = os.path.join(outdir, prefix, filename.rstrip('.csv'))

# Module sizes: one row per module, most frequent first.
_module_sizes = pd.Series(wgcna_labels).value_counts()
labels_counts = pd.DataFrame(
    {"clusters": _module_sizes.index, "counts": _module_sizes.values},
    index=_module_sizes.index,
)
labels_counts.to_csv(_result_stem+".%s_clustering.labels_counts.csv" % method, index=False)

# Gene -> module table.
_gene_to_module = pd.DataFrame(np.array([X.index, wgcna_labels]).T, columns=["gene", "labels"])
_gene_to_module.to_csv(_result_stem+".%s_clustering.cluster_labels.csv" % method, index=False)

# Number of modules; a label of -1 marks noise points and is not counted.
unique_labels = np.unique(wgcna_labels)
n_clusters_ = len(unique_labels)
if -1 in wgcna_labels:
    n_clusters_ -= 1
print("Number of clusters with WGCNA:%d" % n_clusters_)
WGCNA_K = n_clusters_

# Internal validation indices of the WGCNA partition on X.
SI = silhouette_score(X, wgcna_labels)
CH = calinski_harabasz_score(X, wgcna_labels)
DBI = davies_bouldin_score(X, wgcna_labels)
sample_silhouette_values = silhouette_samples(X, wgcna_labels)
assess['wgcna'].update(SI=SI, CH=CH, DBI=DBI)

print("For n_clusters =", n_clusters_,
        "\nThe average silhouette_score is :", SI)
print("Calinski-Harabasz Score",  CH,
         "\nDavies Bouldin score is :", DBI)


# Silhouette plot per WGCNA module (left) next to the 3-D embedding coloured by
# module (right).
plt.rcParams.update({'font.family': 'Times New Roman'})
plt.rcParams.update({'font.weight': 'normal'})
plt.rcParams.update({'font.size': 20})

# Create one figure with exactly the two axes that are drawn on. Calling
# plt.clf() and plt.subplots() first and then add_subplot() again left an
# orphan figure open and stacked new axes on top of unused ones.
fig = plt.figure(figsize=(18, 7))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, projection='3d')

# Plot every label that actually occurs (noise label -1 excluded) instead of
# assuming the labels are exactly 0..n-1; for such labels the result is the same.
cluster_ids = [label for label in np.unique(wgcna_labels) if label != -1]
n_plotted = len(cluster_ids)

ax1.set_xlim([-0.1, 1])
# Room for all samples plus a 10-row gap around every cluster band.
ax1.set_ylim([0, X.shape[0] + (n_plotted + 1) * 10])

y_lower = 10
for position, label in enumerate(cluster_ids):
    # Boolean indexing returns a copy, so sorting does not disturb the sample order.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[wgcna_labels == label])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(position)/n_plotted)
    ax1.fill_betweenx(np.arange(y_lower, y_upper)
                     ,ith_cluster_silhouette_values
                     ,facecolor=color
                     ,alpha=0.7
                     )
    # Write the cluster label at the vertical centre of its band.
    ax1.text(-0.05
             , y_lower + 0.5 * size_cluster_i
             , str(label))
    y_lower = y_upper + 10
ax1.set_title("The Silhouette plot for the various clusters.", fontsize=18)
ax1.set_xlabel("The Silhouette coefficient values", fontsize=16)
ax1.set_ylabel("Cluster label", fontsize=16)
# Red dashed line: the average silhouette score over all samples.
ax1.axvline(x=SI, color="red", linestyle="--")
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

colors = Labels2Color(X, wgcna_labels)['colorcode']
ax2.scatter3D(X_Dim[:, 0], X_Dim[:, 1],X_Dim[:, 2]
           ,marker='o'
           ,s=4
           ,c=colors
           )
ax2.set_title("The visualization of the clustered data.", fontsize=18)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16)
ax2.set_zlabel("z", fontsize=20)

fig.suptitle(("Silhouette analysis for WGCNA clustering on %s data with n_clusters = %d" % (prefix, WGCNA_K)),
             fontsize=20, fontweight='bold')
fig.savefig(os.path.join(outdir, prefix, filename.rstrip('.csv')+".Silhouette analysis for WGCNA clustering on %s data.png" % prefix)
            , bbox_inches='tight', dpi=1080)
fig.savefig(os.path.join(outdir, prefix, filename.rstrip('.csv')+".Silhouette analysis for WGCNA clustering on %s data.pdf" % prefix)
            , bbox_inches='tight')
# Close this figure explicitly so nothing stays open after the cell.
plt.close(fig)

### WGCNA bar charts: genes per module, mean intra-module connectivity, module density ###

width = 0.35  # bar width shared by the three charts


def _wgcna_module_bar(x, heights, ylabel, stem, title=None):
    """Draw one light-sky-blue bar chart over the modules and save it as PNG and PDF.

    `stem` is appended to the result file stem; `title` is optional because only
    one of the three charts carries a title. Returns the (already closed) figure.
    """
    figure = plt.figure(figsize=(6, 6), dpi=100)  # 6x6 inch canvas at 100 dpi
    axes = figure.add_subplot(111)
    if title is not None:
        axes.set_title(title, fontsize=18)
    axes.set_xlabel("Module", fontsize=16)
    axes.set_ylabel(ylabel, fontsize=16)
    axes.bar(x, heights, width, color="#87CEFA")
    target = os.path.join(outdir, prefix, filename.rstrip('.csv')+stem)
    figure.savefig(target+".png", dpi=1080, bbox_inches='tight')
    figure.savefig(target+".pdf", bbox_inches='tight')
    plt.close(figure)
    return figure


# 1) Number of genes in every module.
fig = _wgcna_module_bar(labels_counts.loc[:, "clusters"], labels_counts.loc[:, "counts"]
                        , "Number of genes", ".wgcna.Number of genes in the module")

_module_ids = np.sort(np.unique(wgcna_labels))

# 2) Mean connectivity inside a module: total intra-module connectivity / module size.
values = []
for k in _module_ids:
    _members = wgcna_labels == k
    _within = sum(Connectivity.loc[_members, _members].sum())
    values.append(_within/sum(_members))
fig = _wgcna_module_bar(_module_ids, values, "Mean Connectivity"
                        , ".wgcna.Mean_Connectivity_of_module", title="Mean Connectivity of module")

# 3) Module density: total intra-module connectivity / (n * (n - 1)) for a module of n genes.
values = []
for k in _module_ids:
    _members = wgcna_labels == k
    _within = sum(Connectivity.loc[_members, _members].sum())
    values.append(_within/(sum(_members)*(sum(_members)-1)))
assess['wgcna']['module_density_max'] = max(values)
fig = _wgcna_module_bar(_module_ids, values, "Density of module", ".wgcna.density_of_module")

In [None]:
######################################################
### Learning curve of the PCKMeans parameter K     ###
######################################################
print("Calculate the learning curve for K ==========>")

score = {'SI': [], 'CH': [], 'DBI': []}

# Read the constraint file and expand every gene set into must-link pairs.
# As consumed here the file is tab-separated: the first line lists the genes, every
# following line is one gene set. The last field of each line is discarded
# (presumably the "\n" left after a trailing tab -- confirm against the writer).
with open(os.path.join(mldir, prefix, prefix+".ml.txt"), 'r') as df:
    geneslist = df.readline().split("\t")
    byt = df.readlines()
geneslist.pop()

# gene -> position of its first occurrence in the header (what list.index returned),
# built once so the pair expansion below does not rescan the list for every gene.
gene_position = {}
for position, gene in enumerate(geneslist):
    gene_position.setdefault(gene, position)

# NOTE(review): the pairs hold positions in `geneslist`; PCKMeans is fitted on
# np.array(X), so this presumably requires X's rows to be in the same gene order.
ml = set()
for index, value in enumerate(byt):
    temp = value.split("\t")
    temp.pop()
    byt[index] = temp
    members = [gene_position[gene] for gene in temp]
    for i in range(len(members)):
        for j in range(i+1, len(members)):
            ml.add((members[i], members[j]))
print("Constraints in pairs gene number:%d" % len(ml))

K_sequence = range(2, math.ceil(WGCNA_K*1.2), 1) #range(6, 11, 1)
print("K = ", list(K_sequence))
if len(K_sequence) == 0:
    raise ValueError("No candidate K to evaluate: WGCNA_K=%r gives an empty K range." % (WGCNA_K,))

for k in K_sequence:
    print("---->K = %d" % k)
    try:
        ### Evaluate the PCKMeans clustering for this K ###
        pckm = PCKMeans(n_clusters=k) #, distance_type='euclidean'
        pckm.fit(np.array(X), ml=ml)
        pckm_labels = pckm.labels_

        SI = silhouette_score(X, pckm_labels)
        CH = calinski_harabasz_score(X, pckm_labels)
        DBI = davies_bouldin_score(X, pckm_labels)
    except Exception as e:
        print("\n==============================")
        print("    Error processing %s" % k)
        print("    ", e)
        # Record a failed K as NaN: a 0 would be indistinguishable from a real
        # score (it beats every negative SI and looks like a perfect DBI).
        score["SI"].append(np.nan)
        score["CH"].append(np.nan)
        score["DBI"].append(np.nan)
        print("==============================\n")
    else:
        score["SI"].append(SI)
        score["CH"].append(CH)
        score["DBI"].append(DBI)
        print("The average silhouette_score is :%.4f" % SI)

# The optimal K maximises the silhouette score among the K values that succeeded.
si_curve = np.array(score["SI"], dtype=float)
if np.all(np.isnan(si_curve)):
    # Every K failed: keep the best-effort behaviour and fall back to the first K.
    print("No K could be evaluated; falling back to K = %d" % K_sequence[0])
    default_K = K_sequence[0]
else:
    default_K = K_sequence[int(np.nanargmax(si_curve))]
print('The optimal K value:%d' % default_K)
xticks = list(K_sequence)

# Learning-curve table; the last row holds the WGCNA reference scores.
pd.DataFrame.from_dict({'K_sequence':list(K_sequence) + [WGCNA_K]
                        ,'SI':score["SI"] + [assess['wgcna']["SI"]]
                        ,"CH":score["CH"] + [assess['wgcna']["CH"]]
                        ,"DBI":score["DBI"] + [assess['wgcna']["DBI"]]
                        ,'type':['pckmeans']*len(K_sequence)+['wgcna']}
               ).to_csv(os.path.join(outdir, prefix, "%s.K_learn-curve.csv" % filename.rstrip('.csv')), index=False, sep=',')



###################################################
### Visualization of the learning curve  ###
###################################################
# One panel per validation index: the PCKMeans curve over K (green) plus the
# WGCNA reference drawn as a single red star at K = WGCNA_K.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(18, 6)

_panels = (
    (ax1, "SI", 'Silhouette Coefficient Score'),
    (ax2, "CH", 'Calinski Harabasz Score'),
    (ax3, "DBI", 'Davies Bouldin Score'),
)
for _panel, _metric, _axis_label in _panels:
    _panel.plot(xticks, score[_metric]
                , color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA'
                , marker='^', markerfacecolor='green', markersize=10)
    _panel.scatter(WGCNA_K, assess['wgcna'][_metric], s=100, c='r', marker='*', alpha=0.8)
    _panel.set_xlabel('K', fontdict={'size': 16})
    _panel.set_ylabel(_axis_label, fontdict={'size': 16})
    _panel.set_title(_metric, fontdict={'size': 18})
    _panel.legend(loc='best')

plt.suptitle(("Learning curve for K values of ScC-WGCNA clustering on %s data" % prefix), fontsize=20, fontweight='bold')

# Same base name for both output formats; the PDF is written first, then the PNG.
_curve_name = "%s.Learning curve for K values of ScC-WGCNA clustering on %s data" % (filename.rstrip('.csv'), filename.split('.')[0])
fig.savefig(os.path.join(outdir, prefix, _curve_name + ".pdf"), bbox_inches='tight')
fig.savefig(os.path.join(outdir, prefix, _curve_name + ".png"), dpi=1080, bbox_inches='tight')
plt.close()



In [None]:
##############################################################################################
### Cluster with PCKMeans for the chosen K on the selected co-expression distance matrix   ###
##############################################################################################
print("PCKMeans==========>")

# K is fixed by hand. The author's commented-out alternative derived it from the
# learning curve (default_K, with a fallback when default_K <= 3).
K = 4
print(K)

# Read the constraint gene sets and expand them into must-link pairs. The file is
# opened in a with-block so the handle is always closed.
# The last field of every line is discarded (presumably the "\n" left after a
# trailing tab -- confirm against the writer of the file).
with open(os.path.join(mldir, prefix, prefix+".ml.txt"), 'r') as df:
    geneslist = df.readline().split("\t")
    byt = df.readlines()
geneslist.pop()

# gene -> position of its first occurrence in the header (what list.index returned),
# built once instead of rescanning the list for every pair member.
gene_position = {}
for position, gene in enumerate(geneslist):
    gene_position.setdefault(gene, position)

# NOTE(review): the pairs hold positions in `geneslist`; PCKMeans is fitted on
# np.array(X), so this presumably requires X's rows to be in the same gene order.
ml = set()
for index, value in enumerate(byt):
    temp = value.split("\t")
    temp.pop()
    byt[index] = temp
    members = [gene_position[gene] for gene in temp]
    for i in range(len(members)):
        for j in range(i+1, len(members)):
            ml.add((members[i], members[j]))
print("Number of pairwise constraints:%d\n" % len(ml))


### PCKMeans clustering ###
t1 = time.time()
pckm = PCKMeans(n_clusters=K)#, distance_type='euclidean'
pckm.fit(np.array(X), ml=ml)
t2 = time.time()
print ("the time of clustering is %.5fs" % (t2 - t1))
pckm_labels = pckm.labels_


# Cluster sizes. Series.value_counts replaces the deprecated top-level pd.value_counts.
cluster_sizes = pd.Series(pckm_labels).value_counts()
print("value_counts:\n", pd.DataFrame(cluster_sizes).T)
labels_counts = pd.DataFrame(cluster_sizes)
labels_counts.insert(0, "clusters", labels_counts.index)
labels_counts.columns = ["clusters", "counts"]
labels_counts.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".PCKMeans_clustering.labels_counts.csv"), index=False)
# Gene -> cluster table.
pd.DataFrame(np.array([X.index, pckm_labels]).T, columns=["gene", "labels"]).to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".PCKMeans_clustering.cluster_labels.csv"), index=False)#, header=None


# Number of clusters actually produced; a label of -1 (noise) is not counted.
unique_labels = np.unique(pckm_labels)
n_clusters_ = len(unique_labels) - (1 if -1 in pckm_labels else 0)

# PCKMeans bar charts: genes per module, mean intra-module connectivity, module density.
# Every chart is saved as PNG and PDF, and the plotted numbers as a CSV of the same name.

width = 0.35  # bar width shared by the three charts


def _pckm_module_bar(x, heights, title, ylabel, stem, table):
    """Draw one light-sky-blue bar chart over the modules; save PNG, PDF and `table` as CSV.

    `stem` is appended to the result file stem. Returns the (already closed) figure.
    """
    target = os.path.join(outdir, prefix, filename.rstrip('csv')+distance+stem)
    figure = plt.figure(figsize=(6, 6), dpi=200)  # 6x6 inch canvas at 200 dpi
    axes = figure.add_subplot(111)
    axes.set_title(title, fontsize=18)
    axes.set_xlabel("Module", fontsize=16)
    axes.set_ylabel(ylabel, fontsize=16)
    axes.bar(x, heights, width = width, color="#87CEFA")
    # bbox_inches='tight' keeps labels from being cut off in the saved image.
    figure.savefig(target+".png", bbox_inches='tight', dpi=1080)
    figure.savefig(target+".pdf", bbox_inches='tight')
    table.to_csv(target+".csv", index=False, sep=',')
    plt.close(figure)
    return figure


# 1) Number of genes in every module.
fig = _pckm_module_bar(labels_counts.loc[:, "clusters"], labels_counts.loc[:, "counts"]
                       , "Number of genes in the module", "Number of genes"
                       , ".Number of genes in the module"
                       , pd.DataFrame.from_dict({"clusters":labels_counts.loc[:, "clusters"]
                                                 , "counts":labels_counts.loc[:, "counts"]}))

_module_ids = np.sort(np.unique(pckm_labels))

# 2) Mean connectivity inside a module: total intra-module connectivity / module size.
values = []
for k in _module_ids:
    _members = pckm_labels == k
    _within = sum(Connectivity.loc[_members, _members].sum())
    values.append(_within/sum(_members))
fig = _pckm_module_bar(_module_ids, values, "Mean Connectivity of module", "Mean Connectivity"
                       , ".Mean_Connectivity_of_module"
                       , pd.DataFrame.from_dict({'pckm_labels':_module_ids, 'values':values}))

# 3) Module density: total intra-module connectivity / (n * (n - 1)) for a module of n genes.
values = []
for k in _module_ids:
    _members = pckm_labels == k
    _within = sum(Connectivity.loc[_members, _members].sum())
    values.append(_within/(sum(_members)*(sum(_members)-1)))
assess['pckmeans']['module_density_max'] = max(values)
fig = _pckm_module_bar(_module_ids, values, "Density of module", "Density"
                       , ".density_of_module"
                       , pd.DataFrame.from_dict({'pckm_labels':_module_ids, 'values':values}))

# Internal validation indices of the PCKMeans partition on X.
SI = silhouette_score(X, pckm_labels)
CH = calinski_harabasz_score(X, pckm_labels)
DBI = davies_bouldin_score(X, pckm_labels)
assess['pckmeans']["SI"] = SI
assess['pckmeans']["CH"] = CH
assess['pckmeans']["DBI"] = DBI
print("For n_clusters =", n_clusters_,
        "\nThe average silhouette_score is :", SI)
print("Calinski-Harabasz Score",  CH,
         "\nDavies Bouldin score is :", DBI)

sample_silhouette_values = silhouette_samples(X, pckm_labels)

# Silhouette plot per cluster (left) next to the 3-D embedding coloured by cluster (right).
plt.rcParams.update({'font.family': 'Times New Roman'})
plt.rcParams.update({'font.weight': 'normal'})
plt.rcParams.update({'font.size': 20})

# Create one figure with exactly the two axes that are drawn on. Creating a figure
# with plt.figure(), replacing it with plt.subplots() and then calling add_subplot()
# again left an unused figure open and stacked new axes on top of unused ones.
fig = plt.figure(figsize=(18, 7))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, projection='3d')

# Plot every label that actually occurs (noise label -1 excluded) instead of
# assuming the labels are exactly 0..n-1; for such labels the result is the same.
cluster_ids = [label for label in np.unique(pckm_labels) if label != -1]
n_plotted = len(cluster_ids)

ax1.set_xlim([-0.1, 1])
# Room for all samples plus a 10-row gap around every cluster band.
ax1.set_ylim([0, X.shape[0] + (n_plotted + 1) * 10])

y_lower = 10
for position, label in enumerate(cluster_ids):
    # Boolean indexing returns a copy, so sorting does not disturb the sample order.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[pckm_labels == label])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(position)/n_plotted)
    ax1.fill_betweenx(np.arange(y_lower, y_upper)
                     ,ith_cluster_silhouette_values
                     ,facecolor=color
                     ,alpha=0.7
                     )
    # Write the cluster label at the vertical centre of its band.
    ax1.text(-0.05
             , y_lower + 0.5 * size_cluster_i
             , str(label))
    y_lower = y_upper + 10
ax1.set_title("The Silhouette plot for the various clusters.", fontsize=18)
ax1.set_xlabel("The Silhouette coefficient values", fontsize=16)
ax1.set_ylabel("Cluster label", fontsize=16)
# Red dashed line: the average silhouette score over all samples.
ax1.axvline(x=SI, color="red", linestyle="--")
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

colors = Labels2Color(X, pckm_labels)['colorcode']
ax2.scatter3D(X_Dim[:, 0], X_Dim[:, 1],X_Dim[:, 2]
           ,marker='o'
           ,s=4
           ,c=colors
           )
ax2.set_title("The visualization of the clustered data.", fontsize=18)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16, rotation=38) # y-axis label rotated by 38 degrees
ax2.set_zlabel('Z')

fig.suptitle(("Silhouette analysis for ScC-WGCNA clustering on %s data with n_clusters = %d" % (prefix, n_clusters_)),
             fontsize=20, fontweight='bold')
# bbox_inches='tight' keeps labels from being cut off in the saved image.
fig.savefig(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".Silhouette analysis for ScC-WGCNA clustering on %s data.png" % prefix)
            , bbox_inches='tight', dpi=1080)
fig.savefig(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".Silhouette analysis for ScC-WGCNA clustering on %s data.pdf" % prefix)
            , bbox_inches='tight')
# Close this figure explicitly so nothing stays open after the cell.
plt.close(fig)

# Persist the WGCNA vs. PCKMeans score board (rows: metrics, columns: methods).
pd.DataFrame.from_dict(assess).to_csv(os.path.join(outdir, prefix, filename.rstrip('.csv')+".ScC-WGCNA_vs_WGCNA.assess.csv"), sep=',',index=True)

In [None]:
###########################################################
### The visualization of the clustered data ###
###########################################################
# Side-by-side 3-D scatter of the embedded genes, coloured by the WGCNA modules
# (left) and by the PCKMeans clusters (right).
plt.rcParams.update({'font.family': 'Times New Roman'})
plt.rcParams.update({'font.weight': 'normal'})
plt.rcParams.update({'font.size': 20})

# Create a single 18x7 inch figure and add the two 3-D axes directly. The earlier
# sequence plt.clf() / plt.figure() / plt.subplots() left two unused figures open
# and needed axis('off') to hide the 2-D axes that the 3-D axes were stacked on.
fig = plt.figure(figsize=(18, 7))

# Left panel: WGCNA modules.
colors1 = Labels2Color(X, wgcna_labels)['colorcode']
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax1.scatter3D(X_Dim[:, 0], X_Dim[:, 1], X_Dim[:, 2]
           ,marker='o'
           ,s=4
           ,c=colors1
           )
ax1.set_title("WGCNA with n_clusters = %d" % WGCNA_K, fontsize=18)
ax1.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax1.set_ylabel("Feature space for the 2nd feature", fontsize=16)
ax1.set_zlabel("z", fontsize=16)

# Right panel: PCKMeans clusters.
colors2 = Labels2Color(X, pckm_labels)['colorcode']
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
ax2.scatter3D(X_Dim[:, 0], X_Dim[:, 1], X_Dim[:, 2]
           ,marker='o'
           ,s=4
           ,c=colors2
           )
ax2.set_title("PCKMeans with n_clusters = %d" % K, fontsize=18)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16)
ax2.set_zlabel("z", fontsize=16)

fig.suptitle(("The visualization of %s data clustering results" % prefix),
             fontsize=20, fontweight='bold')

fig.savefig(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".the visualization of %s data with WGCNA and ScC-WGCNA clustering results.png" % prefix)
            , bbox_inches='tight', dpi=1080)
fig.savefig(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".the visualization of %s data with WGCNA and ScC-WGCNA clustering results.pdf" % prefix)
            , bbox_inches='tight')
# plt.show()
# Close this figure explicitly so nothing stays open after the cell.
plt.close(fig)

### summary

In [None]:
# ---------------------------------------------------------------------------
# Configuration for the cross-dataset summary.
# ---------------------------------------------------------------------------
distance = "dcorr"  # similarity measure; "pearson" is the alternative noted by the author

# File-name tail shared by all datasets. It starts with a dot, so field 0 of the
# split (`prefix`) is the empty string here; the summary cells set `prefix` to
# each cancer name in turn.
filename = '.tcga_gtex.tpm.updown.feature_selection.csv'
# filename = '.tumor.tpm.updown.feature_selection.csv'
_name_fields = filename.split('.')
prefix = _name_fields[0]
_source_tag = _name_fields[1]

_project_root = "E:/Project/Project001 WGCNA/main"
data_indir = _project_root + "/step-3-FeatureSelection/outdir"
out_dir = _project_root + "/step-7-clustering/semi_supervised Kmeans/outdir"
mldir = _project_root + "/step-6-FactorAnalyzer/outdir" + "/" + _source_tag
labelsdir = _project_root + "/step-4-wgcna/outdir" + "/" + _source_tag
dcorr_dir = (_project_root
             + "/step-7-clustering/Calculate the correlation coefficient/outdir"
             + "/" + _source_tag)
outdir = out_dir + "/" + _source_tag + "/" + distance

# Make sure the output folder exists before the summary files are written.
_result_dir = os.path.join(outdir, prefix)
if not os.path.exists(_result_dir):
    os.makedirs(_result_dir)

# Datasets covered by the summary and the distance measures of the study.
cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD", "PAAD"]
distance_metrics = ["euclidean", "snn", "dcorr", "pearson"]

In [None]:
########################
####### summary  #######
########################

# Gather the per-dataset score boards into assess[method][metric][cancer].
assess = dict(wgcna = dict(SI = {}, CH = {}, DBI = {}, module_density_max = {})
              , pckmeans = dict(SI = {}, CH = {}, DBI = {}, module_density_max = {}))

for cancer in cancer_names:
    prefix = cancer
    # Table layout as read here: one column per method, one row per metric.
    temp = pd.read_csv(os.path.join(outdir, prefix, prefix+filename.rstrip('.csv')+".ScC-WGCNA_vs_WGCNA.assess.csv"), sep=',', index_col=0)
    for key1 in ['wgcna', 'pckmeans']:
        for key2 in ['SI', 'CH', 'DBI', 'module_density_max']:
            assess[key1][key2][cancer] = temp.loc[key2, key1]

# Maximum module density per dataset: one row per method, rounded to 4 decimals.
module_density_max = {}
module_density_max['WGCNA'] = assess['wgcna']['module_density_max']
module_density_max['ScC-WGCNA'] = assess['pckmeans']['module_density_max']
moduleDensity_max = pd.DataFrame.from_dict(module_density_max, orient= 'index').round(4)
moduleDensity_max.to_csv(os.path.join(outdir, distance+".module_density_max.csv"), index=True)


# Summary of the pairwise constraints of every dataset.
ml_summary = {'Number of gene':{}, 'Number of gene pair':{}, 'Number of constrained gene':{}}
for cancer in cancer_names:
    prefix = cancer
    # Read the constraint gene sets; the with-block guarantees the file is closed.
    # The last field of every line is discarded (presumably the "\n" left after a
    # trailing tab -- confirm against the writer of the file).
    with open(os.path.join(mldir, prefix, prefix+".ml.txt"), 'r') as df:
        geneslist = df.readline().split("\t")
        byt = df.readlines()
    geneslist.pop()
    ml_summary['Number of gene'][prefix] = len(geneslist)

    # gene -> position of its first occurrence in the header (what list.index
    # returned), built once instead of rescanning the list for every pair member.
    gene_position = {}
    for position, gene in enumerate(geneslist):
        gene_position.setdefault(gene, position)

    ml = set()
    for index, value in enumerate(byt):
        temp = value.split("\t")
        temp.pop()
        byt[index] = temp
        members = [gene_position[gene] for gene in temp]
        for i in range(len(members)):
            for j in range(i+1, len(members)):
                ml.add((members[i], members[j]))
    ml_summary['Number of gene pair'][prefix] = len(ml)
    print("Number of pairwise constraints:%d\n" % len(ml))

    # Genes that take part in at least one constraint pair.
    constrained = {position for pair in ml for position in pair}
    # NOTE: despite the key name, the stored value is the FRACTION of constrained
    # genes (count / number of genes, 4 decimals); the print shows the raw count.
    ml_summary['Number of constrained gene'][prefix] = round(len(constrained)/len(geneslist), 4)
    print("Number of constrained gene:%d\n" % len(constrained))

pd.DataFrame.from_dict(ml_summary, orient= 'index').to_csv(os.path.join(outdir, distance+".summary of pairwise constraints.csv"), index=True)

### Visualization of the evaluation scores

In [None]:
###########################################################
### Visualization of the evaluation scores  ###
###########################################################
# One panel per validation index, comparing WGCNA (red, solid) with ScC-WGCNA
# (green, dashed) across the datasets collected in `assess`.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(24, 8)

x = cancer_names

_panels = (
    (ax1, "SI", 'Silhouette Coefficient Score'),
    (ax2, "CH", 'Calinski Harabasz Score'),
    (ax3, "DBI", 'Davies Bouldin Score'),
)
for _panel, _metric, _axis_label in _panels:
    _wgcna_scores = assess['wgcna'][_metric]
    _pckm_scores = assess['pckmeans'][_metric]
    _panel.plot(_wgcna_scores.keys(), _wgcna_scores.values(), color = 'red'
                , linewidth = 1.0, linestyle = '-', label='WGCNA'
                , marker='d', markerfacecolor='red', markersize=10)
    _panel.plot(_pckm_scores.keys(), _pckm_scores.values(), color = 'green'
                , linewidth = 1.0, linestyle = '--', label='ScC-WGCNA'
                , marker='^', markerfacecolor='green', markersize=10)
    _panel.set_xlabel('datasets', fontdict={'size': 16})
    _panel.set_ylabel(_axis_label, fontdict={'size': 16})
    _panel.set_title(_metric, fontdict={'size': 18})
    _panel.legend(loc='best', fontsize = 12)
    # Dataset names as tick labels, tilted so they do not overlap.
    _panel.set_xticklabels(_wgcna_scores.keys(), rotation = 30, fontsize = 12)
    plt.setp(_panel.get_yticklabels(), fontsize=12)

plt.suptitle(("SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on sample data"), fontsize=20, fontweight='bold')

_scores_figure = os.path.join(outdir, distance+".SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on sample data")
fig.savefig(_scores_figure + ".png", bbox_inches='tight', dpi=1080)
fig.savefig(_scores_figure + ".pdf", bbox_inches='tight')
plt.show()

In [None]:
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
# 这两行代码解决 plt 中文显示的问题
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.cm as cm
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score # CH 指标，DBI 
from sklearn.metrics import silhouette_samples, silhouette_score # 轮廓系数
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cdist#cdist(XA, XB, 'correlation')
from dcor import distance_correlation as dcorr
from scipy.stats import pearsonr
#    1）输入：x为特征，y为目标变量.
#    2）输出：r： 相关系数 [-1，1]之间，p-value: p值。
#         注： p值越小，表示相关系数越显著，一般p值在500个样本以上时有较高的可靠性。
import time
import warnings
warnings.filterwarnings("ignore")
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')#日志只显示error级别的错误

from pckmeans import PCKMeans
from mpckmeans import MPCKMeans

##my
from labels2color import Labels2Color
#from topological_overlap_measure import TOMsimilarity
from topological_overlap_measure import *
#from soothsayer.networks import topological_overlap_measure

from my_functions import *


# Input / output directories. Each one ends with "/" because the cells below
# build file paths by plain string concatenation.
mldir = "E:/Project/Project001 WGCNA/main/step-6-FactorAnalyzer/outdir/"
data_indir = "E:/Project/Project001 WGCNA/main/step-3-FeatureSelection/outdir/"
labelsdir = "E:/Project/Project001 WGCNA/main/step-4-wgcna/outdir/"
out_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/outdir/"
dcorr_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/data/dcorr/"

# Dataset prefixes iterated by the cells below, and the known distance names.
cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD", "PAAD"]
distance_metrics = ["euclidean", "snn", "dcorr", "pearson"]

# Score accumulators, one independent empty list per metric (SI, CH, DBI).
score_wgcna = {metric: [] for metric in ("SI", "CH", "DBI")}
score_pckmeans = {metric: [] for metric in ("SI", "CH", "DBI")}
#score_kmeans = dict(SI = [], CH = [], DBI = [])

# Distance used for the similarity matrix; results go to a per-distance folder.
distance = "dcorr" #"pearson" #
outdir = f"{out_dir}{distance}/"

# Soft-threshold power per dataset prefix; starts empty.
powers = dict()
# if distance=="dcorr":
#     powers = {"KIRC": 3, "LIHC": 3, "LUSC": 3, "OV": 3, "PAAD":3, "STAD": 3}
# elif distance=="pearson":
#     powers = {"KIRC": 3, "LIHC": 4, "LUSC": 3, "OV": 2, "PAAD":3, "STAD": 7}
# else:
#     powers = {"KIRC": 6, "LIHC": 6, "LUSC": 6, "OV": 6, "PAAD":6, "STAD": 6}


### 调用R script计算软阈值power

</br>
def pickSoftThreshold(data, power=20, dataIsExpr=False, prefix="01", outdir=".", verbose = 5):</br>
    #=====================</br>
    #function 调用R script计算软阈值power</br>
    #=====================</br>
    import rpy2.robjects as robjects  # 导入R对象</br>
    from rpy2.robjects import pandas2ri</br>
    pandas2ri.activate()</br>
    #加载R包和R函数</br>
    robjects.r["source"]('D:\\xiaogy\\WGCNA.main\\step-7-clustering\\semi_supervised Kmeans\\step-7-0-pickSoftThreshold.R')</br>
    powers = robjects.IntVector(list(range(1, power+1)))</br>
    similarExpr = robjects.r['data.matrix'](data) #把data转变成矩阵 </br>
    sft = robjects.r['pickSoftThreshold'](similarExpr, dataIsExpr = dataIsExpr, powerVector = powers, prefix=prefix, outdir=outdir, verbose =verbose)#</br>
    sft = {"powerEstimate": sft.rx2("powerEstimate")[0], "fitIndices": pd.DataFrame(sft.rx2("fitIndices"), index=sft.rx2("fitIndices").names).T}</br>
    return(sft)</br>
</br>

prefix =  "KIRC"</br>
distance = "dcorr"</br>
sim = pd.read_csv(dcorr_dir+prefix+"/"+prefix+".TPM.updown.feature_selection.dcorr.csv", sep=',')</br>
sim.index = sim.columns</br>

power = 20</br>
sft = pickSoftThreshold(sim, power=power, prefix=prefix, outdir=outdir)</br>
#print(sft)</br>
powers[prefix] = sft["powerEstimate"]</br>
sft["fitIndices"].to_csv(outdir+prefix+"/"+prefix+"."+"sft_fitIndices.csv", index=False, sep=',')</br>

### 画n个癌症的聚类K值的学习曲线选取最佳K ###

In [None]:
### Learning curve over the cluster number K for each cancer dataset; pick the best K ###
# Every dataset in cancer_names is processed in turn: build the similarity
# matrix, convert it to a TOM dissimilarity, score the WGCNA reference
# partition, run PCKMeans over a range of K, then save and plot the scores.
#
# Results are accumulated in dicts keyed by dataset prefix:
#   score_wgcna[metric][prefix]    - score of the WGCNA partition
#   score_pckmeans[metric][prefix] - list of PCKMeans scores, one per tested K
#   xticks[prefix]                 - list of the tested K values
#   WGCNA_k[prefix]                - number of WGCNA clusters
#   default_K[prefix]              - tested K with the highest silhouette score
#   powers[prefix]                 - power estimate returned by pickSoftThreshold
# A dataset that raises anywhere in its iteration is reported and skipped.
# (A plain-KMeans baseline and a per-dataset K_sequence override used to be
# sketched here as disabled code; neither is active.)

score_wgcna = dict(SI = {}, CH = {}, DBI = {})
score_pckmeans = dict(SI = {}, CH = {}, DBI = {})
#score_kmeans = dict(SI = {}, CH = {}, DBI = {})
xticks = {}
WGCNA_k = {}
default_K = {}


def _read_must_link(path):
    """Build must-link index pairs from a tab-separated constraint file.

    The first line lists gene names; every following line lists the genes of
    one constraint group. The last tab-separated field of each line is
    discarded (it is treated as the line terminator).

    Returns a set of ``(i, j)`` tuples, one per pair of genes sharing a group,
    where ``i`` and ``j`` are positions in the first-line gene list.
    Raises ``KeyError`` if a group names a gene missing from the first line.
    """
    with open(path, 'r') as handle:  # closed even if parsing fails
        geneslist = handle.readline().split("\t")
        geneslist.pop()  # drop the trailing line-terminator field
        groups = []
        for line in handle:
            members = line.split("\t")
            members.pop()  # drop the trailing line-terminator field
            groups.append(members)

    # First-occurrence position of every gene: one dict lookup per gene
    # instead of a linear list.index() scan per pair.
    position = {}
    for idx, gene in enumerate(geneslist):
        position.setdefault(gene, idx)

    pairs = set()
    for members in groups:
        indices = [position[gene] for gene in members]
        for a in range(len(indices)):
            for b in range(a + 1, len(indices)):
                pairs.add((indices[a], indices[b]))
    return pairs


def _draw_learning_curve_panel(ax, k_values, pckm_scores, wgcna_k, wgcna_score, title, ylabel):
    """Plot PCKMeans scores against K on *ax* and mark the WGCNA score with a red star."""
    ax.plot(k_values, pckm_scores, color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA', marker='^', markerfacecolor='green', markersize=10)
    ax.scatter(wgcna_k, wgcna_score, s=100, c='r', marker='*', alpha=0.8)
    ax.set_xlabel('K', fontdict={'size': 16})
    ax.set_ylabel(ylabel, fontdict={'size': 16})
    ax.set_title(title, fontdict={'size': 18})
    ax.legend(loc='best')


for prefix in cancer_names:
    print("###==================== %s ====================###\n" % prefix)

    os.makedirs(outdir+prefix, exist_ok=True)

    try:
        data = pd.read_csv(data_indir+prefix+"/"+prefix+".tumor.tpm.updown.feature_selection.csv", sep=',', index_col=0)
        print("Input: data.shape", data.shape)
        print("NAN check for data: ", data.isnull().values.sum())

        # Similarity matrix between the rows of `data`.
        if distance=="pearson":
            sim = data.T.corr()
        elif distance=="snn":
            # NOTE(review): snn_sim_matrix is not defined in this cell and its
            # return type is not visible here. A non-DataFrame result is
            # wrapped so the DataFrame operations below work; confirm that
            # its row/column order follows data.index.
            sim = snn_sim_matrix(data, k=5)
            if not isinstance(sim, pd.DataFrame):
                sim = pd.DataFrame(sim, index=data.index, columns=data.index)
        elif distance=="dcorr":
            # Precomputed matrix; the CSV stores no index column.
            sim = pd.read_csv(dcorr_dir+prefix+"/"+prefix+".tumor.tpm.updown.feature_selection.dcorr.csv", sep=',')
            sim.index = sim.columns
        else:
            # Fail this dataset instead of silently reusing `sim` from a
            # previous iteration.
            raise ValueError("input parameter error：Undefined distance type.")

        print("NAN check: ", sim.isnull().values.sum())
        # Drop rows and columns that are entirely NaN, then zero-fill the rest.
        if sim.isnull().values.ravel().sum():
            sim.drop(sim.columns[sim.isnull().all(axis=1)].tolist(), inplace=True)
            sim = sim[sim.columns[~sim.isnull().all(axis=0)].tolist()]
            sim.fillna(0, inplace=True)
        print("After processing NA: sim.shape", sim.shape)
        sim.to_csv(outdir+prefix+"/"+prefix+"."+distance+".sim.csv", index=False, sep=',')

        # Soft-threshold power, estimated by pickSoftThreshold over powers 1..20.
        power = 20
        sft = pickSoftThreshold(sim, power=power, prefix=prefix, outdir=outdir)
        powers[prefix] = sft["powerEstimate"]
        sft["fitIndices"].to_csv(outdir+prefix+"/"+prefix+"."+"sft_fitIndices.csv", index=False, sep=',')

        # Adjacency by soft thresholding: |similarity| ** estimated power.
        ADJ = abs(sim)**powers[prefix]
        print("Input: ADJ", "Max: {}".format(ADJ.values.ravel().max()), "Min: {}".format(ADJ.values.ravel().min()), sep="\n")
        print("NAN check: ", ADJ.isnull().values.sum())
        ADJ.to_csv(outdir+prefix+"/"+prefix+"."+distance+".ADJ.csv", index=False, sep=',')

        # Topological overlap matrix and the dissimilarity used for clustering.
        TOM = TOMsimilarity(ADJ)
        dissTOM = 1 - TOM
        TOM.to_csv(outdir+prefix+"/"+prefix+"."+distance+".TOM.csv", index=False, sep=',')
        dissTOM.to_csv(outdir+prefix+"/"+prefix+"."+distance+".dissTOM.csv", index=False, sep=',')
        X = dissTOM

        # Must-link constraints for PCKMeans.
        # NOTE(review): the pair indices are positions in the header line of
        # the .ml.txt file, while PCKMeans receives np.array(X); this presumes
        # the header order matches the row order of X - confirm.
        ml = _read_must_link(mldir+prefix+"/"+prefix+".ml.txt")
        print("Constraints in pairs gene number:%d" % len(ml))

        ### Score the WGCNA partition as the reference ###
        wgcna_y_pred = pd.read_csv(labelsdir+prefix+"/"+prefix+".wgcna.result.csv", index_col=False)
        wgcna_y_pred.index = wgcna_y_pred["gene"].values
        wgcna_y_pred = wgcna_y_pred.loc[X.index]
        wgcna_labels = wgcna_y_pred["dynamicMods"].values
        # Number of clusters; a label of -1 is treated as noise and not counted.
        n_clusters_ = len(np.unique(wgcna_labels)) - (1 if -1 in wgcna_labels else 0)
        print("Number of clusters with WGCNA:%d" % n_clusters_)
        wgcna_k = n_clusters_
        WGCNA_k[prefix] = n_clusters_
        score_wgcna["SI"][prefix] = silhouette_score(X, wgcna_labels)
        score_wgcna["CH"][prefix] = calinski_harabasz_score(X, wgcna_labels)
        score_wgcna["DBI"][prefix] = davies_bouldin_score(X, wgcna_labels)

        ### PCKMeans over a range of K around the WGCNA cluster count ###
        score_pckmeans["SI"][prefix] = []
        score_pckmeans["CH"][prefix] = []
        score_pckmeans["DBI"][prefix] = []
        K_sequence = range(math.floor(wgcna_k*0.4), math.ceil(wgcna_k*1.2), 1)
        for k in K_sequence:
            print("    K = ", k)
            try:
                pckm = PCKMeans(n_clusters=k)
                pckm.fit(np.array(X), ml=ml)
                pckm_labels = pckm.labels_
                SI = silhouette_score(X, pckm_labels)
                CH = calinski_harabasz_score(X, pckm_labels)
                DBI = davies_bouldin_score(X, pckm_labels)
            except Exception as e:
                print("\n==============================")
                print("    Error processing %s" % k)
                print("    ", e)
                print("==============================\n")
                # NaN marks a failed K. A 0 placeholder could beat genuine
                # negative silhouette scores and would read as a perfect DBI.
                SI = CH = DBI = float("nan")
            score_pckmeans["SI"][prefix].append(SI)
            score_pckmeans["CH"][prefix].append(CH)
            score_pckmeans["DBI"][prefix].append(DBI)

        # Store the tested K values as a list so they can be concatenated below,
        # and do it before default_K looks them up.
        xticks[prefix] = list(K_sequence)
        si_scores = np.asarray(score_pckmeans["SI"][prefix], dtype=float)
        if si_scores.size == 0 or np.all(np.isnan(si_scores)):
            raise ValueError("No K value produced a valid PCKMeans clustering for %s" % prefix)
        # Best K = first tested K with the highest silhouette score.
        default_K[prefix] = xticks[prefix][int(np.nanargmax(si_scores))]

        # Learning-curve table; the last row holds the WGCNA reference.
        pd.DataFrame.from_dict({'xticks':xticks[prefix] + [WGCNA_k[prefix]]
                        ,'SI':score_pckmeans["SI"][prefix] + [score_wgcna["SI"][prefix]]
                        ,"CH":score_pckmeans["CH"][prefix] + [score_wgcna["CH"][prefix]]
                        ,"DBI":score_pckmeans["DBI"][prefix] + [score_wgcna["DBI"][prefix]]}
                       ).to_csv(outdir+prefix+"/"+"%s.Kvalue_learn-curve.csv" % prefix, index=False, sep=',')

        ### The visualization of the evaluation score ###
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
        try:
            fig.set_size_inches(18, 6)
            panels = ((ax1, "SI", 'Silhouette Coefficient Score'),
                      (ax2, "CH", 'Calinski Harabasz Score'),
                      (ax3, "DBI", 'Davies Bouldin Score'))
            for ax, metric, ylabel in panels:
                _draw_learning_curve_panel(ax, xticks[prefix], score_pckmeans[metric][prefix],
                                           WGCNA_k[prefix], score_wgcna[metric][prefix], metric, ylabel)
            fig.suptitle(("Learning curve for K values of ScC-WGCNA clustering on %s data" % prefix), fontsize=20, fontweight='bold')
            fig.savefig(outdir+prefix+"/"+prefix+".Learning curve for K values of ScC-WGCNA clustering on %s data.pdf" % prefix, dpi=100)
            fig.savefig(outdir+prefix+"/"+prefix+".Learning curve for K values of ScC-WGCNA clustering on %s data.png" % prefix, dpi=1080)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)

    except Exception as e:
        print("\n==============================")
        print("Error processing %s" % prefix)
        print(e)
        print("==============================\n")
        continue
    print("============================\n")


print(powers)
print(default_K)

In [None]:
# for prefix_index, prefix in enumerate(cancer_names):
#     prefix = prefix
#     print("%s==========>\n" % prefix)

#     ### The visualization of the  evaluation score  ###
#     fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
#     fig.set_size_inches(18, 6)

#     #xticks = range(len(score_pckmeans["SI"][prefix_index]))# K_sequence #
    
#     #ax1.plot(xticks[prefix], score_wgcna["SI"], color = 'red', linewidth = 1.0, linestyle = '-', label='WGCNA', marker='d', markerfacecolor='red', markersize=10)
#     ax1.plot(xticks[prefix], score_pckmeans["SI"][prefix], color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA', marker='^', markerfacecolor='green', markersize=10)
#     #ax1.plot(xticks[prefix], score_kmeans["SI"][prefix], color = 'blue', linewidth = 1.0, linestyle = '-.', label='KMeans', marker='o', markerfacecolor='blue', markersize=10)
#     ax1.scatter(WGCNA_k[prefix], score_wgcna["SI"][prefix], s=100, c='r', marker='*',alpha=0.8)
#     ax1.set_xlabel('K', fontdict={'size': 16})
#     ax1.set_ylabel('Silhouette Coefficient Score', fontdict={'size': 16})
#     ax1.set_title("SI", fontdict={'size': 18})
#     ax1.legend(loc='best')
#     #plt.yticks(range(0, 50, 5)) 
#     #plt.grid(True, linestyle='--', alpha=0.5)#添加网格
#     #在对应坐标处更换名称
#     #ax1.yticks([-2,-1,0,1,2],['really bad','b','c','d','good'])
    
#     #ax2.plot(x, score_wgcna["CH"], color = 'red', linewidth = 1.0, linestyle = '-', label='WGCNA', marker='d', markerfacecolor='red', markersize=10)
#     ax2.plot(xticks[prefix], score_pckmeans["CH"][prefix], color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA', marker='^', markerfacecolor='green', markersize=10)
#     #ax2.plot(xticks[prefix], score_kmeans["CH"][prefix], color = 'blue', linewidth = 1.0, linestyle = '-.', label='KMeans', marker='o', markerfacecolor='blue', markersize=10)
#     ax2.scatter(WGCNA_k[prefix], score_wgcna["CH"][prefix], s=100, c='r', marker='*',alpha=0.8)
#     ax2.set_xlabel('K', fontdict={'size': 16})
#     ax2.set_ylabel('Calinski Harabasz Score', fontdict={'size': 16})
#     ax2.set_title("CH", fontdict={'size': 18})
#     ax2.legend(loc='best')
    
#     #ax3.plot(x, score_wgcna["DBI"], color = 'red', linewidth = 1.0, linestyle = '-', label='WGCNA', marker='d', markerfacecolor='red', markersize=10)
#     ax3.plot(xticks[prefix], score_pckmeans["DBI"][prefix], color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA', marker='^', markerfacecolor='green', markersize=10)
#     #ax3.plot(xticks[prefix], score_kmeans["DBI"][prefix], color = 'blue', linewidth = 1.0, linestyle = '-.', label='KMeans', marker='o', markerfacecolor='blue', markersize=10)
#     ax3.scatter(WGCNA_k[prefix], score_wgcna["DBI"][prefix], s=100, c='r', marker='*',alpha=0.8)
#     ax3.set_xlabel('K', fontdict={'size': 16})
#     ax3.set_ylabel('Davies Bouldin Score', fontdict={'size': 16})
#     ax3.set_title("DBI", fontdict={'size': 18})
#     ax3.legend(loc='best')
    
#     ##plt.suptitle(("Learning curve for K values of ScC-WGCNA and KMeans clustering on %s data" % prefix), fontsize=14, fontweight='bold')
#     plt.suptitle(("Learning curve for K values of ScC-WGCNA clustering on %s data" % prefix), fontsize=20, fontweight='bold')
#     fig = plt.gcf()
#     ##fig.savefig(outdir+prefix+"/"+prefix+".Learning curve for K values of ScC-WGCNA and KMeans clustering on %s data.png" % prefix, dpi=1080)#
#     fig.savefig(outdir+prefix+"/"+prefix+".Learning curve for K values of ScC-WGCNA clustering on %s data.png" % prefix, dpi=1080)#
#     plt.show()


### 在n个数据集上对m种聚类算法用三种外部聚类评估指标进行对比评估 以学习曲线选取最佳K值设置K 可以设置共表达网络构建所用的距离度量方法

In [None]:
### Compare the clustering algorithms on the n datasets with three clustering evaluation metrics (SI, CH, DBI); K is meant to come from the learning curve; the distance metric used to build the co-expression network is configurable ###
### No exception handling in this cell ###

# Per-metric score lists, appended to once per dataset by the loop below.
score_wgcna = {"SI": [], "CH": [], "DBI": []}
score_pckmeans = {"SI": [], "CH": [], "DBI": []}
#score_kmeans = dict(SI = [], CH = [], DBI = [])

# Number of clusters per dataset: hard-coded to 5 for the "dcorr" and
# "pearson" distances, otherwise the WGCNA cluster counts are reused.
if distance in ("dcorr", "pearson"):
    K = dict.fromkeys(["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD", "PAAD"], 5)
else:
    K = WGCNA_k

max_density = list()
for prefix in cancer_names:
    #prefix = "KIRC"
    prefix = prefix
    print("###==================== %s ====================###\n" % prefix)
    
    if not os.path.exists(outdir+prefix):
        os.makedirs(outdir+prefix)
        

    data = pd.read_csv(data_indir+prefix+"/"+prefix+".tumor.tpm.updown.feature_selection.csv", sep=',', index_col=0)
    print("Input: data.shape", data.shape)
    print("NAN check for data: ", data.isnull().values.sum())
    '''
    if distance=="Pearson":
        similarity = data.T.corr()#dcor## # 计算相关系数，得到一个矩阵
    elif distance=="snn":   
        snn_sim = snn_sim_matrix(data, k=5)
    elif distance=="dcorr":
        similarity = pd.read_csv(dcorr_dir+prefix+"/"+prefix+".tumor.tpm.updown.feature_selection.dcorr.csv", index=False, sep=',')
    else:
        print("input parameter error：Undefined distance type.")

    sim = pd.DataFrame(similarity, index=data.index, columns=data.index)
    print("NAN check: ", sim.isnull().values.sum())
    #先删除全是空值的行和列
    if sim.isnull().values.ravel().sum():
        sim.drop(sim.columns[sim.isnull().all(axis=1)].tolist(), inplace=True)#删除全是NAN的行
        sim = sim[sim.columns[~sim.isnull().all(axis=0)].tolist()]#删除全是NAN的列
        #再用0填充NAN
        sim.fillna(0, inplace=True)#用0填补缺失值
    print("After processing NA: sim.shape", sim.shape)
    sim.to_csv(outdir+prefix+"/"+prefix+"."+distance+"_sim.csv", index=False, sep=',')

    #Check scale free topology
    # Create an adjacency network； ^6 近似无标度化
    # here we define the adjacency matrix using soft thresholding with beta=6
    ADJ = abs(sim)**powers[prefix] # 近似无标度化
    print("Input: ADJ", "Max: {}".format(ADJ.values.ravel().max()), "Min: {}".format(ADJ.values.ravel().min()), sep="\n")
    print("NAN check: ", ADJ.isnull().values.sum())
    #ADJ.to_csv("./outdir/"+Inputfile+"."+distance+".ADJ.csv", index=False, sep=',')

    values = -np.sort(-(ADJ.sum().values))# 包含每个柱子对应值的序列
    index = sim.index# 包含每个柱子下标的序列    
    draw_bar(values)

    # Compute topological overlap
    #from topological_overlap_measure import TOMsimilarity
    from topological_overlap_measure import *
    #from soothsayer.networks import topological_overlap_measure
    TOM = TOMsimilarity(ADJ)#把邻接矩阵转换为拓扑重叠矩阵topological_overlap_measure
    dissTOM = 1 - TOM

    TOM.to_csv(outdir+prefix+"/"+prefix+"."+distance+".TOM.csv", index=False, sep=',')
    dissTOM.to_csv(outdir+prefix+"/"+prefix+"."+distance+".dissTOM.csv", index=False, sep=',')
    '''
    
    ADJ = pd.read_csv(outdir+prefix+"/"+prefix+"."+distance+".ADJ.csv", sep=',')
    ADJ.index = ADJ.columns
    #计算邻接性
    Connectivity = ADJ.values
    row, col = np.diag_indices_from(Connectivity)
    Connectivity[row, col] = 0
    Connectivity = pd.DataFrame(Connectivity, columns=ADJ.columns)
    Connectivity.to_csv(outdir+prefix+"/"+prefix+"."+distance+".Connectivity.csv", index=False, sep=',')
    
    TOM = pd.read_csv(outdir+prefix+"/"+prefix+"."+distance+"_TOM.csv", sep=',')
    TOM.index = TOM.columns 
    
    dissTOM = pd.read_csv(outdir+prefix+"/"+prefix+"."+distance+".dissTOM.csv", sep=',')
    dissTOM.index = dissTOM.columns
    
    X = dissTOM
    
    try:
        # 使用TSNE进行降维处理。
        tsne = TSNE(n_components=2, learning_rate=100, random_state=0).fit_transform(TOM)
    except Exception as e:
        print("\n==============================")
        print("Error occurs in %s process of %s" % (tsne, prefix))
        print(e)        
        print("==============================\n")
        tsne = TSNE(n_components=2, learning_rate=100, random_state=0).fit_transform(ADJ)
    
    # 读入约束基因集，构建成对约束
    df = open(mldir+prefix+"/"+prefix+".ml.txt",'r')
    geneslist = df.readline().split("\t")
    geneslist.pop()#去掉最后的换行符
    ml = set()
    byt = df.readlines()
    for index,value in enumerate(byt):
        temp = value.split("\t")
        temp.pop()#默认删除最后一个元素，并返回值
        #temp.remove("\n")#删除元素temp
        byt[index] = temp
        for i in range(len(temp)):
            for j in range(i+1, len(temp)):
                ml.add((geneslist.index(temp[i]),geneslist.index(temp[j])))
    print("Constraints in pairs gene number:%d\n" % len(ml))

    ### 可视化WGCNA聚类结果作为参考 ###
    print("WGCNA==========>")
    wgcna_y_pred = pd.read_csv(labelsdir+prefix+"/"+prefix+".wgcna.result.csv", index_col=False)
    wgcna_y_pred.index = wgcna_y_pred["gene"].values
    wgcna_y_pred = wgcna_y_pred.loc[X.index]
    wgcna_labels = wgcna_y_pred["dynamicMods"].values
    print("value_counts:\n", pd.DataFrame(pd.value_counts(wgcna_labels)).T)
    labels_counts = pd.DataFrame(pd.value_counts(wgcna_labels))
    labels_counts.insert(0, "clusters", labels_counts.index)
    labels_counts.columns = ["clusters", "counts"]
    labels_counts.to_csv(outdir+prefix+"/"+prefix+".WGCNA_clustering"+".labels_counts.csv", index=False)
    pd.DataFrame(np.array([X.index, wgcna_labels]).T, columns=["gene", "labels"]).to_csv(outdir+prefix+"/"+prefix+".WGCNA_clustering"+".cluster_labels.csv", index=False)#, header=None

    unique_labels = np.unique(wgcna_labels)
    ##分类个数：lables中包含-1，表示噪声点
    n_clusters_ =len(np.unique(wgcna_labels)) - (1 if -1 in wgcna_labels else 0) 
    print("Number of clusters with WGCNA:%d" % n_clusters_)
    wgcna_k = n_clusters_

    SI = silhouette_score(X, wgcna_labels)
    CH = calinski_harabasz_score(X, wgcna_labels)
    DBI = davies_bouldin_score(X, wgcna_labels)
    score_wgcna["SI"].append(SI)
    score_wgcna["CH"].append(CH)
    score_wgcna["DBI"].append(DBI)
    print("For n_clusters =", n_clusters_,
            "\nThe average silhouette_score is :", SI)
    print("Calinski-Harabasz Score",  CH,
             "\nDavies Bouldin score is :", DBI)

    sample_silhouette_values = silhouette_samples(X, wgcna_labels)

    plt.clf() # 使用 plt.clf() 清理掉 axes
    # fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    plt.rcParams.update({'font.family': 'Times New Roman'})
    plt.rcParams.update({'font.weight': 'normal'})
    plt.rcParams.update({'font.size': 20})
    
    ax1=fig.add_subplot(121)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, X.shape[0] + (n_clusters_ + 1) * 10])


    y_lower = 10
    for i in range(n_clusters_):
        ith_cluster_silhouette_values = sample_silhouette_values[wgcna_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i)/n_clusters_)
        ax1.fill_betweenx(np.arange(y_lower, y_upper)
                         ,ith_cluster_silhouette_values
                         ,facecolor=color
                         ,alpha=0.7
                         )
        ax1.text(-0.05
                 , y_lower + 0.5 * size_cluster_i
                 , str(i))
        y_lower = y_upper + 10
    ax1.set_title("The Silhouette plot for the various clusters.", fontsize=18)
    ax1.set_xlabel("The Silhouette coefficient values", fontsize=16)
    ax1.set_ylabel("Cluster label", fontsize=16)
    ax1.axvline(x=SI, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    #colors = cm.nipy_spectral(wgcna_labels.astype(float) / n_clusters_)
    colors = Labels2Color(X, wgcna_labels)['colorcode']
    ax2 = fig.add_subplot(1, 2, 2, projection='3d') 
    ax2.scatter3D(tsne[:, 0], tsne[:, 1],tsne[:, 2]
               ,marker='o'
               ,s=4
               ,c=colors
               )

    ax2.set_title("The visualization of the clustered data.", fontsize=18)
    ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
    ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16)
    ax2.set_zlabel("z", fontsize=20)
    plt.suptitle(("Silhouette analysis for WGCNA clustering on %s data with n_clusters = %d" % (prefix, wgcna_k)),
                 fontsize=20, fontweight='bold')
    fig = plt.gcf()
    fig.savefig(outdir+prefix+"/"+prefix+".Silhouette analysis for WGCNA clustering on %s data.png" % prefix, bbox_inches='tight', dpi=1080)
    fig.savefig(outdir+prefix+"/"+prefix+".Silhouette analysis for WGCNA clustering on %s data.pdf" % prefix, bbox_inches='tight', dpi=100)
    #plt.show()

    ### 测试聚类效果 ###
    print("\nPCKMeans==========>")

    t1 = time.time()
    pckm = PCKMeans(n_clusters=K[prefix])#, distance_type='euclidean'
    pckm.fit(np.array(X), ml=ml)#
    t2 = time.time()
    print ("the time of clustering is %.5fs" % (t2 - t1))
    pckm_labels = pckm.labels_ 

    
    print("value_counts:\n", pd.DataFrame(pd.value_counts(pckm_labels)).T)
    labels_counts = pd.DataFrame(pd.value_counts(pckm_labels))
    labels_counts.insert(0, "clusters", labels_counts.index)
    labels_counts.columns = ["clusters", "counts"]
    labels_counts.to_csv(outdir+prefix+"/"+prefix+"."+distance+".PCKMeans_clustering"+".labels_counts.csv", index=False)
    pd.DataFrame(np.array([X.index, pckm_labels]).T, columns=["gene", "labels"]).to_csv(outdir+prefix+"/"+prefix+"."+distance+".PCKMeans_clustering"+".cluster_labels.csv", index=False)#, header=None
    unique_labels = np.unique(pckm_labels)
    n_clusters_ =len(np.unique(pckm_labels)) - (1 if -1 in pckm_labels else 0)
 
    #每个模块的基因数条形图
    width = 0.35# 柱子的宽度
    plt.clf() # 使用 plt.clf() 清理掉 axes
    fig = plt.figure(figsize=(6, 6), dpi=200)# 创建画布, 并设置分辨率为 80像素/每英寸
    plt.subplot(111)# 创建一个子图
    plt.title("Number of genes in the module", fontsize=18)
    plt.xlabel("Module", fontsize=16)
    plt.ylabel("Number of genes", fontsize=16)
    plt.bar(labels_counts.loc[:, "clusters"], labels_counts.loc[:, "counts"], width = width, color="#87CEFA") # 绘制柱状图, 每根柱子的颜色为紫罗兰色
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".Number of genes in the module.png", bbox_inches='tight', dpi=1080)  # 保存图片，如果不设置 bbox_inches='tight'，保存的图片有可能显示不全
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".Number of genes in the module.pdf", bbox_inches='tight', dpi=200)  # 保存图片，如果不设置 bbox_inches='tight'，保存的图片有可能显示不全
    pd.DataFrame.from_dict({"clusters":labels_counts.loc[:, "clusters"], "counts":labels_counts.loc[:, "counts"]}).to_csv(outdir+prefix+"/"+prefix+"."+distance+".Number of genes in the module.csv", index=False, sep=',')
    
    
    # Mean within-module connectivity: the sum of all Connectivity entries among
    # the genes of a module, divided by the number of genes in that module.
    module_ids = np.sort(np.unique(pckm_labels))
    module_masks = [pckm_labels == k for k in module_ids]
    values = [sum(Connectivity.loc[mask, mask].sum()) / sum(mask) for mask in module_masks]
    width = 0.35  # bar width
    plt.clf()  # clear the current figure's axes before drawing the next chart
    plt.subplot(111)  # single subplot
    plt.title("Mean Connectivity of module", fontsize=18)
    plt.xlabel("Module", fontsize=16)
    plt.ylabel("Mean Connectivity", fontsize=16)
    plt.bar(module_ids, values, width=width, color="#87CEFA")  # light-sky-blue bars
    # Common "<outdir><prefix>/<prefix>.<distance>" prefix of the three output files.
    connectivity_base = outdir+prefix+"/"+prefix+"."+distance
    fig.savefig(connectivity_base+".Mean_Connectivity_of_module.png", dpi=1080, bbox_inches='tight')
    fig.savefig(connectivity_base+".Mean_Connectivity_of_module.pdf", dpi=200, bbox_inches='tight')
    pd.DataFrame({'pckm_labels': module_ids, 'values': values}).to_csv(
        connectivity_base+".Mean_Connectivity_of_module.csv", index=False, sep=',')
    
    
    # Module density: the sum of all Connectivity entries among the genes of a
    # module divided by n*(n-1), where n is the number of genes in the module.
    module_ids = np.sort(np.unique(pckm_labels))
    values = []
    for k in module_ids:
        in_module = pckm_labels == k
        n_genes = int(in_module.sum())
        n_pairs = n_genes * (n_genes - 1)
        if n_pairs == 0:
            # A single-gene module has no gene pairs, so its density is undefined.
            # Record NaN explicitly instead of dividing by zero under suppressed warnings.
            values.append(np.nan)
        else:
            values.append(sum(Connectivity.loc[in_module, in_module].sum()) / n_pairs)
    # Builtin max() gives an order-dependent answer when the list contains NaN;
    # nanmax ignores undefined densities.
    max_density.append(np.nanmax(values))
    width = 0.35  # bar width
    plt.clf()  # clear the current figure's axes before drawing the next chart
    plt.subplot(111)  # single subplot
    plt.title("Density of module", fontsize=18)
    plt.xlabel("Module", fontsize=16)
    plt.ylabel("Density", fontsize=16)
    plt.bar(module_ids, values, width=width, color="#87CEFA")  # light-sky-blue bars
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".density_of_module.png", dpi=1080, bbox_inches='tight')
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".density_of_module.pdf", dpi=200, bbox_inches='tight')
    pd.DataFrame.from_dict({'pckm_labels': module_ids, 'values': values}).to_csv(outdir+prefix+"/"+prefix+"."+distance+".density_of_module.csv", index=False, sep=',')
    
    
    # Score the PCKMeans partition with three internal validation indices and
    # append each score to the running per-dataset lists in score_pckmeans.
    SI = silhouette_score(X, pckm_labels)
    CH = calinski_harabasz_score(X, pckm_labels)
    DBI = davies_bouldin_score(X, pckm_labels)
    for metric_name, metric_value in (("SI", SI), ("CH", CH), ("DBI", DBI)):
        score_pckmeans[metric_name].append(metric_value)
    print("For n_clusters =", n_clusters_, "\nThe average silhouette_score is :", SI)
    print("Calinski-Harabasz Score", CH, "\nDavies Bouldin score is :", DBI)

    # Per-sample silhouette coefficients, consumed by the silhouette plot.
    sample_silhouette_values = silhouette_samples(X, pckm_labels)


    # Two-panel figure for the PCKMeans result: per-cluster silhouette profile on
    # the left, 3-D scatter of the `tsne` coordinates on the right.
    plt.clf() # clear the axes of the figure that is current at this point
    fig=plt.figure()   
    #fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7) 
    # rcParams is global matplotlib state, so these font settings also apply to
    # every figure drawn after this point.
    plt.rcParams.update({'font.family': 'Times New Roman'})
    plt.rcParams.update({'font.weight': 'normal'})
    plt.rcParams.update({'font.size': 20})
    
    ax1=fig.add_subplot(121)
    ax1.set_xlim([-0.1, 1])
    # Vertical extent: one unit per sample plus a 10-unit gap around each cluster band.
    ax1.set_ylim([0, X.shape[0] + (n_clusters_ + 1) * 10])

    y_lower = 10
    # Clusters are addressed as 0 .. n_clusters_-1; samples whose label falls
    # outside that range get no band.
    for i in range(n_clusters_):
        ith_cluster_silhouette_values = sample_silhouette_values[pckm_labels == i]
        ith_cluster_silhouette_values.sort()  # ascending, in place
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Colour taken from the nipy_spectral colormap at position i / n_clusters_.
        color = cm.nipy_spectral(float(i)/n_clusters_)
        ax1.fill_betweenx(np.arange(y_lower, y_upper)
                         ,ith_cluster_silhouette_values
                         ,facecolor=color
                         ,alpha=0.7
                         )
        # Write the cluster number at the vertical middle of its band.
        ax1.text(-0.05
                 , y_lower + 0.5 * size_cluster_i
                 , str(i))
        y_lower = y_upper + 10  # leave a 10-unit gap before the next band
    ax1.set_title("The Silhouette plot for the various clusters.", fontsize=18)
    ax1.set_xlabel("The Silhouette coefficient values", fontsize=16)
    ax1.set_ylabel("Cluster label", fontsize=16)
    # Dashed red vertical line at the overall score SI.
    ax1.axvline(x=SI, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    #colors = cm.nipy_spectral(pckm_labels.astype(float) / n_clusters_)
    # Only the 'colorcode' entry of the Labels2Color result is used
    # (presumably one colour per sample -- confirm in labels2color).
    colors = Labels2Color(X, pckm_labels)['colorcode']
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')
    # The first three columns of `tsne` are used as x, y and z.
    ax2.scatter3D(tsne[:, 0], tsne[:, 1],tsne[:, 2]
               ,marker='o'
               ,s=4
               ,c=colors
               )

    ax2.set_title("The visualization of the clustered data.", fontsize=18)
    ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
    ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16, rotation=38) # rotation=38 rotates the y-axis label by 38 degrees
    ax2.set_zlabel('Z')
    plt.suptitle(("Silhouette analysis for ScC-WGCNA clustering on %s data with n_clusters = %d" % (prefix, n_clusters_)),
                 fontsize=20, fontweight='bold')
    fig = plt.gcf()
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".Silhouette analysis for ScC-WGCNA clustering on %s data.png" % prefix, bbox_inches='tight', dpi=1080)  # without bbox_inches='tight' the saved image may be cropped
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".Silhouette analysis for ScC-WGCNA clustering on %s data.pdf" % prefix, bbox_inches='tight', dpi=200)  # without bbox_inches='tight' the saved image may be cropped
    #plt.show()

    '''
    ### 测试KMeans聚类效果 ###
    #from sklearn.cluster import KMeans

    print("\nKMeans==========>")
    t1 = time.time()
    km = KMeans(n_clusters=K[prefix])
    km.fit(X)
    t2 = time.time()
    print ("the time of clustering is %.5fs" % (t2 - t1))
    km_labels = km.labels_
    print("value_counts:\n", pd.DataFrame(pd.value_counts(km_labels)).T)
    labels_counts = pd.DataFrame(pd.value_counts(km_labels))
    labels_counts.insert(0, "clusters", labels_counts.index)
    labels_counts.columns = ["clusters", "counts"]
    labels_counts.to_csv(outdir+prefix+"/"+prefix+".KMeans_clustering"+".labels_counts.csv", index=False)
    pd.DataFrame(np.array([X.index, km_labels]).T, columns=["gene", "labels"]).to_csv(outdir+prefix+"/"+prefix+".KMeans_clustering"+".cluster_labels.csv", index=False)#, header=None
    unique_labels = np.unique(km_labels)
    n_clusters_ =len(np.unique(km_labels)) - (1 if -1 in km_labels else 0)

    SI = silhouette_score(X, km_labels)
    CH = calinski_harabasz_score(X, km_labels)
    DBI = davies_bouldin_score(X, km_labels)
    score_kmeans["SI"].append(SI)
    score_kmeans["CH"].append(CH)
    score_kmeans["DBI"].append(DBI)
    print("For n_clusters =", n_clusters_,
            "\nThe average silhouette_score is :", SI)
    print("Calinski-Harabasz Score",  CH,
             "\nDavies Bouldin score is :", DBI)


    sample_silhouette_values = silhouette_samples(X, km_labels)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, X.shape[0] + (n_clusters_ + 1) * 10])

    y_lower = 10
    for i in range(n_clusters_):
        ith_cluster_silhouette_values = sample_silhouette_values[km_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i)/n_clusters_)
        ax1.fill_betweenx(np.arange(y_lower, y_upper)
                         ,ith_cluster_silhouette_values
                         ,facecolor=color
                         ,alpha=0.7
                         )
        ax1.text(-0.05
                 , y_lower + 0.5 * size_cluster_i
                 , str(i))
        y_lower = y_upper + 10
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=SI, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    #colors = cm.nipy_spectral(pckm_labels.astype(float) / n_clusters_)
    colors = Labels2Color(X, km_labels)['colorcode']
    ax2.scatter(tsne[:, 0], tsne[:, 1]
               ,marker='o'
               ,s=6
               ,c=colors
               )

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(("Silhouette analysis for KMeans clustering on %s data with n_clusters = %d" % (prefix, n_clusters_)),
                 fontsize=14, fontweight='bold')
    fig = plt.gcf()
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".Silhouette analysis for KMeans clustering on %s data.png" % prefix, dpi=1080)#
    #plt.show()
    '''

    ### The visualization of the clustered data ###
    # Side-by-side 3-D scatter plots of the same `tsne` coordinates, coloured by
    # the WGCNA labels (left) and by the PCKMeans labels (right).
    plt.clf() # clear the axes of the current figure
    # NOTE: no new figure is created here; `fig` is whatever figure object was
    # bound to that name before this point, resized and reused.
    #fig = plt.figure(figsize=plt.figaspect(0.5))
    #fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
    #fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 6)

    # Only the 'colorcode' entry of the Labels2Color result is used
    # (presumably one colour per sample -- confirm in labels2color).
    colors1 = Labels2Color(X, wgcna_labels)['colorcode']
    # set up the axes for the first plot
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax1.scatter3D(tsne[:, 0], tsne[:, 1], tsne[:, 2]
               ,marker='o'
               ,s=4
               ,c=colors1
               )
    ax1.set_title("WGCNA with n_clusters = %d" % wgcna_k, fontsize=18)
    ax1.set_xlabel("Feature space for the 1st feature", fontsize=16)
    ax1.set_ylabel("Feature space for the 2nd feature", fontsize=16)
    ax1.set_zlabel("z", fontsize=16)
    
    colors2 = Labels2Color(X, pckm_labels)['colorcode']
    # set up the axes for the second plot
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')
    ax2.scatter3D(tsne[:, 0], tsne[:, 1], tsne[:, 2]
               ,marker='o'
               ,s=4
               ,c=colors2
               )
    # The title reports K[prefix], the value looked up for this dataset.
    ax2.set_title("PCKMeans with n_clusters = %d" % K[prefix], fontsize=18)
    ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
    ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16)
    ax2.set_zlabel("z", fontsize=16)

    # The string literal below is a disabled third (KMeans) panel; it is never executed.
    '''
    colors3 = Labels2Color(X, km_labels)['colorcode']
    ax3.scatter(tsne[:, 0], tsne[:, 1]
               ,marker='o'
               ,s=6
               ,c=colors3
               )
    ax3.set_title("KMeans with n_clusters = %d" % K[prefix])
    ax3.set_xlabel("Feature space for the 1st feature")
    ax3.set_ylabel("Feature space for the 2nd feature")
    '''
    plt.suptitle(("The visualization of %s data clustering results" % prefix),
                 fontsize=20, fontweight='bold')
    fig = plt.gcf()
    #fig.savefig(outdir+prefix+"/"+prefix+".the visualization of %s data with WGCNA, ScC-WGCNA and KMeans clustering results.png" % prefix, dpi=1080)#
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".the visualization of %s data with WGCNA and ScC-WGCNA clustering results.png" % prefix, bbox_inches='tight', dpi=1080)
    fig.savefig(outdir+prefix+"/"+prefix+"."+distance+".the visualization of %s data with WGCNA and ScC-WGCNA clustering results.pdf" % prefix, bbox_inches='tight', dpi=100)
    plt.show()
    
    print("================================================\n")

### Visualization of the evaluation scores ###
# One panel per index (SI, CH, DBI); each panel plots the WGCNA and PCKMeans
# scores against the dataset names. A KMeans series is not plotted.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(24, 8)

x = cancer_names
score_panels = (
    (ax1, "SI", 'Silhouette Coefficient Score'),
    (ax2, "CH", 'Calinski Harabasz Score'),
    (ax3, "DBI", 'Davies Bouldin Score'),
)
for panel_ax, metric_key, metric_ylabel in score_panels:
    panel_ax.plot(x, score_wgcna[metric_key], color='red', linewidth=1.0, linestyle='-',
                  label='WGCNA', marker='d', markerfacecolor='red', markersize=10)
    panel_ax.plot(x, score_pckmeans[metric_key], color='green', linewidth=1.0, linestyle='--',
                  label='PCKMeans', marker='^', markerfacecolor='green', markersize=10)
    panel_ax.set_xlabel('datasets', fontdict={'size': 16})
    panel_ax.set_ylabel(metric_ylabel, fontdict={'size': 16})
    panel_ax.set_title(metric_key, fontdict={'size': 18})
    panel_ax.legend(loc='best')

plt.suptitle("SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on sample data",
             fontsize=20, fontweight='bold')
fig = plt.gcf()
# Save the same figure as PNG and PDF under "<outdir><distance>.<title>".
fig.savefig(outdir+distance+".SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on sample data.png",
            bbox_inches='tight', dpi=1080)
fig.savefig(outdir+distance+".SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on sample data.pdf",
            bbox_inches='tight', dpi=100)
plt.show()


# Save the largest module density recorded for each dataset as a single line;
# every value, including the last, is followed by a tab.
with open(outdir+distance+".max_density.txt", "w") as OUT:
    OUT.write("".join(str(temp) + "\t" for temp in max_density))
    OUT.write("\n")

In [None]:
# Plot, for the WGCNA modules, the gene-count chart, the mean module
# connectivity chart and the module density chart.

# Input/output locations used by this cell.
labelsdir = "E:/Project/Project001 WGCNA/main/step-4-wgcna/outdir/"
# NOTE(review): data_dir is assigned here but the statements in this cell read
# adjacency files through `datdir` instead -- confirm which one is intended.
data_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/outdir_updown/pearson/"
outdir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/outdir_updown/wgcna/"

# Dataset prefixes processed by the loop in this cell.
cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD",  "PAAD"]#

# Disabled initialisation kept for reference; none of these lines run, so
# `distance` and `datdir` are NOT set by this cell.
# score_wgcna = dict(SI = [], CH = [], DBI = [])
# score_pckmeans = dict(SI = [], CH = [], DBI = [])
# #score_kmeans = dict(SI = [], CH = [], DBI = [])
# distance = "pearson" #"dcorr" #
# datdir = data_dir+distance+"/"

# Collects one maximum module density per dataset.
max_density = []
fig = plt.figure(figsize=(6, 6), dpi=100)# create a 6x6 inch canvas at 100 dpi; reused for every chart
def _save_module_bar(figure, modules, heights, ylabel, path, title=None, width=0.35):
    """Redraw *figure* as a single bar chart and save it to *path*.

    Parameters:
        figure: matplotlib figure to clear and draw on.
        modules: x positions / labels, one per module.
        heights: bar heights, aligned with *modules*.
        ylabel: text of the y-axis label.
        path: output file path; saved at 1080 dpi with a tight bounding box.
        title: optional axes title; omitted when None.
        width: bar width.
    """
    # Draw on the figure that is saved, rather than on whichever figure
    # happens to be pyplot's current one.
    figure.clf()
    axes = figure.add_subplot(111)
    if title is not None:
        axes.set_title(title, fontsize=18)
    axes.set_xlabel("Module", fontsize=16)
    axes.set_ylabel(ylabel, fontsize=16)
    axes.bar(modules, heights, width, color="#87CEFA")  # light-sky-blue bars
    figure.savefig(path, dpi=1080, bbox_inches='tight')


# For every dataset: load the WGCNA module labels and the adjacency matrix,
# save three bar charts (module size, mean within-module connectivity, module
# density) and record the largest module density.
#
# NOTE(review): `datdir` and `distance` are not assigned by any executed
# statement of this cell, so this loop relies on values left over from a
# previously run cell -- confirm they point at the adjacency files intended
# for the WGCNA plots.
for prefix in cancer_names:
    print("###==================== %s ====================###\n" % prefix)

    # exist_ok replaces the racy "check, then create" sequence.
    os.makedirs(outdir+prefix, exist_ok=True)

    wgcna_y_pred = pd.read_csv(labelsdir+prefix+"/"+prefix+".wgcna.result.csv", index_col=False)#KIRC.wgcna.result
    wgcna_labels = wgcna_y_pred["dynamicMods"].values
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    module_sizes = pd.Series(wgcna_labels).value_counts()
    print("value_counts:\n", pd.DataFrame(module_sizes).T)
    # A label of -1, if present, is not counted as a module.
    n_clusters_ = len(np.unique(wgcna_labels)) - (1 if -1 in wgcna_labels else 0)
    labels_counts = pd.DataFrame(module_sizes)
    labels_counts.insert(0, "clusters", labels_counts.index)
    labels_counts.columns = ["clusters", "counts"]

    ADJ = pd.read_csv(datdir+prefix+"/"+prefix+"."+distance+".ADJ.csv", sep=',')
    ADJ.index = ADJ.columns
    # Connectivity is the adjacency matrix with its diagonal (self-connections)
    # set to zero. Work on an explicit copy: the array returned by .values may be
    # a view of ADJ, and can be read-only under pandas copy-on-write.
    Connectivity = ADJ.to_numpy(copy=True)
    np.fill_diagonal(Connectivity, 0)
    Connectivity = pd.DataFrame(Connectivity)

    # Per-module statistics, computed in one pass over the modules:
    #   mean connectivity = within-module connectivity sum / n
    #   density           = within-module connectivity sum / (n * (n - 1))
    module_ids = np.sort(np.unique(wgcna_labels))
    mean_connectivity = []
    density = []
    for k in module_ids:
        in_module = wgcna_labels == k
        n_genes = int(in_module.sum())
        within_sum = sum(Connectivity.loc[in_module, in_module].sum())
        mean_connectivity.append(within_sum / n_genes)
        n_pairs = n_genes * (n_genes - 1)
        # A single-gene module has no gene pairs; its density is undefined (NaN)
        # rather than the result of a division by zero.
        density.append(within_sum / n_pairs if n_pairs else np.nan)
    # Builtin max() is order-dependent when NaN is present; nanmax skips NaN.
    max_density.append(np.nanmax(density))

    # Number of genes per module.
    _save_module_bar(fig, labels_counts.loc[:, "clusters"], labels_counts.loc[:, "counts"],
                     "Number of genes",
                     outdir+prefix+"/"+prefix+".wgcna.Number of genes in the module.png")

    # Mean within-module connectivity.
    _save_module_bar(fig, module_ids, mean_connectivity,
                     "Mean Connectivity",
                     outdir+prefix+"/"+prefix+".wgcna.Mean_Connectivity_of_module.png",
                     title="Mean Connectivity of module")

    # Module density.
    _save_module_bar(fig, module_ids, density,
                     "Density of module",
                     outdir+prefix+"/"+prefix+".wgcna.density_of_module.png")

# Save the largest WGCNA module density recorded for each dataset as a single
# line; every value, including the last, is followed by a tab.
with open(outdir+"wgcna.max_density.txt", "w") as OUT:
    OUT.write("".join(str(temp) + "\t" for temp in max_density))
    OUT.write("\n")
