In [None]:
# -*- coding: UTF-8 -*-
import os,re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.rcParams['axes.unicode_minus'] = False#使用上标小标小一字号
plt.rcParams['font.sans-serif']=['Times New Roman']
# Global font: replace 'Times New Roman' above with any installed font as needed.
# To render Chinese text, use the SimHei font as the global font instead:
#plt.rcParams['font.sans-serif']=['SimHei']

import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score # CH 指标，DBI 
from sklearn.metrics import silhouette_samples, silhouette_score # 轮廓系数
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cdist#cdist(XA, XB, 'correlation')
from dcor import distance_correlation as dcorr
from scipy.stats import pearsonr
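# pearsonr(x, y):
#    1) Input: x is the feature, y is the target variable.
#    2) Output: r, the correlation coefficient in [-1, 1], and the p-value.
#       Note: the smaller the p-value, the more significant the correlation; the
#       p-value is generally considered reliable only with more than ~500 samples.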
import time
import warnings
warnings.filterwarnings("ignore")
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')#日志只显示error级别的错误

from pckmeans import PCKMeans
from mpckmeans import MPCKMeans

##my
from labels2color import Labels2Color
#from topological_overlap_measure import TOMsimilarity
from topological_overlap_measure import *
#from soothsayer.networks import topological_overlap_measure

from visualization import *
from my_functions import *

# ---------------------------------------------------------------------------
# Run configuration for one dataset: input/output roots, the similarity
# measure, and the per-run accumulator of clustering quality scores.
# ---------------------------------------------------------------------------
# cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD",  "PAAD"]#
mldir = "E:/Project/Project001 WGCNA/main/step-6-FactorAnalyzer/outdir"
data_indir = "E:/Project/Project001 WGCNA/main/step-3-FeatureSelection/outdir"
labelsdir = "E:/Project/Project001 WGCNA/main/step-4-wgcna/outdir"
out_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/outdir"
dcorr_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/Calculate the correlation coefficient/outdir"

distance = "dcorr"  # other values handled by this script: "pearson", "snn"
power = 20
filename = 'BLCA.tcga_gtex.tpm.updown.feature_selection.csv'
# filename = 'BLCA.tumor.tpm.updown.feature_selection.csv'

# The file name is dot-separated: token 0 is the dataset prefix ("BLCA") and
# token 1 ("tcga_gtex") selects the sub-directory under every root.
prefix = filename.split('.')[0]
dataset_tag = filename.split('.')[1]

mlDir = '/'.join([mldir, dataset_tag])
labelsDir = '/'.join([labelsdir, dataset_tag])
outdir = '/'.join([out_dir, dataset_tag, distance])
dcorrDir = '/'.join([dcorr_dir, dataset_tag])

# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
# if the directory appeared between the check and the creation.
os.makedirs(os.path.join(outdir, prefix), exist_ok=True)

# Quality scores per method, filled in by the later cells.
assess = dict(wgcna = dict(SI = 0, CH = 0, DBI = 0, module_density_max = 0)
              , pckmeans = dict(SI = 0, CH = 0, DBI = 0, module_density_max = 0))

In [None]:
##########################################
### Build the gene-gene similarity matrix (input to the dissTOM computation) ###
##########################################
datExpr = pd.read_csv(os.path.join(data_indir, prefix, filename), sep=',', index_col=0)

# Strip the real extension. `filename.rstrip('.csv')` removes any trailing run
# of the characters '.', 'c', 's', 'v' rather than the ".csv" suffix, so it
# would truncate a stem that happens to end in one of those letters.
file_stem = os.path.splitext(filename)[0]

if distance == "pearson":
    print("Input: datExpr.shape", datExpr.shape)
    print("NAN check for datExpr: ", datExpr.isnull().values.sum())
    sim = datExpr.T.corr()  # Pearson correlation between the rows of datExpr
elif distance == "snn":
    print("Input: datExpr.shape", datExpr.shape)
    print("NAN check for datExpr: ", datExpr.isnull().values.sum())
    sim = snn_sim_matrix(datExpr, k=5)
elif distance == "dcorr":
    # Precomputed matrix read from disk; rows are labelled with the column names.
    sim = pd.read_csv(os.path.join(dcorrDir, prefix, file_stem + ".dcorr.csv"), sep=',')
    sim.index = sim.columns
else:
    # Previously this branch only printed a message and execution continued
    # into a NameError on `sim`; fail here with the actual cause instead.
    raise ValueError("input parameter error: undefined distance type %r" % (distance,))

print("NAN check: ", sim.isnull().values.sum())
if sim.isnull().values.any():
    # Drop rows that are entirely NaN, then columns that are entirely NaN in
    # what remains, and finally fill the leftover gaps with 0. The row filter
    # is positional, so it no longer depends on row labels equalling column labels.
    sim = sim.loc[~sim.isnull().all(axis=1).values, :]
    sim = sim.loc[:, ~sim.isnull().all(axis=0).values]
    sim = sim.fillna(0)
print("After processing NA: sim.shape", sim.shape)
sim.to_csv(os.path.join(outdir, prefix, file_stem + "." + distance + ".sim.csv"), index=False, sep=',')


# Pick the soft-thresholding power. pickSoftThreshold is a project helper (the
# original note says it calls an R script); its result is indexed below with
# "powerEstimate" and "fitIndices".
sft = pickSoftThreshold(sim, power=power, RsquaredCut = 0.75, prefix=prefix, outdir=outdir)
print('powerEstimate:%s' % sft["powerEstimate"])
# NOTE(review): no "." separates the file stem from "sft_fitIndices.csv" in this
# output name - confirm whether that is intended.
sft["fitIndices"].to_csv(os.path.join(outdir, prefix, filename.rstrip('.csv')+"sft_fitIndices.csv"), index=False, sep=',')


# Check scale-free topology.
# Build the adjacency matrix by soft thresholding: |similarity| raised to the
# estimated power (sft["powerEstimate"], not a fixed beta), which approximates
# a scale-free network.
ADJ = abs(sim)**sft["powerEstimate"] # approximate scale-free transformation
print("Input: ADJ", "Max: {}".format(ADJ.values.ravel().max()), "Min: {}".format(ADJ.values.ravel().min()), sep="\n")
print("NAN check: ", ADJ.isnull().values.sum())
ADJ.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".ADJ.csv"), index=False, sep=',')

# Connectivity matrix: the adjacency values with the diagonal (self-connections) set to 0.
# NOTE(review): ADJ.values may be a view onto ADJ's data rather than a copy. If
# so, the assignment below also zeroes ADJ's diagonal *after* ADJ was written to
# disk and *before* TOMsimilarity(ADJ) runs - confirm this is intended.
Connectivity = ADJ.values
row, col = np.diag_indices_from(Connectivity)
Connectivity[row, col] = 0
# Rows get a default integer index here; only the columns keep ADJ's labels.
Connectivity = pd.DataFrame(Connectivity, columns=ADJ.columns)
Connectivity.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".Connectivity.csv"), index=False, sep=',')

# Compute topological overlap
#from topological_overlap_measure import TOMsimilarity
from topological_overlap_measure import *
#from soothsayer.networks import topological_overlap_measure
TOM = TOMsimilarity(ADJ)# per the original note: turns the adjacency matrix into a topological overlap matrix
dissTOM = 1 - TOM  # dissimilarity = 1 - topological overlap

TOM.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".TOM.csv"), index=False, sep=',')
dissTOM.to_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".dissTOM.csv"), index=False, sep=',')

# X is the matrix handed to the clustering and scoring calls in the later cells.
X = dissTOM.copy()


# Project the expression matrix onto 3 components for the scatter plots below.
# UMAP is tried first; on any failure (import or fit) the error is reported and
# t-SNE is used instead.
try:
    import umap.umap_ as umap
    from sklearn.manifold import TSNE
    umap_reducer = umap.UMAP(n_components=3)
    X_Dim = umap_reducer.fit_transform(datExpr)
except Exception as e:
    rule = "=============================="
    print("\n" + rule)
    print("datExpr dimension reduction failed.\n")
    print(e)
    print(rule + "\n")
    tsne_reducer = TSNE(n_components=3, learning_rate=100, random_state=0)
    X_Dim = tsne_reducer.fit_transform(datExpr)


In [None]:
##########################################
### Visualise the WGCNA clustering result as the reference ###
##########################################
print("WGCNA==========>")
method = 'WGCNA'

# Strip the real extension. `filename.rstrip('.csv')` removes any trailing run
# of the characters '.', 'c', 's', 'v' rather than the ".csv" suffix.
file_stem = os.path.splitext(filename)[0]
out_base = os.path.join(outdir, prefix, file_stem)

# WGCNA module assignment per gene, re-ordered to follow the rows of X.
wgcna_y_pred = pd.read_csv(os.path.join(labelsDir, prefix, prefix+".wgcna.result.csv"), index_col=False)
wgcna_y_pred.index = wgcna_y_pred["gene"].values
wgcna_y_pred = wgcna_y_pred.loc[X.index]
wgcna_labels = wgcna_y_pred["dynamicMods"].values

# Genes per module. Series.value_counts replaces the deprecated top-level
# pd.value_counts; the frame keeps the module label as its index, as before.
labels_counts = pd.Series(wgcna_labels).value_counts(sort=False).to_frame()
labels_counts.insert(0, "clusters", labels_counts.index)
labels_counts.columns = ["clusters", "counts"]
labels_counts.to_csv(out_base + ".%s_clustering.labels_counts.csv" % method, index=False)
pd.DataFrame({"gene": np.asarray(X.index), "labels": wgcna_labels}).to_csv(
    out_base + ".%s_clustering.cluster_labels.csv" % method, index=False)

unique_labels = np.unique(wgcna_labels)  # np.unique already returns sorted values
# Number of clusters; a label of -1 denotes noise points and is not counted.
n_clusters_ = len(unique_labels) - (1 if -1 in wgcna_labels else 0)
WGCNA_K = n_clusters_

# Internal validity indices of the WGCNA partition, computed on X.
SI = silhouette_score(X, wgcna_labels)
CH = calinski_harabasz_score(X, wgcna_labels)
DBI = davies_bouldin_score(X, wgcna_labels)
sample_silhouette_values = silhouette_samples(X, wgcna_labels)

assess['wgcna']["SI"] = SI
assess['wgcna']["CH"] = CH
assess['wgcna']["DBI"] = DBI
print("For n_clusters =", n_clusters_,
        "\nThe average silhouette_score is :", SI)
print("Calinski-Harabasz Score",  CH,
         "\nDavies Bouldin score is :", DBI)

colors = Labels2Color(X, wgcna_labels)['colorcode']
SilhouetteAnalysis(X_Dim=X_Dim
                   , labels=wgcna_labels
                   , SI=SI
                   , sample_silhouette_values=sample_silhouette_values
                   , dirPrefix=out_base + ".Silhouette analysis for WGCNA clustering on %s data" % prefix
                   , suptitle="Silhouette analysis for WGCNA clustering on %s data with n_clusters = %d" % (prefix, WGCNA_K)
                   , colors=colors
                   , D3=False
                   , showplot=True
                  )

### Per-module gene count, mean connectivity and density ###
module_sizes = []
within_totals = []
for k in unique_labels:
    members = wgcna_labels == k
    module_sizes.append(int(members.sum()))
    # Sum of all within-module connectivity entries, ignoring NaN.
    within_totals.append(np.nansum(Connectivity.loc[members, members].values))

mean_connectivity = [total / size for total, size in zip(within_totals, module_sizes)]
# Density = total / (n * (n - 1)). A single-gene module has no gene pairs, so
# its density is reported as NaN instead of dividing by zero.
module_density = [total / (size * (size - 1)) if size > 1 else float("nan")
                  for total, size in zip(within_totals, module_sizes)]
# nanmax skips undefined densities; plain max() is order-dependent with NaN.
assess['wgcna']['module_density_max'] = float(np.nanmax(module_density))

# Bar chart: number of genes in each module.
bar(x=labels_counts.loc[:, "clusters"], y=labels_counts.loc[:, "counts"]
    , dirPrefix=out_base + ".wgcna.Number of genes in the module"
    , xlabel="Module"
    , ylabel="Number of genes")
genes_count = pd.DataFrame.from_dict({"Module": labels_counts.loc[:, "clusters"]
                                      , "Number of genes": labels_counts.loc[:, "counts"]})

# Bar chart: mean within-module connectivity.
bar(x=unique_labels, y=mean_connectivity
    , dirPrefix=out_base + ".wgcna.Mean_Connectivity_of_module"
    , xlabel="Module"
    , ylabel="Mean Connectivity")
Mean_Connectivity = pd.DataFrame.from_dict({'Module': unique_labels
                                            , 'Mean Connectivity': mean_connectivity})

# Bar chart: module density.
bar(x=unique_labels, y=module_density
    , dirPrefix=out_base + ".wgcna.density_of_module"
    , xlabel="Module"
    , ylabel="Density of module")
Density = pd.DataFrame.from_dict({'Module': unique_labels, 'Density': module_density})

# One table per module with all three statistics.
summary = (genes_count
           .merge(Mean_Connectivity, on=['Module'], how='outer')
           .merge(Density, on=['Module'], how='outer'))
summary.sort_values(by='Module', axis=0, ascending=True, inplace=True, ignore_index=True)
summary.to_csv(out_base + ".wgcna.summary_of_modules.csv", index=False, sep=',')

In [None]:
##########################################
### Learning curve of the PCKMeans parameter K ###
##########################################
print("Calculate the learning curve for K ==========>")
from itertools import combinations

score = {'SI': [], 'CH': [], 'DBI': []}

# Read the constraint gene sets and build the pairwise must-link constraints.
# The first line lists the genes; every following line is one gene set. Each
# line is tab-separated and its last field (the trailing newline token) is dropped.
with open(os.path.join(mlDir, prefix, prefix+".ml.txt"), 'r') as df:
    geneslist = df.readline().split("\t")
    geneslist.pop()
    byt = [line.split("\t")[:-1] for line in df]

# First position of every gene, i.e. what list.index() returned, but as an
# O(1) lookup. An unknown gene now raises KeyError instead of ValueError.
gene_position = {}
for pos, gene in enumerate(geneslist):
    gene_position.setdefault(gene, pos)

# NOTE(review): the pairs index into `geneslist`, while PCKMeans receives the
# rows of X - confirm that `geneslist` and X.index are in the same order.
ml = set()
for gene_set in byt:
    for gene_a, gene_b in combinations(gene_set, 2):
        ml.add((gene_position[gene_a], gene_position[gene_b]))
print("Number of pairwise constraints:%d\n" % len(ml))

K_sequence = range(2, math.ceil(WGCNA_K*1.2), 1) #range(6, 11, 1)
print("K = ", list(K_sequence))
for k in K_sequence:
    print("---->K = %d" % k)
    try:
        ### PCKMeans clustering for this K ###
        t1 = time.time()
        pckm = PCKMeans(n_clusters=k) #, distance_type='euclidean'
        pckm.fit(np.array(X), ml=ml)
        t2 = time.time()
        pckm_labels = pckm.labels_
        n_clusters_ =len(np.unique(pckm_labels)) - (1 if -1 in pckm_labels else 0)

        SI = silhouette_score(X, pckm_labels)
        CH = calinski_harabasz_score(X, pckm_labels)
        DBI = davies_bouldin_score(X, pckm_labels)
    except Exception as e:
        print("\n==============================")
        print("    Error processing %s" % k)
        print("    ", e)
        # A failed K is recorded as NaN, not 0: a 0 could win the SI arg-max
        # when every real score is negative, and reads as a perfect DBI.
        for metric_key in score:
            score[metric_key].append(float("nan"))
        print("==============================\n")
        continue

    score["SI"].append(SI)
    score["CH"].append(CH)
    score["DBI"].append(DBI)
    print("The average silhouette_score is :%.4f" % SI)

# Best K = first maximum of the silhouette curve, ignoring failed runs.
si_curve = np.asarray(score["SI"], dtype=float)
if si_curve.size and not np.all(np.isnan(si_curve)):
    default_K = K_sequence[int(np.nanargmax(si_curve))]
    print('The optimal K value:%d' % default_K)
else:
    # Empty K range or every run failed: fall back to the WGCNA module count.
    default_K = WGCNA_K
    print('No K could be evaluated; falling back to the WGCNA K value:%d' % default_K)
xticks = list(K_sequence)

# Strip the real extension; str.rstrip('.csv') strips a character set, not a suffix.
file_stem = os.path.splitext(filename)[0]
pd.DataFrame.from_dict({'K_sequence':list(K_sequence) + [WGCNA_K]
                        ,'SI':score["SI"] + [assess['wgcna']["SI"]]
                        ,"CH":score["CH"] + [assess['wgcna']["CH"]]
                        ,"DBI":score["DBI"] + [assess['wgcna']["DBI"]]
                        ,'type':['pckmeans']*len(K_sequence)+['wgcna']}
               ).to_csv(os.path.join(outdir, prefix, "%s.K_learn-curve.csv" % file_stem), index=False, sep=',')



###################################################
### Visualization of the learning curve  ###
###################################################
# Three side-by-side panels (SI, CH, DBI). Each shows the PCKMeans score over K
# as a dashed green line and the WGCNA reference as a single red star.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(18, 6)

curve_panels = (
    (ax1, "SI", 'Silhouette Coefficient Score'),
    (ax2, "CH", 'Calinski Harabasz Score'),
    (ax3, "DBI", 'Davies Bouldin Score'),
)
for panel_ax, metric_key, y_text in curve_panels:
    panel_ax.plot(xticks, score[metric_key]
                  , color = 'green', linewidth = 1.0, linestyle = '--', label='ScC-WGCNA'
                  , marker='^', markerfacecolor='green', markersize=10)
    panel_ax.scatter(WGCNA_K, assess['wgcna'][metric_key], s=100, c='r', marker='*',alpha=0.8)
    panel_ax.set_xlabel('K', fontdict={'size': 16})
    panel_ax.set_ylabel(y_text, fontdict={'size': 16})
    panel_ax.set_title(metric_key, fontdict={'size': 18})
    panel_ax.legend(loc='best')

plt.suptitle(("Learning curve for K values of ScC-WGCNA clustering on %s data" % prefix), fontsize=20, fontweight='bold')
fig = plt.gcf()

# Same figure written twice: a vector PDF and a high-resolution PNG.
curve_pdf = "%s.Learning curve for K values of ScC-WGCNA clustering on %s data.pdf" % (filename.rstrip('.csv'), filename.split('.')[0])
curve_png = "%s.Learning curve for K values of ScC-WGCNA clustering on %s data.png" % (filename.rstrip('.csv'), filename.split('.')[0])
fig.savefig(os.path.join(outdir, prefix, curve_pdf), bbox_inches='tight')
fig.savefig(os.path.join(outdir, prefix, curve_png), dpi=1080, bbox_inches='tight')
plt.close()



In [None]:
##############################################################################################################################
### Final ScC-WGCNA (PCKMeans) clustering, scored with the same indices as WGCNA; K is taken from the WGCNA module count ###
##############################################################################################################################
print("PCKMeans==========>")
from itertools import combinations

K = WGCNA_K # default_K if default_K>3 else K_sequence[score["SI"].index(np.max(score["SI"][1:]))]

# Strip the real extension; str.rstrip('csv') strips a character set, not a suffix.
file_stem = os.path.splitext(filename)[0]
out_base = os.path.join(outdir, prefix, file_stem + "." + distance)

# Read the constraint gene sets and build the pairwise must-link constraints.
# The `with` block closes the file, which was previously left open.
with open(os.path.join(mlDir, prefix, prefix+".ml.txt"), 'r') as df:
    geneslist = df.readline().split("\t")
    geneslist.pop()  # drop the trailing newline token
    byt = [line.split("\t")[:-1] for line in df]

# First position of every gene, i.e. what list.index() returned, but as an
# O(1) lookup. An unknown gene now raises KeyError instead of ValueError.
gene_position = {}
for pos, gene in enumerate(geneslist):
    gene_position.setdefault(gene, pos)

# NOTE(review): the pairs index into `geneslist`, while PCKMeans receives the
# rows of X - confirm that `geneslist` and X.index are in the same order.
ml = set()
for gene_set in byt:
    for gene_a, gene_b in combinations(gene_set, 2):
        ml.add((gene_position[gene_a], gene_position[gene_b]))
print("Number of pairwise constraints:%d\n" % len(ml))


### PCKMeans clustering ###
t1 = time.time()
pckm = PCKMeans(n_clusters=K)#, distance_type='euclidean'
pckm.fit(np.array(X), ml=ml)
t2 = time.time()
print ("the time of clustering is %.5fs" % (t2 - t1))
pckm_labels = pckm.labels_

# Genes per cluster. Series.value_counts replaces the deprecated top-level
# pd.value_counts; the frame keeps the cluster label as its index, as before.
print("value_counts:\n", pd.DataFrame(pd.Series(pckm_labels).value_counts()).T)
labels_counts = pd.Series(pckm_labels).value_counts(sort=False).to_frame()
labels_counts.insert(0, "clusters", labels_counts.index)
labels_counts.columns = ["clusters", "counts"]
labels_counts.to_csv(out_base + ".PCKMeans_clustering.labels_counts.csv", index=False)
pd.DataFrame({"gene": np.asarray(X.index), "labels": pckm_labels}).to_csv(
    out_base + ".PCKMeans_clustering.cluster_labels.csv", index=False)

unique_labels = np.unique(pckm_labels)  # np.unique already returns sorted values
n_clusters_ = len(unique_labels) - (1 if -1 in pckm_labels else 0)


# Internal validity indices of the PCKMeans partition, computed on X.
SI = silhouette_score(X, pckm_labels)
CH = calinski_harabasz_score(X, pckm_labels)
DBI = davies_bouldin_score(X, pckm_labels)
assess['pckmeans']["SI"] = SI
assess['pckmeans']["CH"] = CH
assess['pckmeans']["DBI"] = DBI
print("For n_clusters =", n_clusters_,
        "\nThe average silhouette_score is :", SI)
print("Calinski-Harabasz Score",  CH,
         "\nDavies Bouldin score is :", DBI)

sample_silhouette_values = silhouette_samples(X, pckm_labels)

# Silhouette analysis for clustering
colors = Labels2Color(X, pckm_labels)['colorcode']
SilhouetteAnalysis(X_Dim=X_Dim
                   , labels=pckm_labels
                   , SI=SI
                   , sample_silhouette_values=sample_silhouette_values
                   , dirPrefix=out_base + ".Silhouette analysis for ScC-WGCNA clustering on %s data" % prefix
                   , suptitle="Silhouette analysis for ScC-WGCNA clustering on %s data with n_clusters = %d" % (prefix, n_clusters_)
                   , colors=colors
                   , showplot=True
                   , D3=False
                  )

### Per-module gene count, mean connectivity and density ###
module_sizes = []
within_totals = []
for k in unique_labels:
    members = pckm_labels == k
    module_sizes.append(int(members.sum()))
    # Sum of all within-module connectivity entries, ignoring NaN.
    within_totals.append(np.nansum(Connectivity.loc[members, members].values))

mean_connectivity = [total / size for total, size in zip(within_totals, module_sizes)]
# Density = total / (n * (n - 1)). A single-gene module has no gene pairs, so
# its density is reported as NaN instead of dividing by zero.
module_density = [total / (size * (size - 1)) if size > 1 else float("nan")
                  for total, size in zip(within_totals, module_sizes)]
# nanmax skips undefined densities; plain max() is order-dependent with NaN.
assess['pckmeans']['module_density_max'] = float(np.nanmax(module_density))

# Written only now that every field is filled. This file used to be saved
# before module_density_max was computed, so it always stored the initial 0
# for pckmeans.
pd.DataFrame.from_dict(assess).to_csv(
    os.path.join(outdir, prefix, file_stem + ".ScC-WGCNA_vs_WGCNA.assess.csv"), sep=',', index=True)

# Bar chart: number of genes in each module.
bar(x=labels_counts.loc[:, "clusters"], y=labels_counts.loc[:, "counts"]
    , dirPrefix=out_base + ".Number of genes in the module"
    , xlabel="Module"
    , ylabel="Number of genes")
genes_count = pd.DataFrame.from_dict({"Module": labels_counts.loc[:, "clusters"]
                                      , "Number of genes": labels_counts.loc[:, "counts"]})

# Bar chart: mean within-module connectivity.
bar(x=unique_labels, y=mean_connectivity
    , dirPrefix=out_base + ".Mean_Connectivity_of_module"
    , xlabel="Module"
    , ylabel="Mean Connectivity")
Mean_Connectivity = pd.DataFrame.from_dict({'Module': unique_labels, 'Mean Connectivity': mean_connectivity})

# Bar chart: module density.
bar(x=unique_labels, y=module_density
    , dirPrefix=out_base + ".density_of_module"
    , xlabel="Module"
    , ylabel="Density")
Density = pd.DataFrame.from_dict({'Module': unique_labels, 'Density': module_density})

# One table per module with all three statistics.
summary = (genes_count
           .merge(Mean_Connectivity, on=['Module'], how='outer')
           .merge(Density, on=['Module'], how='outer'))
summary.sort_values(by='Module', axis=0, ascending=True, inplace=True, ignore_index=True)
summary.to_csv(out_base + ".summary_of_modules.csv", index=False, sep=',')

In [None]:
###########################################################
### The visualization of the clustered data ###
###########################################################

# Optional: uncomment to rebuild the inputs from disk when this cell is run on its own.
# datExpr = pd.read_csv(os.path.join(data_indir, prefix, filename), sep=',', index_col=0)

# import umap.umap_ as umap
# X_Dim = umap.UMAP(n_components=3).fit_transform(datExpr)

# pckm_labels = pd.read_csv(os.path.join(outdir, prefix, filename.rstrip('csv')+distance+".PCKMeans_clustering.cluster_labels.csv"))['labels'].tolist()
# wgcna_labels = pd.read_csv(os.path.join(outdir, prefix, filename.rstrip('.csv')+".WGCNA_clustering.cluster_labels.csv"))['labels'].tolist()

# WGCNA_K = len(np.unique(wgcna_labels))
# K = WGCNA_K

# Font settings come first so the axes created below (tick labels included)
# are built with them.
plt.rcParams.update({'font.family': 'Times New Roman'})
plt.rcParams.update({'font.weight': 'normal'})
plt.rcParams.update({'font.size': 20})

# A single figure with two side-by-side panels. The previous version opened an
# extra figure with plt.figure() that was never used or closed, and hid the
# subplot axes only to add new ones on top of them.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Left panel: genes in the first two embedding dimensions, coloured by WGCNA module.
colors1 = Labels2Color(X, wgcna_labels)['colorcode']
ax1.scatter(X_Dim[:, 0], X_Dim[:, 1]
           ,marker='o'
           ,s=4
           ,c=colors1
           )
ax1.set_title("WGCNA with n_clusters = %d" % WGCNA_K, fontsize=18)
ax1.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax1.set_ylabel("Feature space for the 2nd feature", fontsize=16)

# Right panel: the same embedding, coloured by ScC-WGCNA (PCKMeans) cluster.
colors2 = Labels2Color(X, pckm_labels)['colorcode']
ax2.scatter(X_Dim[:, 0], X_Dim[:, 1]
           ,marker='o'
           ,s=4
           ,c=colors2
           )
ax2.set_title("ScC-WGCNA with n_clusters = %d" % K, fontsize=18)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=16)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=16)

fig.suptitle(("The visualization of %s data clustering results" % prefix),fontsize=20, fontweight='bold')

# Strip the real extension; str.rstrip('csv') strips a character set, not a suffix.
file_stem = os.path.splitext(filename)[0]
scatter_base = os.path.join(
    outdir, prefix,
    file_stem + "." + distance + ".the visualization of %s data with WGCNA and ScC-WGCNA clustering results" % prefix)
fig.savefig(scatter_base + ".png", bbox_inches='tight', dpi=1080)
fig.savefig(scatter_base + ".pdf", bbox_inches='tight')
# plt.show()
plt.close(fig)  # close exactly the figure created above

### summary

In [None]:
# ---------------------------------------------------------------------------
# Configuration for the cross-dataset summary. `filename` has an empty first
# token, so `prefix` is "" here; the summary loops prepend each cancer name.
# ---------------------------------------------------------------------------
mldir = "E:/Project/Project001 WGCNA/main/step-6-FactorAnalyzer/outdir"
# data_indir = "E:/Project/Project001 WGCNA/main/step-3-FeatureSelection/outdir"
# labelsdir = "E:/Project/Project001 WGCNA/main/step-4-wgcna/outdir"
out_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/semi_supervised Kmeans/outdir"
# dcorr_dir = "E:/Project/Project001 WGCNA/main/step-7-clustering/Calculate the correlation coefficient/outdir"

distance = "dcorr" #"pearson" #

filename = '.tcga_gtex.tpm.updown.feature_selection.csv'
# filename = '.tumor.tpm.updown.feature_selection.csv'
prefix = filename.split('.')[0]

# Token 1 of the dot-separated file name ("tcga_gtex") selects the sub-directory.
dataset_tag = filename.split('.')[1]
mlDir = '/'.join([mldir, dataset_tag])
# labelsDir = '/'.join([labelsdir, filename.split('.')[1]])
outdir = '/'.join([out_dir, dataset_tag, distance])
# dcorrDir = '/'.join([dcorr_dir, filename.split('.')[1]])

# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
# if the directory appeared between the check and the creation.
os.makedirs(os.path.join(outdir, prefix), exist_ok=True)


cancer_names = ["BLCA", "BRCA", "COAD", "KIRC", "LUAD", "LUSC", "STAD",  "PAAD"]#
distance_metrics = ["euclidean", "snn", "dcorr", "pearson"]

In [None]:
########################
####### summary  #######
########################
from itertools import combinations

# Scores per method, keyed by metric and then by dataset.
assess = dict(wgcna = dict(datasets = {}, SI = {}, CH = {}, DBI = {}, module_density_max = {})
              , pckmeans = dict(datasets = {}, SI = {}, CH = {}, DBI = {}, module_density_max = {}))

# Strip the real extension; str.rstrip('.csv') strips a character set, not a suffix.
file_stem = os.path.splitext(filename)[0]

# Collect the per-dataset assessment files written by the clustering cell.
for cancer in cancer_names:
    prefix = cancer
    temp = pd.read_csv(os.path.join(outdir, prefix, prefix + file_stem + ".ScC-WGCNA_vs_WGCNA.assess.csv"), sep=',', index_col=0)
    for key1 in ['wgcna', 'pckmeans']:
        assess[key1]['datasets'][cancer] = cancer
        for key2 in ['SI', 'CH', 'DBI', 'module_density_max']:
            # Rows are metrics and columns are methods in the saved file.
            assess[key1][key2][cancer] = temp.loc[key2, key1]

# Side-by-side table: one row per dataset, metric columns suffixed by method.
temp1 = pd.DataFrame.from_dict(assess['pckmeans'])
temp1.columns = ['.'.join([x, 'ScC-WGCNA']) if x!='datasets' else x for x in temp1.columns if isinstance(x, str)]
temp2 = pd.DataFrame.from_dict(assess['wgcna'])
temp2.columns = ['.'.join([x, 'WGCNA']) if x!='datasets' else x for x in temp2.columns if isinstance(x, str)]
pd.merge(temp1, temp2, on='datasets', how='outer').to_csv(os.path.join(outdir, distance+".ScC-WGCNA_vs_WGCNA.assess.csv"), index=False)

# Maximum module density per method (rows) and dataset (columns).
module_density_max = {}
module_density_max['WGCNA'] = assess['wgcna']['module_density_max']
module_density_max['ScC-WGCNA'] = assess['pckmeans']['module_density_max']
moduleDensity_max = pd.DataFrame.from_dict(module_density_max, orient= 'index').round(4)
moduleDensity_max.to_csv(os.path.join(outdir, distance+".module_density_max.csv"), index=True)

# Statistics of the pairwise constraints of every dataset.
# NOTE: 'Number of constrained gene' stores the *fraction* of constrained genes
# (rounded to 4 decimals); the key is kept because it is the row label in the CSV.
ml_summary = {'Number of gene':{}, 'Number of gene pair':{}, 'Number of constrained gene':{}}
for cancer in cancer_names:
    prefix = cancer
    # Read the constraint gene sets; the `with` block closes each file, which
    # was previously left open for every dataset.
    with open(os.path.join(mlDir, prefix, prefix+".ml.txt"), 'r') as df:
        geneslist = df.readline().split("\t")
        geneslist.pop()  # drop the trailing newline token
        byt = [line.split("\t")[:-1] for line in df]
    ml_summary['Number of gene'][prefix] = len(geneslist)

    # First position of every gene, i.e. what list.index() returned, but as an
    # O(1) lookup. An unknown gene now raises KeyError instead of ValueError.
    gene_position = {}
    for pos, gene in enumerate(geneslist):
        gene_position.setdefault(gene, pos)

    ml = set()
    for gene_set in byt:
        for gene_a, gene_b in combinations(gene_set, 2):
            ml.add((gene_position[gene_a], gene_position[gene_b]))
    ml_summary['Number of gene pair'][prefix] = len(ml)
    print("Number of pairwise constraints:%d\n" % len(ml))

    # Distinct genes that take part in at least one constraint.
    constrained_genes = {gene_idx for pair in ml for gene_idx in pair}
    ml_summary['Number of constrained gene'][prefix] = (
        round(len(constrained_genes)/len(geneslist), 4) if geneslist else 0.0)
    print("Number of constrained gene:%d\n" % len(constrained_genes))

pd.DataFrame.from_dict(ml_summary, orient= 'index').to_csv(os.path.join(outdir, distance+".summary of pairwise constraints.csv"), index=True)

### The visualization of the  evaluation score

In [None]:
###########################################################
### The visualization of the  evaluation score  ###
###########################################################
# Three side-by-side panels (SI, CH, DBI); each compares WGCNA (red, solid)
# with ScC-WGCNA (green, dashed) across the datasets.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(24, 8)

score_panels = (
    (ax1, "SI", 'Silhouette Coefficient Score'),
    (ax2, "CH", 'Calinski Harabasz Score'),
    (ax3, "DBI", 'Davies Bouldin Score'),
)
for panel_ax, metric_key, y_text in score_panels:
    wgcna_scores = assess['wgcna'][metric_key]
    scc_scores = assess['pckmeans'][metric_key]
    panel_ax.plot(list(wgcna_scores.keys()), list(wgcna_scores.values()), color = 'red'
                  , linewidth = 1.0, linestyle = '-', label='WGCNA', marker='d', markerfacecolor='red', markersize=10)
    panel_ax.plot(list(scc_scores.keys()), list(scc_scores.values()), color = 'green'
                  , linewidth = 1.0, linestyle = '--', label='ScC-WGCNA', marker='^', markerfacecolor='green', markersize=10)
    panel_ax.set_xlabel('datasets', fontdict={'size': 16})
    panel_ax.set_ylabel(y_text, fontdict={'size': 16})
    panel_ax.set_title(metric_key, fontdict={'size': 18})
    panel_ax.legend(loc='best', fontsize = 12)
    # Style the tick labels the categorical axis already produces. The previous
    # set_xticklabels() call overwrote the label text without fixing the tick
    # positions, which can mislabel ticks if the two ever disagree.
    panel_ax.tick_params(axis='x', labelrotation=30, labelsize=12)
    panel_ax.tick_params(axis='y', labelsize=12)

fig.suptitle(("SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on datasets"), fontsize=20, fontweight='bold')
fig.savefig(os.path.join(outdir, distance+".SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on datasets.png")
            , bbox_inches='tight', dpi=1080)
fig.savefig(os.path.join(outdir, distance+".SI,CH and DBI scores for WGCNA and ScC-WGCNA clustering on datasets.pdf"), bbox_inches='tight')
plt.show()