In [1]:
import numpy as np
import pandas as pd
from util.util import DataSet
from util.get_sn import get_sn
from util.model import CPMNets
import util.classfiy as classfiy
from sklearn.model_selection import StratifiedKFold
import os
import warnings
import time
import psutil
import math
from util.utils2 import evaluate_model
from numpy.random import shuffle


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:

# 禁用警告
warnings.filterwarnings('ignore')


In [22]:

# Experiment configuration
scenario, signal_prop, signal_level = 1, "high", "high"
ratio = 0.75
Normal = 0      # the cross-validation cell normalises each view only when this equals 1
n_folds = 4     # 4-fold cross-validation
num_class = 3   # assumes the labels contain 3 classes

# Result table: one empty list per output column, in the order the columns are written.
# (AUC Macro/Micro/Weighted and AUPR Macro/Micro/Weighted columns are currently disabled.)
metrics = {
    column: []
    for column in [
        "Batch", "Fold", "Accuracy",
        "F1 Macro", "F1 Micro", "F1 Weighted",
        "Precision Macro", "Precision Micro", "Precision Weighted",
        "Recall Macro", "Recall Micro", "Recall Weighted",
        "Cohen Kappa", "Training_time", "CPU_memory_allocated",
    ]
}

# Accumulator for the per-sample predictions of every batch and fold.
all_predictions_df = pd.DataFrame()

In [None]:

# Main loop: for each simulated batch (1-25) load the three omics views, run
# stratified k-fold cross-validation with CPMNets, and record per-fold metrics
# and per-sample predictions. Results are written to CSV after the loop.
#
# Relies on names defined in earlier cells: scenario, signal_prop, signal_level,
# Normal, n_folds, num_class, metrics and all_predictions_df.

# Model hyper-parameters (identical for every batch and fold).
lsd_dim = 128
lamb = 1.
epoch = [50, 50]  # [epochs passed to model.train, epochs passed to model.test]
learning_rate = [0.01, 0.01]

# File-name suffix of each view, in view order.
view_suffixes = ("mrna", "meth", "prot")

# Handle on the current process, used to sample resident memory around training.
process = psutil.Process(os.getpid())

# Per-fold prediction frames are collected here and concatenated once after the
# loop; concatenating inside the loop copies the accumulated frame every fold.
fold_prediction_frames = []

for batch_num in range(1, 26):
    print(f"\n=== 处理 Batch {batch_num} ===")

    # Build the data location from the configured scenario / signal settings so
    # the directory always agrees with the file names inside it.
    sim_path = (
        f"/public/home/yilab5/Master/methodology/simdata2/scenario{scenario}/k-3/"
        f"prop-{signal_prop}/level-{signal_level}/sim{batch_num}"
    )
    file_prefix = f"{sim_path}/s{scenario}-k3-{signal_prop}-{signal_level}-batch{batch_num}"

    try:
        # Load each omics view as float32. Rows of the CSV (after the header row)
        # are the units that get split into train/test folds below.
        view_data = [
            np.loadtxt(f"{file_prefix}-{suffix}.csv", delimiter=',', skiprows=1).astype(np.float32)
            for suffix in view_suffixes
        ]
        view_number = len(view_data)

        labels = np.loadtxt(f"{file_prefix}-label.csv",
                            delimiter=',', skiprows=1).reshape(-1, 1).astype(int)

        # Shift 0-based labels so the smallest label becomes 1.
        if labels.min() == 0:
            labels = labels + 1

        # Stratified K-fold split with a fixed seed so folds are reproducible.
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

        for fold, (train_index, test_index) in enumerate(skf.split(view_data[0], labels.flatten())):
            print(f"  - 处理第 {fold+1} 折")

            labels_train = labels[train_index]
            labels_test = labels[test_index]

            # Split every view with the same row indices.
            X_train = []
            X_test = []
            for view in view_data:
                X_train_view = view[train_index]
                X_test_view = view[test_index]

                if Normal == 1:
                    # Statistics come from the training rows only and are then
                    # applied to both splits.
                    m = np.mean(X_train_view, axis=0)
                    mx = np.max(X_train_view, axis=0)
                    mn = np.min(X_train_view, axis=0)
                    range_val = mx - mn
                    range_val[range_val == 0] = 1  # avoid division by zero for constant columns

                    X_train_view = (X_train_view - m) / range_val
                    X_test_view = (X_test_view - m) / range_val

                X_train.append(X_train_view)
                X_test.append(X_test_view)

            trainData = DataSet(X_train, view_number, np.array(labels_train))
            testData = DataSet(X_test, view_number, np.array(labels_test))

            # Each view's network ends in a layer as wide as that view's column count.
            outdim_size = [trainData.data[str(i)].shape[1] for i in range(view_number)]
            layer_size = [[150, dim] for dim in outdim_size]

            # Missing-pattern matrix for all samples, generated with a missing rate of 0;
            # the first rows belong to the training samples, the rest to the test samples.
            n_total = trainData.num_examples + testData.num_examples
            Sn = get_sn(view_number, n_total, 0)
            Sn_train = Sn[np.arange(trainData.num_examples)]
            Sn_test = Sn[np.arange(testData.num_examples) + trainData.num_examples]

            # Time and memory are sampled around model construction, training AND the
            # test-time fitting step, so "Training_time" covers both phases.
            start_time = time.perf_counter()  # monotonic clock for measuring a duration
            mem_before = process.memory_info().rss

            model = CPMNets(view_number, trainData.num_examples, testData.num_examples,
                            layer_size, lsd_dim, learning_rate, lamb)

            model.train(trainData.data, Sn_train, trainData.labels.reshape(trainData.num_examples), epoch[0])
            H_train = model.get_h_train()

            # NOTE(review): the test labels are handed to model.test — confirm that
            # CPMNets.test does not fit on them, otherwise the evaluation leaks labels.
            model.test(testData.data, Sn_test, testData.labels.reshape(testData.num_examples), epoch[1])
            H_test = model.get_h_test()

            training_time = time.perf_counter() - start_time
            # Difference in resident set size (bytes); may be negative if memory was released.
            cpu_memory_allocated = process.memory_info().rss - mem_before

            # Predict and evaluate.
            label_pred, label_prob = classfiy.ave(H_train, H_test, trainData.labels)
            scores = evaluate_model(testData.labels, label_pred, label_prob, num_class)

            # Append exactly one value to every metrics column so all lists keep the
            # same length; a metric that evaluate_model did not return is stored as NaN
            # instead of leaving a gap that would make pd.DataFrame(metrics) fail.
            fold_record = {
                "Batch": batch_num,
                "Fold": fold + 1,
                "Training_time": training_time,
                "CPU_memory_allocated": cpu_memory_allocated,
            }
            for column, values in metrics.items():
                if column in fold_record:
                    values.append(fold_record[column])
                elif column in scores:
                    values.append(scores[column])
                else:
                    values.append(np.nan)

            # Per-sample predictions for this batch and fold.
            fold_predictions = pd.DataFrame({
                "Batch": [batch_num] * len(test_index),
                "Fold": [fold+1] * len(test_index),
                "True_Label": testData.labels.flatten(),
                "Pred_Label": label_pred,
            })

            # One probability column per column of label_prob.
            for i in range(label_prob.shape[1]):
                fold_predictions[f"Prob_Class_{i}"] = np.round(label_prob[:, i], 4)

            fold_prediction_frames.append(fold_predictions)

            print(f"    训练时间: {training_time:.2f}秒, 内存使用: {cpu_memory_allocated/1024/1024:.2f}MB")
            print(f"    准确率: {scores['Accuracy']:.4f}")

    except Exception as e:
        # Best effort: report the failure and move on to the next batch. Folds of
        # this batch that finished before the error stay in the results.
        print(f"处理 Batch {batch_num} 时出错: {str(e)}")
        continue

# Add this run's predictions to the accumulator with a single concatenation.
if fold_prediction_frames:
    all_predictions_df = pd.concat([all_predictions_df, *fold_prediction_frames], ignore_index=True)

# Per-fold metrics table.
metrics_df = pd.DataFrame(metrics)

# Write both tables to CSV.
# NOTE(review): the "-high" suffix in the file names is hard-coded — confirm whether
# it should follow signal_prop or signal_level when the configuration changes.
output_dir = "/public/home/yilab5/Master/methodology/result/result_CPM/res-simdata2"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "cpmnets_cross_validation_results-high.csv")
metrics_df.to_csv(output_path, index=False)

predictions_path = os.path.join(output_dir, "cpmnets_all_predictions-high.csv")
all_predictions_df.to_csv(predictions_path, index=False)

print("\n=== 所有批次和折次处理完成 ===")
print(f"结果已保存至: {output_path}")


=== 处理 Batch 1 ===
  - 处理第 1 折
Epoch : 1  ===> Reconstruction Loss = 89195.2031, Classification Loss = 279.6752 
Epoch : 2  ===> Reconstruction Loss = 62954.1055, Classification Loss = 236.8226 
Epoch : 3  ===> Reconstruction Loss = 53773.5234, Classification Loss = 211.4652 
Epoch : 4  ===> Reconstruction Loss = 46523.4609, Classification Loss = 189.3281 
Epoch : 5  ===> Reconstruction Loss = 41392.4492, Classification Loss = 155.1659 
Epoch : 6  ===> Reconstruction Loss = 36990.0547, Classification Loss = 118.1374 
Epoch : 7  ===> Reconstruction Loss = 33788.6328, Classification Loss = 81.2089 
Epoch : 8  ===> Reconstruction Loss = 30564.5078, Classification Loss = 52.5948 
Epoch : 9  ===> Reconstruction Loss = 27829.3535, Classification Loss = 40.1152 
Epoch : 10  ===> Reconstruction Loss = 26629.7969, Classification Loss = 35.7648 
Epoch : 11  ===> Reconstruction Loss = 24864.7344, Classification Loss = 28.5873 
Epoch : 12  ===> Reconstruction Loss = 24129.7266, Classification Los