In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import time

# 定义自注意力层
class SelfAttention(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(input_dim, output_dim)
        self.key = nn.Linear(input_dim, output_dim)
        self.value = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        attention_scores = torch.matmul(q, k.transpose(-2, -1))
        attention_scores = attention_scores / (k.size(-1) ** 0.5)
        attention_weights = torch.softmax(attention_scores, dim=-1)

        output = torch.matmul(attention_weights, v)
        return output

# 定义多头注意力层
class MultiHeadAttention(nn.Module):
    def __init__(self, input_dim, num_heads, output_dim):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.attention_heads = nn.ModuleList([
            SelfAttention(input_dim, output_dim // num_heads)
            for _ in range(num_heads)
        ])
        self.output_layer = nn.Linear(output_dim, output_dim)

    def forward(self, x):
        attention_outputs = [head(x) for head in self.attention_heads]
        concat_output = torch.cat(attention_outputs, dim=-1)
        output = self.output_layer(concat_output)
        return output
        
# 特征提取器
class FeatureExtractor(nn.Module):
    def __init__(self, in_channels):
        super(FeatureExtractor, self).__init__()
        self.conv = nn.Conv2d(in_channels, 1, kernel_size=1)

    def forward(self, x):
        print(f"Input shape: {x.shape}")
        x = self.conv(x)
        print(f"Output shape after conv: {x.shape}")
        x = x.squeeze(0)
        print(f"Output shape after squeeze: {x.shape}")
        return x

start_time = time.time()

# 读取DNA甲基化数据并进行特征选择
methy_data_list = []
target_dim = 512
num_heads = 8

for i in range(1, 23):
    file_path = f'../data/resampled_methy_data_by_chrom/resampled_methy_data_chr{i}.csv'
    methy_data = pd.read_csv(file_path, index_col=0)
    
    # 使用多头注意力层调整特征数
    methy_data_tensor = torch.tensor(methy_data.values, dtype=torch.float32)
    attention_layer = MultiHeadAttention(methy_data_tensor.size(1), num_heads, target_dim)
    methy_data_adjusted = attention_layer(methy_data_tensor)
    
    methy_data_list.append(methy_data_adjusted.detach().numpy())

file_path = '../data/resampled_methy_data_by_chrom/resampled_methy_data_chrX.csv'
methy_data_chrX = pd.read_csv(file_path, index_col=0)

# 使用多头注意力层调整特征数
methy_data_chrX_tensor = torch.tensor(methy_data_chrX.values, dtype=torch.float32)
attention_layer = MultiHeadAttention(methy_data_chrX_tensor.size(1), num_heads, target_dim)
methy_data_chrX_adjusted = attention_layer(methy_data_chrX_tensor)

methy_data_list.append(methy_data_chrX_adjusted.detach().numpy())

# 将数据转换为PyTorch Tensor,使用axis=0
methy_data_tensor = torch.tensor(np.stack(methy_data_list, axis=0), dtype=torch.float32)

# 创建特征提取器实例
feature_extractor = FeatureExtractor(in_channels=23)

# 将特征张量传递给特征提取器
compressed_methy_features = feature_extractor(methy_data_tensor)

# 读取miRNA数据并进行特征选择
mirna_data = pd.read_csv('../data/resampled_clinical _mirna_mina_data/resampled_mirna_data_svmsmote.csv', index_col=0)

# 使用多头注意力层调整特征数
mirna_data_tensor = torch.tensor(mirna_data.values, dtype=torch.float32)
attention_layer_mirna = MultiHeadAttention(mirna_data_tensor.size(1), num_heads, target_dim)
mirna_data_adjusted = attention_layer_mirna(mirna_data_tensor)

# 读取mRNA数据并进行特征选择
mrna_data = pd.read_csv('../data/resampled_clinical _mirna_mina_data/resampled_mrna_data_svmsmote.csv', index_col=0)

# 使用多头注意力层调整特征数
mrna_data_tensor = torch.tensor(mrna_data.values, dtype=torch.float32)
attention_layer_mrna = MultiHeadAttention(mrna_data_tensor.size(1), num_heads, target_dim)
mrna_data_adjusted = attention_layer_mrna(mrna_data_tensor)

end_time = time.time()
feature_extraction_time = end_time - start_time

print(f"Common Time of Feature Extraction: {feature_extraction_time:.2f} s")

# 打印调整后的miRNA和mRNA特征形状
print(f"Shape of adjusted miRNA features: {mirna_data_adjusted.shape}")
print(f"Shape of adjusted mRNA features: {mrna_data_adjusted.shape}")

# 将Tensor转换为NumPy数组
mirna_array = mirna_data_adjusted.detach().numpy()
mrna_array = mrna_data_adjusted.detach().numpy()
methy_array = compressed_methy_features.detach().numpy()

# 创建样本ID列表
sample_ids = [f"sample{i+1}" for i in range(mirna_array.shape[0])]

# 将NumPy数组转换为DataFrame
mirna_df = pd.DataFrame(mirna_array, index=sample_ids)
mrna_df = pd.DataFrame(mrna_array, index=sample_ids)
methy_df = pd.DataFrame(methy_array, index=sample_ids)

# 连接三个DataFrame
combined_df = pd.concat([mirna_df, mrna_df, methy_df], axis=1)

# 重新生成列名
num_mirna_features = mirna_df.shape[1]
num_mrna_features = mrna_df.shape[1]
num_methy_features = methy_df.shape[1]

mirna_columns = [f"mirna_{i}" for i in range(num_mirna_features)]
mrna_columns = [f"mrna_{i}" for i in range(num_mrna_features)]
methy_columns = [f"methy_{i}" for i in range(num_methy_features)]

combined_columns = mirna_columns + mrna_columns + methy_columns
combined_df.columns = combined_columns

# 读取临床数据
clinical_data = pd.read_csv('../data/resampled_clinical _mirna_mina_data/resampled_clinical_data_svmsmote.csv', low_memory=False, index_col=0)

# 根据样本ID将临床数据与合并后的特征数据对齐
aligned_data = pd.concat([clinical_data, combined_df], axis=1, join='inner')

# 提取特征和标签
X = aligned_data.drop('tumor_stage.diagnoses', axis=1)
y = aligned_data['tumor_stage.diagnoses']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 打印训练集和测试集的形状
print(f"Shape of training features: {X_train.shape}")
print(f"Shape of training labels: {y_train.shape}")
print(f"Shape of test features: {X_test.shape}")
print(f"Shape of test labels: {y_test.shape}")

# 将 DataFrame 转换为 NumPy 数组
X_train = X_train.values
X_test = X_test.values


Input shape: torch.Size([23, 444, 512])
Output shape after conv: torch.Size([1, 444, 512])
Output shape after squeeze: torch.Size([444, 512])
Common Time of Feature Extraction: 52.12 s
Shape of adjusted miRNA features: torch.Size([444, 512])
Shape of adjusted mRNA features: torch.Size([444, 512])
Shape of training features: (355, 1544)
Shape of training labels: (355,)
Shape of test features: (89, 1544)
Shape of test labels: (89,)


# KNN

In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# 定义KNN分类器的超参数网格
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]
}

# 创建KNN分类器
knn_classifier = KNeighborsClassifier()

# 使用网格搜索和分层交叉验证优化超参数
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn_classifier, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)

# 打印最佳超参数和评估分数
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# 使用最佳超参数重新训练KNN分类器
best_knn_classifier = grid_search.best_estimator_

# 在整个数据集上进行预测
y_pred = best_knn_classifier.predict(X)

# 计算评估指标
accuracy = accuracy_score(y, y_pred)
precision_macro = precision_score(y, y_pred, average='macro')
precision_micro = precision_score(y, y_pred, average='micro')
recall_macro = recall_score(y, y_pred, average='macro')
recall_micro = recall_score(y, y_pred, average='micro')
f1_macro = f1_score(y, y_pred, average='macro')
f1_micro = f1_score(y, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y, y_pred)
print("\nConfusion Matrix:")
print(cm)

Best parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'p': 1}
Best cross-validation score: 0.8784984678243106

Accuracy: 0.9459
Precision (Macro): 0.9452
Precision (Micro): 0.9459
Recall (Macro): 0.9449
Recall (Micro): 0.9459
F1-score (Macro): 0.9450
F1-score (Micro): 0.9459

Confusion Matrix:
[[ 78   0   0   0   0]
 [  0 117   0   1   0]
 [  0   0  92  11   0]
 [  0   3   9  63   0]
 [  0   0   0   0  70]]


# 随机森林

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# 创建随机森林分类器并设置参数
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_split=5, min_samples_leaf=2, random_state=42)

# 训练随机森林分类器
rf_classifier.fit(X, y)

# 在整个数据集上进行预测
y_pred = rf_classifier.predict(X)

# 计算评估指标
accuracy = accuracy_score(y, y_pred)
precision_macro = precision_score(y, y_pred, average='macro')
precision_micro = precision_score(y, y_pred, average='micro')
recall_macro = recall_score(y, y_pred, average='macro')
recall_micro = recall_score(y, y_pred, average='micro')
f1_macro = f1_score(y, y_pred, average='macro')
f1_micro = f1_score(y, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.9955
Precision (Macro): 0.9964
Precision (Micro): 0.9955
Recall (Macro): 0.9947
Recall (Micro): 0.9955
F1-score (Macro): 0.9955
F1-score (Micro): 0.9955

Confusion Matrix:
[[ 78   0   0   0   0]
 [  0 118   0   0   0]
 [  0   0 103   0   0]
 [  0   1   1  73   0]
 [  0   0   0   0  70]]


# XGBoost

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier

# 创建XGBoost分类器并设置参数
xgb_classifier = XGBClassifier(n_estimators=100, max_depth=1, min_child_weight=5, subsample=0.8, colsample_bytree=0.8, learning_rate=0.1, random_state=42)

# 训练XGBoost分类器
xgb_classifier.fit(X, y)

# 在整个数据集上进行预测
y_pred = xgb_classifier.predict(X)

# 计算评估指标
accuracy = accuracy_score(y, y_pred)
precision_macro = precision_score(y, y_pred, average='macro')
precision_micro = precision_score(y, y_pred, average='micro')
recall_macro = recall_score(y, y_pred, average='macro')
recall_micro = recall_score(y, y_pred, average='micro')
f1_macro = f1_score(y, y_pred, average='macro')
f1_micro = f1_score(y, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.9730
Precision (Macro): 0.9790
Precision (Micro): 0.9730
Recall (Macro): 0.9680
Recall (Micro): 0.9730
F1-score (Macro): 0.9718
F1-score (Micro): 0.9730

Confusion Matrix:
[[ 78   0   0   0   0]
 [  0 118   0   0   0]
 [  0   0 103   0   0]
 [  0   4   8  63   0]
 [  0   0   0   0  70]]


# Catboost

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from catboost import CatBoostClassifier

# 创建CatBoost分类器并设置参数
cat_classifier = CatBoostClassifier(iterations=300, depth=1, learning_rate=0.1, loss_function='MultiClass', random_seed=42,verbose=False)

# 训练CatBoost分类器
cat_classifier.fit(X, y)

# 在整个数据集上进行预测
y_pred = cat_classifier.predict(X)

# 计算评估指标
accuracy = accuracy_score(y, y_pred)
precision_macro = precision_score(y, y_pred, average='macro')
precision_micro = precision_score(y, y_pred, average='micro')
recall_macro = recall_score(y, y_pred, average='macro')
recall_micro = recall_score(y, y_pred, average='micro')
f1_macro = f1_score(y, y_pred, average='macro')
f1_micro = f1_score(y, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.9685
Precision (Macro): 0.9757
Precision (Micro): 0.9685
Recall (Macro): 0.9627
Recall (Micro): 0.9685
F1-score (Macro): 0.9668
F1-score (Micro): 0.9685

Confusion Matrix:
[[ 78   0   0   0   0]
 [  0 118   0   0   0]
 [  0   0 103   0   0]
 [  0   4  10  61   0]
 [  0   0   0   0  70]]


# SVM 

In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC

# 创建SVM分类器并设置参数
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# 训练SVM分类器
svm_classifier.fit(X, y)

# 在整个数据集上进行预测
y_pred = svm_classifier.predict(X)

# 计算评估指标
accuracy = accuracy_score(y, y_pred)
precision_macro = precision_score(y, y_pred, average='macro')
precision_micro = precision_score(y, y_pred, average='micro')
recall_macro = recall_score(y, y_pred, average='macro')
recall_micro = recall_score(y, y_pred, average='micro')
f1_macro = f1_score(y, y_pred, average='macro')
f1_micro = f1_score(y, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.6959
Precision (Macro): 0.7662
Precision (Micro): 0.6959
Recall (Macro): 0.6606
Recall (Micro): 0.6959
F1-score (Macro): 0.6300
F1-score (Micro): 0.6959

Confusion Matrix:
[[  7  71   0   0   0]
 [  0 117   0   0   1]
 [  1   2  86  14   0]
 [  0  12  34  29   0]
 [  0   0   0   0  70]]


# NN 

In [7]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# 创建神经网络模型
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(len(set(y)), activation='softmax')
])

# 编译模型
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 训练模型
model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.1, verbose=0)

# 在测试集上进行预测
y_pred = model.predict(X_test)
y_pred = y_pred.argmax(axis=1)

# 计算评估指标
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
precision_micro = precision_score(y_test, y_pred, average='micro') 
recall_macro = recall_score(y_test, y_pred, average='macro')
recall_micro = recall_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro') 
f1_micro = f1_score(y_test, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}") 
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}") 
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.8539
Precision (Macro): 0.8544
Precision (Micro): 0.8539
Recall (Macro): 0.8505
Recall (Micro): 0.8539
F1-score (Macro): 0.8506
F1-score (Micro): 0.8539

Confusion Matrix:
[[12  0  0  0  0]
 [ 0 23  1  0  0]
 [ 0  1 12  4  0]
 [ 0  3  4 10  0]
 [ 0  0  0  0 19]]


# CNN

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten


# 调整输入数据的形状以适应 CNN
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# 创建 CNN 模型
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(len(set(y)), activation='softmax')
])

# 编译模型
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 训练模型
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# 在测试集上进行预测
y_pred = model.predict(X_test)
y_pred = y_pred.argmax(axis=1)

# 计算评估指标
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
precision_micro = precision_score(y_test, y_pred, average='micro') 
recall_macro = recall_score(y_test, y_pred, average='macro')
recall_micro = recall_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro') 
f1_micro = f1_score(y_test, y_pred, average='micro')

# 打印评估指标
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}") 
print(f"Precision (Micro): {precision_micro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"Recall (Micro): {recall_micro:.4f}") 
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")

# 计算混淆矩阵
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.8315
Precision (Macro): 0.8270
Precision (Micro): 0.8315
Recall (Macro): 0.8235
Recall (Micro): 0.8315
F1-score (Macro): 0.8036
F1-score (Micro): 0.8315

Confusion Matrix:
[[12  0  0  0  0]
 [ 0 24  0  0  0]
 [ 1  0 14  2  0]
 [ 0  3  9  5  0]
 [ 0  0  0  0 19]]
