In [None]:
# Current directory
import os

# Raw string: '\w' and '\m' are not valid escape sequences, and newer Python
# versions warn about (and will eventually reject) them in a normal literal.
# NOTE(review): this is a machine-specific absolute path; the relative data
# paths read later in the notebook are resolved against it.
os.chdir(r'E:\work\mine')

In [None]:
import numpy as np
import pandas as pd

# Read the header-less feature matrix and label table from CSV.
features_df = pd.read_csv(r"extract\All_feature\features\TPC.csv", header=None)
labels_df = pd.read_csv(r'extract\label.csv', header=None)

# Column-selection hook: the full slice currently keeps every feature column.
features_df = features_df.iloc[:, :]

# Report the (rows, columns) of both tables as a quick sanity check.
print(features_df.shape)
print(labels_df.shape)

# Downstream cells work on plain NumPy arrays.
X_new = np.array(features_df)
y_new = np.array(labels_df)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix with StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split performed later in the notebook, so held-out samples influence the
# scaling statistics; consider fitting on the training portion only.
scaler = StandardScaler().fit(X_new)
X_new = scaler.transform(X_new)

In [None]:
# Dataset splitting: hold out 20% of the samples as an independent test set,
# using a fixed seed so the partition is reproducible.
from sklearn.model_selection import train_test_split

X_train_whole, X_ind_test, y_train_whole, y_ind_test = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=1111,
)

# Show the shape of the training part, then of the held-out part.
print(X_train_whole.shape, X_ind_test.shape, sep="\n")


In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Hyper-parameter grid explored for the PLS model
parameters = {
    'n_components': range(1, 20, 1),  # number of components (1..19)
    'scale': [True, False],  # whether the model scales the data itself
    'max_iter': [100, 200, 300],  # maximum number of iterations
    'tol': [0.0001, 0.001, 0.01]  # convergence tolerance
}

# Initialize the PLS model
pls = PLSRegression()

# Initialize the 10-fold grid search (scored by negated mean squared error)
grid_search = GridSearchCV(estimator=pls, param_grid=parameters, cv=10, scoring='neg_mean_squared_error')

# Run the grid search on the training data
grid_search.fit(X_train_whole, y_train_whole)

# Collect the grid-search results
best_params = grid_search.best_params_
# Flip the sign back: the scorer is neg_mean_squared_error.
best_mean_score = -grid_search.best_score_
# 'std_test_score' already holds the across-fold standard deviation for each
# candidate, so index it directly. Wrapping that single number in np.std(),
# as was done before, always produced 0.0.
best_std_score = grid_search.cv_results_['std_test_score'][grid_search.best_index_]

# Print the results
print("Best parameters:", best_params)
print("Best mean squared error:", best_mean_score)
print("Standard deviation of best mean squared error:", best_std_score)


### 10折交叉验证

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split 
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.cross_decomposition import PLSRegression
import math 
import numpy as np
import statistics
import matplotlib.pyplot as plt

# Split dataset into training and independent test sets
X_train_whole, X_ind_test, y_train_whole, y_ind_test = train_test_split(X_new, y_new, test_size=0.2, random_state=1111)

# Result collection lists (one value appended per fold)
BACC_collection = []  # balanced accuracy
AAC_collection = []   # accuracy
Sn_collection = []    # sensitivity
Sp_collection = []    # specificity
MCC_collection = []   # Matthews correlation coefficient
AUC_collection = []   # area under the ROC curve
AP = []               # average precision

# Initialize lists to store predictions and true values for each fold
y_true_all = []
y_pred_all = []
y_pred_proba_all = []

# Common grids / accumulators for averaged ROC and precision-recall curves.
# Only interp_tpr_collection is filled in this cell; the others are kept for
# whatever follows. NOTE(review): confirm they are consumed later.
mean_recall = np.linspace(0, 1, 100)
all_precision = []
base_fpr = np.linspace(0, 1, 100)
mean_tpr = 0.0
interp_tpr_collection = []

pls = PLSRegression(max_iter=100, n_components=1, scale=False, tol=0.0001)
skf = StratifiedKFold(n_splits=10)

for train, test in skf.split(X_train_whole, y_train_whole):
    X_train, X_valid, y_train, y_valid = X_train_whole[train], X_train_whole[test], y_train_whole[train], y_train_whole[test]
    pls.fit(X_train, y_train)

    # PLS is a regressor: the raw output is used as a ranking score, its
    # clipped version as a pseudo-probability, and 0.5 as the decision threshold.
    y_pred = pls.predict(X_valid).ravel()
    y_pred_proba = np.clip(y_pred, 0, 1)
    y_valid_pred = np.where(y_pred_proba > 0.5, 1, 0)

    # Save predictions and true values for this fold
    y_true_all.extend(y_valid.ravel().astype(int))
    y_pred_all.extend(y_valid_pred.astype(int))
    y_pred_proba_all.extend(y_pred_proba)

    # scikit-learn flattens the binary confusion matrix as (TN, FP, FN, TP);
    # unpacking it as (TP, FP, FN, TN) swapped sensitivity and specificity.
    # labels=[0, 1] keeps the matrix 2x2 even if a fold shows a single class
    # (assumes the labels are encoded as 0/1, as the 0.5 threshold implies).
    TN, FP, FN, TP = confusion_matrix(y_valid, y_valid_pred, labels=[0, 1]).ravel()
    Sn_collection.append(TP / (TP + FN))
    AAC_collection.append((TP + TN) / (TP + TN + FP + FN))
    Sp_collection.append(TN / (TN + FP))
    # Float arithmetic avoids integer overflow in the four-way product; a zero
    # denominator (a whole row or column of the matrix is empty) maps to
    # MCC = 0 instead of nan.
    mcc_denominator = math.sqrt(float(TP + FP) * float(TP + FN) * float(TN + FP) * float(TN + FN))
    MCC = (TP * TN - FP * FN) / mcc_denominator if mcc_denominator > 0 else 0.0
    MCC_collection.append(MCC)
    BACC_collection.append(0.5 * TP / (TP + FN) + 0.5 * TN / (TN + FP))
    # Ranking metrics use the unclipped scores so clipping cannot create ties.
    auc = roc_auc_score(y_valid, y_pred)
    AUC_collection.append(auc)

    # Resample this fold's ROC curve on the shared FPR grid, anchored at (0, 0).
    fpr, tpr, _ = roc_curve(y_valid, y_pred)
    interp_tpr = np.interp(base_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    interp_tpr_collection.append(interp_tpr)

    precision, recall, _ = precision_recall_curve(y_valid, y_pred)
    average_precision = average_precision_score(y_valid, y_pred)
    # Record the per-fold average precision (it was computed but never stored).
    AP.append(average_precision)
    # Reverse both arrays so recall is in increasing order.
    recall = np.flipud(recall)
    precision = np.flipud(precision)


# Pool the out-of-fold predictions of all folds into one table.
results_df = pd.DataFrame({
    'Predicted_Proba': np.round(y_pred_proba_all, 8),
    'True_Label': y_true_all,
    'Predicted_Label': y_pred_all
})

# Save the results to a CSV file
# results_df.to_csv('Result\PLS\\10折/PLS_AAC.csv', index=False)

# Print results as mean ± sample standard deviation over the folds,
# in the order: accuracy, sensitivity, specificity, MCC, AUC.
print(round(statistics.mean(AAC_collection), 3), '±', round(statistics.stdev(AAC_collection), 3))
print(round(statistics.mean(Sn_collection), 3), '±', round(statistics.stdev(Sn_collection), 3))
print(round(statistics.mean(Sp_collection), 3), '±', round(statistics.stdev(Sp_collection), 3))
print(round(statistics.mean(MCC_collection), 3), '±', round(statistics.stdev(MCC_collection), 3))
print(round(statistics.mean(AUC_collection), 3), '±', round(statistics.stdev(AUC_collection), 3))


### 独立测试

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import math
import statistics
import pandas as pd

# Per-split metric accumulators: each starts as its own empty list.
BACC_collection, AAC_collection, Sn_collection = [], [], []
Sp_collection, MCC_collection, AUC_collection = [], [], []

# Pooled true labels, predicted labels and clipped scores across all splits.
y_true_all, y_pred_all, y_pred_proba_all = [], [], []

def categorical_probas_to_classes(p):
    """Return, for every row of ``p``, the column index holding the largest value.

    Parameters
    ----------
    p : array-like with at least two dimensions
        One row per sample; presumably one column of scores per class.

    Returns
    -------
    Result of ``np.argmax`` taken along axis 1.
    """
    class_indices = np.argmax(p, axis=1)
    return class_indices

for i in range(10):
    # Fresh 80/20 split for every repetition; the repetition index is the seed.
    X_train_whole, X_ind_test, y_train_whole, y_ind_test = train_test_split(X_new, y_new, test_size=0.2, random_state=i)
    pls = PLSRegression(max_iter=100, n_components=15, scale=False, tol=0.0001)  # PLS model
    pls.fit(X_train_whole, y_train_whole)   # fitting model

    # get predicted values: the raw regression output is the ranking score, its
    # clipped version the pseudo-probability, and 0.5 the decision threshold.
    # Flattening keeps one scalar per sample in the pooled lists.
    y_pred_score = pls.predict(X_ind_test).ravel()
    y_pred_proba = np.clip(y_pred_score, 0, 1)
    y_pred = np.where(y_pred_score > 0.5, 1, 0)
    y_true = y_ind_test

    # Save predictions and true values for this split
    y_true_all.extend(y_true.ravel().astype(int))
    y_pred_all.extend(y_pred.astype(int))
    y_pred_proba_all.extend(y_pred_proba.ravel())

    # Calculate metrics
    # scikit-learn flattens the binary confusion matrix as (TN, FP, FN, TP);
    # unpacking it as (TP, FP, FN, TN) swapped sensitivity and specificity.
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up
    # (assumes the labels are encoded as 0/1, as the 0.5 threshold implies).
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    Sn_collection.append(TP / (TP + FN))
    AAC_collection.append((TP + TN) / (TP + TN + FP + FN))
    Sp_collection.append(TN / (TN + FP))
    # Float arithmetic avoids integer overflow in the four-way product; a zero
    # denominator (a whole row or column of the matrix is empty) maps to
    # MCC = 0 instead of nan.
    mcc_denominator = math.sqrt(float(TP + FP) * float(TP + FN) * float(TN + FP) * float(TN + FN))
    MCC = (TP * TN - FP * FN) / mcc_denominator if mcc_denominator > 0 else 0.0
    MCC_collection.append(MCC)
    BACC_collection.append(0.5 * TP / (TP + FN) + 0.5 * TN / (TN + FP))
    # AUC is computed on the unclipped scores so clipping cannot create ties.
    auc = roc_auc_score(y_true, y_pred_score)
    AUC_collection.append(auc)

# Combine all predictions and true values into a DataFrame
results_df = pd.DataFrame({
    'Predicted_Proba': np.round(y_pred_proba_all, 8),
    'True_Label': y_true_all,
    'Predicted_Label': np.array(y_pred_all).flatten()
})

# Save the results to a CSV file
# results_df.to_csv('Result\PLS\独立测试/PLS_G5.csv', index=False)

# Print mean and sample standard deviation of collected metrics over the
# 10 splits, in the order: accuracy, sensitivity, specificity, MCC, AUC.
print(round(statistics.mean(AAC_collection), 3), '±', round(statistics.stdev(AAC_collection), 3))
print(round(statistics.mean(Sn_collection), 3), '±', round(statistics.stdev(Sn_collection), 3))
print(round(statistics.mean(Sp_collection), 3), '±', round(statistics.stdev(Sp_collection), 3))
print(round(statistics.mean(MCC_collection), 3), '±', round(statistics.stdev(MCC_collection), 3))
print(round(statistics.mean(AUC_collection), 3), '±', round(statistics.stdev(AUC_collection), 3))
