In [None]:
import os
# Switch the working directory so that the relative data paths used by the
# cells below resolve. A raw string is used because the Windows path contains
# backslashes that would otherwise be parsed as (invalid) escape sequences.
# NOTE(review): this is a machine-specific absolute path — adjust before running elsewhere.
os.chdir(r'F:\Work\Experiment\pLM4ACE\model')

### PCA

In [None]:
# 使用主成分分析选择最佳的主成分数量
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the fused feature matrix. The CSV is read without a header row
# (header=None); per the original note, rows are peptides and columns are features.
data = pd.read_csv(
    r'fusion_features\Data\fusion\All_features.csv',
    index_col=False,
    header=None,
)

# Standardise each column. No missing-value handling is done here; the data
# is assumed to be complete.
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)

# Fit PCA without specifying n_components so that all components are kept.
pca = PCA().fit(data_scaled)

# Running sum of the per-component explained-variance ratios.
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()

# Plot cumulative explained variance against the (1-based) component count.
plt.figure(figsize=(10, 7))
plt.plot(
    np.arange(1, cumulative_variance_ratio.size + 1),
    cumulative_variance_ratio,
)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Tunable settings for this cell.
# N_COMPONENTS: number of principal components to keep. PCA requires this to be
#   no larger than min(n_samples, n_features) of the input.
# RANDOM_STATE: fixed seed so that the projection is reproducible when
#   scikit-learn's 'auto' solver selection picks the randomized SVD solver.
N_COMPONENTS = 800
RANDOM_STATE = 42

# Load the fused feature matrix. The CSV is read without a header row
# (header=None); per the original note, rows are peptides and columns are features.
data = pd.read_csv(r'fusion_features\Data\fusion\All_features.csv', header=None, index_col=False)

# Standardise each column. No missing-value handling is done here; the data
# is assumed to be complete.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Project the standardised data onto the first N_COMPONENTS principal components.
pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_STATE)
principalComponents = pca.fit_transform(data_scaled)

# Report the share of variance explained by each retained component.
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Wrap the projected matrix in a DataFrame ...
principalDf = pd.DataFrame(data=principalComponents)

# ... and write it to CSV without index or header row.
principalDf.to_csv(r'fusion_features\Data\features_select\PCA_All.csv', index=False, header=None)

### mRMR

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the fused feature matrix and the label table; neither file is parsed
# with a header row or an index column.
X = pd.read_csv(
    r"fusion_features\Data\fusion\All_features.csv",
    header=None,
    index_col=None,
)
y = pd.read_csv(
    r"fusion_features\Data\label.csv",
    header=None,
    index_col=None,
)

# Print both shapes, then the number of entries equal to 0 and equal to 1
# in the label table (one value per line).
print(X.shape, y.shape, sep="\n")
print(np.count_nonzero(y == 0), np.count_nonzero(y == 1), sep="\n")

In [None]:
# Select the top K features using mRMR.
from mrmr import mrmr_classif

# mrmr_classif documents its target as a pandas Series, so flatten the label
# table to one dimension and align it with X's row index before passing it in.
# A label table that does not hold exactly one value per row of X makes the
# Series construction fail loudly instead of silently mis-aligning.
y_series = pd.Series(np.ravel(y), index=X.index)
selected_features = mrmr_classif(X=X, y=y_series, K=1000)

# Keep only the columns chosen by mRMR.
X_selected = X[selected_features]

# Wrap in a DataFrame.
df = pd.DataFrame(X_selected)

# Write to CSV without index or header row.
df.to_csv(r'fusion_features\Data\features_select\mRMR_All_1000.csv', index=False, header=None)

In [None]:
# Sanity check: read the saved mRMR feature file back and print its shape.
K = pd.read_csv(
    r"fusion_features\Data\features_select\mRMR_All_1000.csv",
    header=None,
    index_col=None,
)
print(K.shape)

### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV
import pandas as pd
import numpy as np
from numpy import loadtxt

# Load the feature matrix (pandas, no header row) and the labels (NumPy).
X = pd.read_csv(r"fusion_features\Data\fusion\All_features.csv", index_col=None, header=None)
# Raw string: the Windows path contains backslashes that must not be parsed
# as escape sequences.
y = loadtxt(r'fusion_features\Data\label.csv', delimiter=',')

# Print both shapes, then the number of labels equal to 0 and equal to 1.
print(X.shape)
print(y.shape)
print(np.count_nonzero(y==0))
print(np.count_nonzero(y==1))

# Cross-validated ElasticNet: 10-fold search over the listed l1_ratio and alpha grids.
model_cv = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], alphas=[0.01, 0.1, 1.0, 10.0], cv=10)

# Run the search.
model_cv.fit(X, y)

# Report the selected hyper-parameters.
print('Best alpha: ', model_cv.alpha_)
print('Best l1_ratio: ', model_cv.l1_ratio_)

# Fit a plain ElasticNet on the full data with the selected hyper-parameters.
model = ElasticNet(alpha=model_cv.alpha_, l1_ratio=model_cv.l1_ratio_)
model.fit(X, y)

# The fitted coefficients are used as per-feature importance (zero = dropped).
importance = model.coef_


In [None]:
# Column positions whose ElasticNet coefficient is non-zero.
important_features_indices = np.flatnonzero(importance).tolist()

# Restrict the feature matrix to those columns and print the result.
X_important = X.iloc[:, important_features_indices]
print(X_important)

# Wrap in a DataFrame.
df = pd.DataFrame(X_important)

# Write to CSV without index or header row.
df.to_csv(
    r'fusion_features\Data\features_select\EN_All.csv',
    header=None,
    index=False,
)

### 结尾