In [1]:
# Input table: read later as a tab-separated file that must contain a 'Seq' column.
file_path = "../../results/1/S5/unique_methods_protein.txt"
# Output table: the input rows plus the engineered feature columns, written tab-separated.
output_path = "../../results/1/S5/unique_methods_protein_features_added.txt"

In [2]:
import sys
# Record the interpreter used for this run: first the detailed version string
# (build date, compiler, ...), then the structured version_info tuple
# (major, minor, micro, ...), one per line.
print(sys.version, sys.version_info, sep="\n")

from collections import Counter

import numpy as np
import pandas as pd
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# 1. Load the tab-separated input table
df = pd.read_csv(file_path, sep="\t")

# 2. Sequence-length feature: number of characters in each 'Seq' entry
df["seq_length"] = df["Seq"].map(len)

# 3. Molecular weight
def calc_mw(seq):
    """Compute the molecular weight of a protein sequence (Da).

    Args:
        seq: Protein sequence, passed unchanged to
            ``Bio.SeqUtils.molecular_weight``.

    Returns:
        The result of ``molecular_weight(seq, seq_type='protein',
        monoisotopic=False)``, or ``np.nan`` if that call raises. The fallback
        is deliberate best-effort so one unparsable sequence does not abort
        the whole column.
    """
    try:
        return molecular_weight(seq, seq_type='protein', monoisotopic=False)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply().
        return np.nan

# One molecular-weight value per sequence (NaN where calc_mw could not compute it).
df["molecular_weight"] = df["Seq"].map(calc_mw)

# 4. Amino acid composition (AAC)
def get_aa_composition(seq):
    """Compute the relative frequency of each of the 20 standard amino acids.

    Args:
        seq: Protein sequence as a string of one-letter codes. Matching is
            case-sensitive.

    Returns:
        dict mapping each letter of 'ACDEFGHIKLMNPQRSTVWY' (in that order) to
        ``seq.count(letter) / len(seq)``. Characters outside that alphabet are
        not reported but still count towards the length, so the values may sum
        to less than 1. An empty sequence yields 0.0 for every letter instead
        of raising ZeroDivisionError.
    """
    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
    total = len(seq)
    if total == 0:
        # Nothing to count; keep the full key set so every row has the same columns.
        return {aa: 0.0 for aa in aa_list}
    return {aa: seq.count(aa) / total for aa in aa_list}

# Build the composition table once from the list of per-sequence dicts.
# `.apply(pd.Series)` would construct a separate Series object for every row,
# which is needlessly slow on tens of thousands of sequences.
aa_df = pd.DataFrame(df['Seq'].map(get_aa_composition).tolist(), index=df.index)
df = pd.concat([df, aa_df.add_prefix('AA_')], axis=1)

# 5. Dipeptide frequencies
def get_dipeptide_freq(seq):
    """Compute the relative frequency of every overlapping dipeptide in ``seq``.

    Args:
        seq: Protein sequence as a string.

    Returns:
        dict mapping each 2-character window that occurs in ``seq`` to
        ``occurrences / (len(seq) - 1)``. Keys appear in order of first
        occurrence, so the result does not depend on set/hash iteration order.
        Sequences shorter than 2 characters yield an empty dict.
    """
    # Single counting pass instead of list.count() per unique dipeptide.
    dipeptide_counts = Counter(seq[i:i + 2] for i in range(len(seq) - 1))
    total = sum(dipeptide_counts.values())
    # With no windows the comprehension is empty, so there is no division by zero.
    return {di: count / total for di, count in dipeptide_counts.items()}

# Build the dipeptide table once from the list of per-sequence dicts; columns
# are the union of all observed dipeptides, and a dipeptide absent from a
# sequence gets frequency 0. `.apply(pd.Series)` would build one Series per row.
di_df = pd.DataFrame(df['Seq'].map(get_dipeptide_freq).tolist(), index=df.index).fillna(0)
df = pd.concat([df, di_df.add_prefix('Di_')], axis=1)

# 6. Tripeptide frequencies
def get_tripeptide_freq(seq):
    """Compute the relative frequency of every overlapping tripeptide in ``seq``.

    Args:
        seq: Protein sequence as a string.

    Returns:
        dict mapping each 3-character window that occurs in ``seq`` to
        ``occurrences / (len(seq) - 2)``. Keys appear in order of first
        occurrence, so the result does not depend on set/hash iteration order.
        Sequences shorter than 3 characters yield an empty dict.
    """
    # Single counting pass instead of list.count() per unique tripeptide
    # (there can be thousands of distinct tripeptides in a long sequence).
    tripeptide_counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    total = sum(tripeptide_counts.values())
    # With no windows the comprehension is empty, so there is no division by zero.
    return {tri: count / total for tri, count in tripeptide_counts.items()}

# Build the tripeptide table once from the list of per-sequence dicts; columns
# are the union of all observed tripeptides, and a tripeptide absent from a
# sequence gets frequency 0. `.apply(pd.Series)` would build one Series per row.
tri_df = pd.DataFrame(df['Seq'].map(get_tripeptide_freq).tolist(), index=df.index).fillna(0)
df = pd.concat([df, tri_df.add_prefix('Tri_')], axis=1)

# 7. Physicochemical properties
def calc_physicochemical(seq):
    """Compute isoelectric point, GRAVY and net charge at pH 7 for a sequence.

    Args:
        seq: Protein sequence, passed unchanged to ``ProteinAnalysis``.

    Returns:
        pd.Series with the entries 'isoelectric_point', 'gravy'
        (hydrophobicity, GRAVY index) and 'charge_pH7' (net charge at pH 7.0),
        in that order. If constructing the analysis or any of the three
        calculations raises, all three entries are NaN.
    """
    try:
        pa = ProteinAnalysis(seq)
        values = {
            'isoelectric_point': pa.isoelectric_point(),
            'gravy': pa.gravy(),
            'charge_pH7': pa.charge_at_pH(7.0),
        }
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply().
        # NOTE(review): one failing property blanks all three values.
        values = dict.fromkeys(('isoelectric_point', 'gravy', 'charge_pH7'), np.nan)
    return pd.Series(values)

# calc_physicochemical returns a three-entry Series per sequence; the result is
# concatenated column-wise onto df (axis=1).
physicochemical = df['Seq'].apply(calc_physicochemical)
df = pd.concat([df, physicochemical], axis=1)

# 8. Save the results as a tab-separated file without the index column
df.to_csv(output_path, sep='\t', index=False)

# Report completion, the output location, and the final (rows, columns) shape.
print("特征工程完成! 结果已保存至:", output_path)
print("生成的特征维度:", df.shape)

3.9.12 | packaged by conda-forge | (main, Mar 24 2022, 23:22:55) 
[GCC 10.3.0]
sys.version_info(major=3, minor=9, micro=12, releaselevel='final', serial=0)


  from pandas.core.computation.check import NUMEXPR_INSTALLED


特征工程完成! 结果已保存至: ../../results/1/S5/unique_methods_protein_features_added.txt
生成的特征维度: (36737, 8499)
