## Set up environment: imports, data loading, and helpers

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, save_npz, load_npz
from tqdm import tqdm
from collections import Counter

# Constants
# NOTE(review): K_MAX is not referenced in the cells visible here — presumably an
# upper bound used by later feature-extraction cells; confirm.
K_MAX = 6

# Load the raw peptide table from the Excel workbook (the path is relative to
# the notebook's working directory).
df = pd.read_excel("./Data/peptides10.xlsx")

# Map each amino acid's one-letter code to an integer index.
# The index of a residue is its position in the string below; that index is
# used as the residue's slot when building per-peptide feature vectors.
#   A Alanine,       R Arginine,   N Asparagine, D Aspartic acid, C Cysteine,
#   E Glutamic acid, Q Glutamine,  G Glycine,    H Histidine,     I Isoleucine,
#   L Leucine,       K Lysine,     M Methionine, F Phenylalanine, P Proline,
#   S Serine,        T Threonine,  W Tryptophan, Y Tyrosine,      V Valine
amino_acid_index = {
    letter: position for position, letter in enumerate("ARNDCEQGHILKMFPSTWYV")
}

# Preprocess data
# Take the raw sequence column (labelled 'Unnamed: 0' in the loaded table) and
# give the resulting Series the readable name 'Sequence'. Series.rename is what
# actually sets the name; assigning to an ad-hoc attribute such as `.column`
# renames nothing and is lost as soon as a new Series is derived.
peptides = df['Unnamed: 0'].rename('Sequence')

# Remove embedded spaces, then drop the first and last character of every entry
# (presumably wrapping delimiters in the raw file — TODO confirm against the data).
peptides = peptides.str.replace(' ', '', regex=False)
peptides = peptides.str[1:-1]


# Save and load sparse matrices
def save_sparse_matrix(mat: pd.DataFrame, path: str) -> None:
    """Write a DataFrame's values to ``path`` as a SciPy sparse ``.npz`` file.

    Only the cell values are stored (converted to COO format); the index and
    column labels of ``mat`` are not written.

    Args:
        mat: Table whose values should be stored.
        path: Destination file path passed to ``scipy.sparse.save_npz``.
    """
    save_npz(path, coo_matrix(mat.values))


def load_sparse_matrix(path: str) -> pd.DataFrame:
    """Read a SciPy sparse ``.npz`` file and return its contents as a dense DataFrame.

    The returned frame carries default integer row and column labels, since the
    file holds only the matrix values.

    Args:
        path: Location of the ``.npz`` file to read with ``scipy.sparse.load_npz``.

    Returns:
        A DataFrame built from the densified matrix.
    """
    return pd.DataFrame(load_npz(path).toarray())

  warn(msg)


## AAC (amino acid composition)

In [10]:
def extract_aac_features(peptides: pd.Series) -> pd.DataFrame:
    """Compute amino acid composition (AAC) features for each peptide.

    For every sequence, the result row holds, at the position given by
    ``amino_acid_index``, the fraction of the sequence made up by that residue;
    residues that do not occur keep the initial value 0.

    Args:
        peptides: Iterable of peptide sequences (strings of one-letter codes).

    Returns:
        A DataFrame with one row per peptide and 20 columns, one per residue index.

    Raises:
        KeyError: If a sequence contains a character missing from ``amino_acid_index``.
    """
    rows = []
    for sequence in peptides:
        frequencies = [0] * 20
        for residue, occurrences in Counter(sequence).items():
            # Frequency = occurrences of this residue / total sequence length.
            frequencies[amino_acid_index[residue]] = occurrences / len(sequence)
        rows.append(frequencies)
    return pd.DataFrame(rows)

# Build the AAC feature table for all peptides and cache its values on disk.
aac_df = extract_aac_features(peptides)
save_sparse_matrix(aac_df, './Cache/aac.npz')  # the first (index) column is not saved

aac_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.125,0.000,0.125,0.000,0.000,0.000,0.000,0.375,0.0,0.000,0.000,0.125,0.125,0.000,0.000,0.125,0.000,0.0,0.000,0.000
1,0.000,0.000,0.125,0.125,0.125,0.125,0.000,0.000,0.0,0.125,0.000,0.125,0.000,0.000,0.000,0.000,0.000,0.0,0.250,0.000
2,0.000,0.000,0.000,0.250,0.125,0.125,0.000,0.000,0.0,0.125,0.000,0.125,0.000,0.000,0.000,0.000,0.000,0.0,0.250,0.000
3,0.000,0.000,0.000,0.250,0.125,0.125,0.000,0.000,0.0,0.125,0.125,0.125,0.000,0.000,0.000,0.000,0.000,0.0,0.125,0.000
4,0.125,0.000,0.125,0.000,0.000,0.000,0.000,0.375,0.0,0.000,0.125,0.125,0.000,0.000,0.000,0.125,0.000,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18578,0.125,0.000,0.000,0.000,0.000,0.000,0.125,0.125,0.0,0.125,0.125,0.125,0.000,0.000,0.000,0.000,0.125,0.0,0.000,0.125
18579,0.250,0.125,0.000,0.000,0.000,0.000,0.000,0.000,0.0,0.000,0.125,0.000,0.000,0.000,0.125,0.250,0.125,0.0,0.000,0.000
18580,0.000,0.125,0.125,0.000,0.000,0.000,0.125,0.125,0.0,0.000,0.125,0.000,0.000,0.000,0.000,0.125,0.000,0.0,0.250,0.000
18581,0.125,0.125,0.250,0.000,0.000,0.000,0.125,0.000,0.0,0.125,0.125,0.000,0.000,0.000,0.000,0.000,0.125,0.0,0.000,0.000


## PSSM (position-specific score matrix)

In [15]:
from Bio import Align
from Bio.Align import substitution_matrices, AlignInfo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

In [16]:
# Convert the list of amino-acid sequences into SeqRecord objects (ids Seq1, Seq2, ...)
seq_records = [SeqRecord(Seq(seq), id=f"Seq{i+1}") for i, seq in enumerate(peptides.values.tolist())]

# Create the multiple sequence alignment object
# NOTE(review): no alignment step is run here; the sequences are stacked as given,
# which presumably requires them all to have the same length — confirm.
alignment = MultipleSeqAlignment(seq_records)

# Create the SummaryInfo object
summary = AlignInfo.SummaryInfo(alignment)

# Compute the PSSM
# NOTE(review): the output captured under this cell looks like a Biopython
# deprecation notice for this call, pointing to Bio.motifs.Motif(...).counts as
# the replacement (with letter/position indices swapped) — confirm before upgrading.
pssm = summary.pos_specific_score_matrix()

>>> alignment = msa.alignment
>>> from Bio.motifs import Motif
>>> motif = Motif('ACGT', alignment)
>>> counts = motif.counts

The `counts` object contains the same information as the PSSM returned by `pos_specific_score_matrix`, but note that the indices are reversed:

>>> counts[letter][i] == pssm[index][letter]
True

If your multiple sequence alignment object was obtained using Bio.AlignIO, then you can obtain a new-style Alignment object directly by using Bio.Align.read instead of Bio.AlignIO.read, or Bio.Align.parse instead of Bio.AlignIO.parse.
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from Bio.Align import MultipleSeqAlignment
>>> from Bio.Align.AlignInfo import SummaryInfo
>>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),
...                             SeqRecord(Seq('ATGT')),
...                             SeqRecord(Seq('ATGT'))])
>>> summary = SummaryInfo(msa)
>>> dumb_consensus = summary.dumb_consensus(ambiguous='N')
>>> print(dumb_consen

In [18]:
# Print the PSSM one entry at a time. In the captured output each entry is a
# dict mapping a residue letter to a float count (presumably one entry per
# sequence position — confirm).
for s in pssm:
    print(s)

{'A': 1127.0, 'C': 246.0, 'D': 742.0, 'E': 406.0, 'F': 777.0, 'G': 1149.0, 'H': 787.0, 'I': 783.0, 'K': 1019.0, 'L': 954.0, 'M': 871.0, 'N': 757.0, 'P': 811.0, 'Q': 961.0, 'R': 1901.0, 'S': 1702.0, 'T': 1121.0, 'V': 1220.0, 'W': 568.0, 'Y': 681.0}
{'A': 1816.0, 'C': 93.0, 'D': 102.0, 'E': 108.0, 'F': 532.0, 'G': 529.0, 'H': 394.0, 'I': 801.0, 'K': 139.0, 'L': 1318.0, 'M': 495.0, 'N': 129.0, 'P': 5486.0, 'Q': 161.0, 'R': 1630.0, 'S': 609.0, 'T': 388.0, 'V': 2494.0, 'W': 637.0, 'Y': 722.0}
{'A': 2127.0, 'C': 108.0, 'D': 235.0, 'E': 449.0, 'F': 792.0, 'G': 880.0, 'H': 669.0, 'I': 307.0, 'K': 603.0, 'L': 2054.0, 'M': 895.0, 'N': 680.0, 'P': 207.0, 'Q': 879.0, 'R': 3081.0, 'S': 1776.0, 'T': 369.0, 'V': 668.0, 'W': 948.0, 'Y': 856.0}
{'A': 2430.0, 'C': 86.0, 'D': 297.0, 'E': 276.0, 'F': 832.0, 'G': 1521.0, 'H': 1731.0, 'I': 125.0, 'K': 914.0, 'L': 644.0, 'M': 605.0, 'N': 1922.0, 'P': 1024.0, 'Q': 492.0, 'R': 1264.0, 'S': 2239.0, 'T': 293.0, 'V': 195.0, 'W': 919.0, 'Y': 774.0}
{'A': 152.0, 'C