In [27]:
import numpy as np
import pandas as pd
from collections import Counter
from itertools import product

In [28]:
def extract_dipeptide_frequency(seq_data):
    """Count the occurrences of every standard dipeptide in each sequence.

    Parameters
    ----------
    seq_data : sequence of str
        Amino-acid sequences, one string per sample. Must support ``len()``
        and iteration.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(seq_data), 400)``. Row ``i`` holds the raw
        counts (not normalised frequencies) of the overlapping two-letter
        windows of the i-th sequence. Columns follow ``itertools.product``
        order over ``'ACDEFGHIKLMNPQRSTVWY'``: 'AA', 'AC', ..., 'YY'.

    Notes
    -----
    Windows containing any character outside the 20-letter alphabet ('X',
    'B', 'U', '-', lowercase letters, ...) are ignored. Filtering only 'X'
    is not enough: any other unexpected symbol would create an extra
    dictionary key and make the row longer than 400 values, which raises
    ``ValueError`` when it is written into the result matrix.
    """
    aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    num_aa = len(aa_alphabet)

    dipeptides = [''.join(pair) for pair in product(aa_alphabet, repeat=2)]
    # Map each standard dipeptide to its fixed column; built once, not per sequence.
    column_of = {dipeptide: col for col, dipeptide in enumerate(dipeptides)}

    result_matrix = np.zeros((len(seq_data), num_aa * num_aa))
    for row, seq in enumerate(seq_data):
        # Count every overlapping window of length two in this sequence.
        dipeptide_counts = Counter(seq[j:j + 2] for j in range(len(seq) - 1))

        # Write only the windows that are standard dipeptides; all others are skipped.
        for dipeptide, count in dipeptide_counts.items():
            col = column_of.get(dipeptide)
            if col is not None:
                result_matrix[row, col] = count

    return result_matrix

In [39]:
# Open the positive and negative txt files and read their contents.
with open(r"D:\study\paper\ubiquitination\Arab\dataset\win31\test_pos_win31.txt", 'r') as f1:
    pos_lines = f1.readlines()
# Strip surrounding whitespace (including the newline) from each line.
# Blank lines are not dropped; they become empty strings.
pos_stripped_lines = list(map(str.strip, pos_lines))

with open(r"D:\study\paper\ubiquitination\Arab\dataset\win31\test_neg_win31.txt", 'r') as f2:
    neg_lines = f2.readlines()
neg_stripped_lines = list(map(str.strip, neg_lines))

In [40]:
# Build one dipeptide count matrix per class; rows follow the order of the input lines.
pos_feature_matrix = extract_dipeptide_frequency(seq_data=pos_stripped_lines)
neg_feature_matrix = extract_dipeptide_frequency(seq_data=neg_stripped_lines)

In [41]:
# Stack the positive rows on top of the negative rows.
feature_matrix = np.concatenate([pos_feature_matrix, neg_feature_matrix], axis=0)

In [42]:
# Display (n_samples, n_features) of the stacked matrix.
np.shape(feature_matrix)

(2044, 400)

In [43]:
# Column vector of class labels: 1 for each positive line, then 0 for each negative line,
# in the same order in which the feature rows were stacked.
label = np.concatenate(
    [
        np.ones((len(pos_stripped_lines), 1)),
        np.zeros((len(neg_stripped_lines), 1)),
    ],
    axis=0,
)

In [44]:
# Put the label column in front of the feature columns.
data_matrix = np.concatenate((label, feature_matrix), axis=1)

In [45]:
# One 1-based name per feature column: 'Dipeptide_1', 'Dipeptide_2', ...
col_name = [f'Dipeptide_{idx}' for idx in range(1, feature_matrix.shape[1] + 1)]

In [46]:
# Prepend the 'label' column name only when it is not already first, so that
# re-running this cell does not add a second 'label' entry and break the
# column count expected when the DataFrame is built.
if not col_name or col_name[0] != 'label':
    col_name.insert(0, 'label')

In [47]:
# Wrap the labelled feature array in a DataFrame with named columns.
df = pd.DataFrame(data=data_matrix, columns=col_name)
# Leave the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,label,Dipeptide_1,Dipeptide_2,Dipeptide_3,Dipeptide_4,Dipeptide_5,Dipeptide_6,Dipeptide_7,Dipeptide_8,Dipeptide_9,...,Dipeptide_391,Dipeptide_392,Dipeptide_393,Dipeptide_394,Dipeptide_395,Dipeptide_396,Dipeptide_397,Dipeptide_398,Dipeptide_399,Dipeptide_400
0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2042,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Write the table to CSV without the row index column.
df.to_csv(
    path_or_buf=r"D:\study\paper\ubiquitination\Arab\feature_extraction\win31_new\test\test_Dipeptide.csv",
    index=False,
)