In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
# Load the property spreadsheet; later cells treat its first column as the
# lookup key and every remaining column as one numeric property.
aa531 = pd.read_excel(io="../ubiquitination/AA531properties.xlsx")

In [3]:
# Number of property columns: every column of the sheet except the first.
aa531_num = aa531.shape[1] - 1

# Lookup table: first-column entry of each row -> that row's remaining values.
# aa531_multi_scale indexes this table with the individual characters of a
# sequence (presumably one-letter amino-acid codes -- confirm against the sheet).
aa531_dict = {
    aa531.iloc[row, 0]: aa531.iloc[row, 1:].values
    for row in range(aa531.shape[0])
}

In [4]:
def aa531_multi_scale(seq_data, property_dict=None, n_properties=None):
    """Compute multi-lag semi-variance features for equal-length sequences.

    Every character of a sequence is mapped to its vector of property values.
    For each property ``i`` and each lag ``h = 1 .. L // 2`` (``L`` being the
    sequence length) the semi-variance of the resulting numeric profile ``x``
    is computed as

        gamma_i(h) = sum_{j=h}^{L-1} (x[j] - x[j-h])**2 / (L - h) / 2

    Features are laid out property-major: the ``L // 2`` lags of property 0
    come first, then the lags of property 1, and so on.

    Parameters
    ----------
    seq_data : sequence of str
        Sequences that must all have the same length ``L``.
    property_dict : mapping, optional
        Maps each sequence character to an indexable of property values.
        Defaults to the notebook-level ``aa531_dict``.
    n_properties : int, optional
        How many leading property values of each table entry to use.
        Defaults to the notebook-level ``aa531_num`` when ``property_dict`` is
        omitted, otherwise to the length of the first entry of
        ``property_dict``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(seq_data), (L // 2) * n_properties)``
        rounded to 5 decimals. An empty ``seq_data`` yields an array of shape
        ``(0, 0)`` because the window length cannot be determined.

    Raises
    ------
    ValueError
        If a sequence's length differs from that of the first sequence.
    KeyError
        If a sequence contains a character missing from the property table.
    """
    # Resolve the defaults lazily so the notebook globals are only required
    # when the caller does not supply its own table.
    if n_properties is None:
        if property_dict is None:
            n_properties = aa531_num
        else:
            n_properties = len(next(iter(property_dict.values()), ()))
    if property_dict is None:
        property_dict = aa531_dict

    rows = len(seq_data)
    if rows == 0:
        # Nothing to measure the window length on.
        return np.zeros((0, 0))

    win_size = len(seq_data[0])
    max_lag = win_size // 2
    result_matrix = np.zeros((rows, max_lag * n_properties))

    for k, seq in enumerate(seq_data):
        if len(seq) != win_size:
            raise ValueError(
                "sequence %d has length %d, expected %d (length of the first sequence)"
                % (k, len(seq), win_size)
            )

        try:
            # Profile matrix of shape (win_size, n_properties): one row per
            # position, one column per property.
            values = np.array(
                [property_dict[s][:n_properties] for s in seq], dtype=float
            )
        except KeyError as exc:
            raise KeyError(
                "sequence %d contains %r, which has no entry in the property table"
                % (k, exc.args[0])
            ) from None

        # semi_var[i, h - 1] holds the semi-variance of property i at lag h.
        semi_var = np.empty((n_properties, max_lag))
        for h in range(1, max_lag + 1):
            # Differences between positions h apart, for all properties at once.
            diffs = values[h:] - values[:-h]
            semi_var[:, h - 1] = (diffs ** 2).sum(axis=0) / (win_size - h) / 2

        # Row-major flattening gives the property-major feature order.
        result_matrix[k, :] = semi_var.ravel()

    return np.round(result_matrix, decimals=5)

In [15]:
# Open the txt files and read their contents.
# Blank lines (for example a trailing newline at the end of a file) are
# skipped: an empty string would otherwise be passed to aa531_multi_scale as a
# zero-length sequence and break the fixed-width feature matrix.
with open(r"D:\study\paper\ubiquitination\Arab\dataset\win31\train_pos_win31.txt", 'r') as f1:
    pos_lines = f1.readlines()
pos_stripped_lines = [pl.strip() for pl in pos_lines if pl.strip()]

with open(r"D:\study\paper\ubiquitination\Arab\dataset\win31\train_neg_win31.txt", 'r') as f2:
    neg_lines = f2.readlines()
neg_stripped_lines = [nl.strip() for nl in neg_lines if nl.strip()]

In [16]:
# Build one semi-variance feature matrix per class (positives first, then negatives).
pos_feature_matrix = aa531_multi_scale(seq_data=pos_stripped_lines)
neg_feature_matrix = aa531_multi_scale(seq_data=neg_stripped_lines)

In [17]:
# Stack the two class matrices row-wise: positive rows on top, negative rows below.
feature_matrix = np.concatenate([pos_feature_matrix, neg_feature_matrix], axis=0)

In [18]:
# Sanity check: (number of sequences, number of features).
np.shape(feature_matrix)

(6129, 7965)

In [19]:
# Column vector of class labels in the same row order as feature_matrix:
# 1.0 for each positive sequence followed by 0.0 for each negative sequence.
label = np.concatenate(
    (np.ones(len(pos_stripped_lines)), np.zeros(len(neg_stripped_lines)))
)[:, np.newaxis]

In [20]:
# Put the label column in front of the feature columns.
data_matrix = np.concatenate((label, feature_matrix), axis=1)

In [21]:
# One name per feature column, numbered from 1: AMS_1, AMS_2, ...
col_name = ['AMS_{}'.format(idx) for idx in range(1, feature_matrix.shape[1] + 1)]

In [22]:
# Prepend the name of the label column. The guard keeps this cell idempotent:
# re-running it would otherwise insert 'label' a second time and make the
# column list longer than data_matrix has columns.
if col_name[:1] != ['label']:
    col_name.insert(0, 'label')

In [23]:
# Wrap the labelled matrix in a DataFrame (label column first) and display it.
df = pd.DataFrame(data=data_matrix, columns=col_name)
df

Unnamed: 0,label,AMS_1,AMS_2,AMS_3,AMS_4,AMS_5,AMS_6,AMS_7,AMS_8,AMS_9,...,AMS_7956,AMS_7957,AMS_7958,AMS_7959,AMS_7960,AMS_7961,AMS_7962,AMS_7963,AMS_7964,AMS_7965
0,1.0,0.06126,0.06466,0.05073,0.06480,0.04788,0.06989,0.05954,0.05782,0.06594,...,11.40730,8.13541,9.93362,3.60449,9.61719,5.31248,9.33504,8.52159,12.20432,9.29366
1,1.0,0.04563,0.04515,0.05362,0.02982,0.04794,0.04126,0.04468,0.03450,0.04328,...,8.70606,5.68242,11.42234,9.43688,3.75247,7.19509,10.37895,6.77693,4.50367,10.55289
2,1.0,0.04445,0.04742,0.03676,0.03979,0.04814,0.05289,0.04676,0.04083,0.04719,...,3.31518,3.29350,2.91855,4.51640,3.30360,4.23722,5.44171,4.10251,3.29475,5.23880
3,1.0,0.05331,0.04368,0.05121,0.04894,0.03812,0.04656,0.03619,0.04390,0.04361,...,10.03882,5.65642,8.80563,9.55576,8.73790,4.98796,7.06440,8.93551,8.11489,9.34793
4,1.0,0.01016,0.01269,0.01049,0.01238,0.01478,0.01337,0.01498,0.01589,0.01438,...,8.93461,8.36694,7.40115,5.53359,6.07823,6.54040,8.27754,6.64534,4.87812,6.09876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6124,0.0,0.04166,0.05901,0.06841,0.06601,0.05835,0.08295,0.06554,0.06268,0.06190,...,4.20904,3.33203,5.16629,5.45123,6.38820,6.03742,6.71424,7.64408,6.03536,4.46598
6125,0.0,0.08698,0.06695,0.09292,0.07023,0.08568,0.04903,0.07816,0.05563,0.06307,...,9.34069,9.33293,9.10250,6.85745,8.28448,11.53622,10.52811,8.73322,6.17598,9.03353
6126,0.0,0.04287,0.04091,0.03922,0.05193,0.04756,0.04365,0.03846,0.03710,0.05153,...,10.38570,9.42426,11.81239,9.19394,9.02665,12.06795,5.01250,8.75656,13.68515,6.95713
6127,0.0,0.03586,0.04471,0.03059,0.04157,0.04741,0.05072,0.04144,0.03151,0.05221,...,4.68124,3.84195,4.22869,5.25303,3.93358,5.44321,5.24966,5.85219,6.33270,6.02017


In [24]:
# Save the labelled feature table as CSV without the index column.
# The target folder is created first so the write cannot fail on a missing
# directory after the features have already been computed.
train_ams_csv = r"D:\study\paper\ubiquitination\Arab\feature_extraction\win31_new\train\train_ams.csv"
train_ams_dir = os.path.dirname(train_ams_csv)
if train_ams_dir:  # empty when the path has no directory component
    os.makedirs(train_ams_dir, exist_ok=True)
df.to_csv(train_ams_csv, index=False)