In [1]:
import re
import csv
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Paths. The data root is two directory levels above the current working
# directory. os.path.join is used so the separators are correct on every
# platform instead of hard-coding Windows backslashes.
path_data_dir = os.path.dirname(os.path.dirname(os.getcwd()))

path_negative_data = os.path.join(path_data_dir, 'row_data', 'non-thermophilic.txt')
path_positive_data = os.path.join(path_data_dir, 'row_data', 'thermophilic.txt')

# Output CSV paths are assembled as part1 + <counter> + part3.
# NOTE(review): train_data_path_part2 is not referenced by the code visible in
# this notebook; it is kept so that any other user of the name keeps working.
train_data_path_part1 = os.path.join(path_data_dir, 'new_data', 'new_features', 'feature_select_')
train_data_path_part2 = '_'
train_data_path_part3 = r'_gap.csv'

In [3]:
# Read the sequences of one class (positive or negative) into a list.
def extract_data(path):
    """Read a FASTA-like text file and return its sequences as a list of strings.

    A line containing '>' anywhere is treated as a record header; all other
    lines are stripped of newlines, tabs, carriage returns and surrounding
    whitespace and concatenated into the current record.

    Args:
        path: path of the text file to read.

    Returns:
        list[str]: one string per record, in file order. A header that is
        followed directly by another header yields an empty string. An empty
        file yields an empty list, and a file with sequence lines but no
        header yields a single record (the previous implementation raised
        ValueError from ``list.remove`` in both of those cases).
    """
    sequences = []
    chunks = []          # pieces of the record currently being accumulated
    seen_header = False  # becomes True once the first '>' line has been read

    def flush():
        # Emit the pending record. Text that precedes the very first header
        # is only emitted when it is non-empty, so a well-formed file does
        # not produce a spurious leading '' entry.
        current = ''.join(chunks)
        if seen_header or current:
            sequences.append(current)

    with open(path, 'r') as f:
        for line in f:
            if '>' in line:
                flush()
                seen_header = True
                chunks = []
            else:
                chunks.append(
                    line.replace('\n', '').replace('\t', '').replace('\r', '').strip()
                )
    flush()
    return sequences

In [4]:
# Standard amino acids: ACDEFGHIKLMNPQRSTVWY
def create_features(new_features,k,if_gap=False):
    """Extend every prefix in ``new_features`` by ``k`` symbols.

    Args:
        new_features: sequence of prefix strings (any iterable of str, e.g. a
            list or a pandas Index).
        k: number of symbols to append (int). Values below 1 are treated as 1,
            matching the previous behaviour.
        if_gap: when falsy, each appended symbol ranges over the 20 standard
            amino acids, so every prefix yields 20**k strings; when truthy,
            each appended symbol is the wildcard '*', so every prefix yields
            exactly one string.

    Returns:
        list[str]: the extended features, ordered prefix-major.

    Note:
        The previous recursive version did not forward ``if_gap`` to the
        recursive call, so ``k > 1`` with ``if_gap=True`` appended a single
        '*' followed by amino acids. All ``k`` symbols are now wildcards in
        that case. Results for ``k == 1`` are unchanged.
    """
    if if_gap:
        symbols = ['*']
    else:
        symbols = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

    features = list(new_features)
    # One pass per appended symbol; iterative to avoid needless recursion.
    for _ in range(max(k, 1)):
        features = [prefix + symbol for prefix in features for symbol in symbols]
    return features

In [5]:
def get_ratio(feature, data):
    """Return the fraction of sequences in ``data`` that contain ``feature``.

    Args:
        feature: pattern string. Each '*' matches any single uppercase letter
            (regex ``[A-Z]``); every other character is inserted into the
            regular expression as-is.
        data: sized collection of sequence strings (one class, positive or
            negative).

    Returns:
        float: number of sequences with at least one match divided by
        ``len(data)``; 0.0 when ``data`` is empty (previously this raised
        ZeroDivisionError).
    """
    total = len(data)
    if total == 0:
        return 0.0
    # The pattern depends only on the feature, so build and compile it once
    # rather than once per sequence.
    pattern = re.compile(''.join('[A-Z]' if s == '*' else s for s in feature))
    freq = sum(1 for d in data if pattern.search(d) is not None)
    return freq / total

In [6]:
def get_class_ratio(features, data):
    """Map every feature to its ratio within one class of sequences.

    Returns a dict ``{feature: get_ratio(feature, data)}`` with one entry per
    element of ``features``, in iteration order.
    """
    return {feat: get_ratio(feat, data) for feat in features}

In [7]:
# Length threshold applied to the positive sequences.
positive_data_length = 100

# Read the positive- and negative-class sequences from their files.
positive_data_row = extract_data(path_positive_data)
negative_data = extract_data(path_negative_data)

In [8]:
# Number of positive-class sequences read from the file.
len(positive_data_row)

915

In [9]:
# Keep only the positive sequences strictly longer than positive_data_length.
positive_data = [seq for seq in positive_data_row if len(seq) > positive_data_length]

In [10]:
# Number of positive sequences that remain after the length filter.
len(positive_data)

837

In [11]:
def get_features(psoitive_output, threshold=0.95):
    """Return the column labels whose 'psoitive_ratio' value is below ``threshold``.

    Args:
        psoitive_output: DataFrame with a row labelled 'psoitive_ratio' and
            one column per feature.
        threshold: exclusive lower bound a ratio must reach to be kept.
            Defaults to 0.95, the value that used to be hard-coded.

    Returns:
        list: labels of the columns with ratio < ``threshold``, in column
        order. Despite the function's name, these are the below-threshold
        features, not the retained ones.
    """
    ratios = psoitive_output.loc['psoitive_ratio']
    return ratios[ratios < threshold].index.tolist()

In [12]:
# Module-level counter; get_tree_features increments it (via `global number`)
# before each CSV it writes and uses the value in the output file name.
number = 0

In [13]:
def get_tree_features(features, gap, depth=0, stop_depth=27):
    """Recursively extend sequence features and write the retained ones to CSV.

    Each call prints ``depth``, increments it, and stops once it exceeds
    ``stop_depth``. Otherwise:

    * ``gap`` true: every feature is extended by one '*' wildcard, then the
      function recurses twice on the result (first with gap=False, then with
      gap=True).
    * ``gap`` false: every feature is extended by one amino acid; features
      that get_features reports for the positive set are dropped; if none
      remain a message is printed and the branch ends. The rest are sorted by
      descending positive ratio, paired with their negative ratio, re-sorted
      by ascending negative ratio, truncated to the first 100 columns, and
      written to ``train_data_path_part1 + str(number) + train_data_path_part3``.
      The function then recurses twice on the written columns.

    Reads the module-level ``positive_data``, ``negative_data``,
    ``train_data_path_part1`` and ``train_data_path_part3``; increments the
    module-level ``number`` before each file it writes. Always returns None.
    """
    global number
    print(depth)
    depth += 1
    # Recursion limit, checked against the already-incremented depth.
    if depth > stop_depth:
        return None

    # Both branches extend the incoming features by exactly one symbol.
    candidates = create_features(features, 1, gap)

    if gap != False:
        # Gap step: nothing is scored here, just explore both continuations.
        get_tree_features(candidates, False, depth, stop_depth)
        get_tree_features(candidates, True, depth, stop_depth)
        return None

    # Score the candidates on the positive class and discard the ones that
    # get_features reports.
    psoitive_ratio = get_class_ratio(candidates, positive_data)
    psoitive_output = pd.DataFrame([psoitive_ratio], index=['psoitive_ratio'])
    rejected = get_features(psoitive_output)
    psoitive_output = psoitive_output.drop(rejected, axis=1)
    if psoitive_output.empty:
        # No candidate survived, so this branch ends here.
        print('该序列特征为空')
        return None

    psoitive_output = psoitive_output.sort_values('psoitive_ratio', axis=1, ascending=False)
    negative_ratio = get_class_ratio(psoitive_output.columns, negative_data)
    negative_output = pd.DataFrame([negative_ratio], index=['negative_ratio'])
    output = pd.concat([psoitive_output, negative_output], axis=0)
    output = output.sort_values('negative_ratio', axis=1)

    number += 1
    train_data_path_part = train_data_path_part1 + str(number) + train_data_path_part3
    # Keep at most the first 100 columns; the slice changes nothing when
    # there are 100 or fewer.
    output = output.iloc[:, :100]
    output.to_csv(train_data_path_part, index=None)

    survivors = output.columns
    get_tree_features(survivors, False, depth, stop_depth)
    get_tree_features(survivors, True, depth, stop_depth)

In [14]:
# Shortest positive-sequence length, capped at 1000 (the value is 1000 when
# positive_data is empty or every sequence is at least that long).
low = min([1000] + [len(seq) for seq in positive_data])

In [15]:
# Start the feature search from the empty prefix with an amino-acid (non-gap)
# step; the recursion limit is `low`.
get_tree_features([''],gap=False, depth=1, stop_depth = low)

1
2
该序列特征为空
2
3
该序列特征为空
3
4
该序列特征为空
4
5
该序列特征为空
5
6
该序列特征为空
6
7
该序列特征为空
7
8
该序列特征为空
8
9
该序列特征为空
9
10
该序列特征为空
10
11
该序列特征为空
11
12
该序列特征为空
12
13
该序列特征为空
13
14
该序列特征为空
14
15
该序列特征为空
15
16
该序列特征为空
16
17
该序列特征为空
17
18
该序列特征为空
18
19
该序列特征为空
19
20
该序列特征为空
20
21
该序列特征为空
21
22
该序列特征为空
22
23
该序列特征为空
23
24
该序列特征为空
24
25
该序列特征为空
25
26
该序列特征为空
26
27
该序列特征为空
27
28
该序列特征为空
28
29
该序列特征为空
29
30
该序列特征为空
30
31
该序列特征为空
31
32
该序列特征为空
32
33
该序列特征为空
33
34
该序列特征为空
34
35
该序列特征为空
35
36
该序列特征为空
36
37
该序列特征为空
37
38
该序列特征为空
38
39
该序列特征为空
39
40
该序列特征为空
40
41
该序列特征为空
41
42
该序列特征为空
42
43
该序列特征为空
43
44
该序列特征为空
44
45
该序列特征为空
45
46
该序列特征为空
46
47
该序列特征为空
47
48
该序列特征为空
48
49
该序列特征为空
49
50
该序列特征为空
50
51
该序列特征为空
51
52
该序列特征为空
52
53
该序列特征为空
53
54
该序列特征为空
54
55
该序列特征为空
55
56
该序列特征为空
56
57
该序列特征为空
57
58
该序列特征为空
58
59
该序列特征为空
59
60
该序列特征为空
60
61
该序列特征为空
61
62
该序列特征为空
62
63
该序列特征为空
63
64
该序列特征为空
64
65
该序列特征为空
65
66
该序列特征为空
66
67
该序列特征为空
67
68
该序列特征为空
68
69
该序列特征为空
69
70
该序列特征为空
70
71
该序列特征为空
71
72
该序列特征为空
72
73
该序列特征为空
73
74
该序列

In [16]:
# Scratch check: length of a sample pattern string ('Y' followed by '*' wildcards).
len('Y*************************')

26

In [17]:
# Histogram of positive-sequence lengths: {length: number of sequences with
# that length}, keyed in order of first occurrence.
data_length = {}
for seq in positive_data:
    data_length[len(seq)] = data_length.get(len(seq), 0) + 1

In [18]:
# Display the length histogram.
data_length

{154: 3,
 834: 1,
 107: 1,
 120: 4,
 112: 3,
 686: 2,
 562: 2,
 247: 2,
 197: 5,
 189: 4,
 631: 1,
 546: 2,
 105: 5,
 458: 2,
 444: 1,
 404: 2,
 358: 2,
 316: 1,
 553: 1,
 349: 3,
 283: 1,
 256: 2,
 226: 1,
 183: 5,
 253: 3,
 721: 2,
 138: 6,
 461: 1,
 302: 3,
 211: 4,
 266: 4,
 242: 2,
 281: 2,
 295: 2,
 649: 2,
 605: 1,
 585: 1,
 469: 2,
 377: 3,
 314: 3,
 420: 2,
 561: 1,
 1263: 1,
 775: 1,
 352: 3,
 720: 1,
 178: 3,
 434: 2,
 426: 2,
 303: 2,
 315: 1,
 249: 9,
 208: 4,
 166: 3,
 219: 5,
 1398: 1,
 396: 3,
 331: 1,
 817: 1,
 347: 2,
 225: 3,
 882: 2,
 162: 4,
 479: 1,
 852: 1,
 203: 3,
 127: 3,
 300: 1,
 195: 4,
 207: 4,
 381: 2,
 248: 6,
 340: 2,
 103: 2,
 755: 1,
 895: 1,
 1853: 1,
 410: 2,
 185: 2,
 334: 3,
 477: 2,
 343: 3,
 814: 1,
 900: 1,
 887: 1,
 741: 2,
 1077: 1,
 837: 1,
 113: 2,
 101: 4,
 104: 6,
 213: 4,
 337: 1,
 472: 1,
 156: 5,
 245: 5,
 1156: 1,
 109: 1,
 429: 3,
 559: 1,
 592: 2,
 216: 5,
 230: 3,
 102: 4,
 222: 3,
 582: 1,
 180: 3,
 389: 1,
 235: 4,
 294: 4,
 220:

In [19]:
# Keys view of the histogram, i.e. the distinct positive-sequence lengths.
pp =data_length.keys()

In [20]:
# Number of distinct positive-sequence lengths.
len(pp)

435

In [21]:
# list.sort() sorts in place and returns None, so `list(pp).sort()` always
# left `aaa` as None. sorted() returns the new ascending list instead.
aaa = sorted(pp)

In [22]:
# Distinct positive-sequence lengths in ascending order.
sorted(pp)

[101,
 102,
 103,
 104,
 105,
 106,
 107,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 120,
 121,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 138,
 139,
 140,
 141,
 142,
 143,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 154,
 155,
 156,
 157,
 159,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 191,
 192,
 193,
 195,
 196,
 197,
 198,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 257,
 258,
 259,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 273,
 274,
 275,
 276,
 278,
 280,
 281,
 282,
 283,
 284,
 286,
 288,
 289,
 290